diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2752df7e68..a1f89a6682 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
 cmake_minimum_required(VERSION 3.0.0)
 
@@ -107,8 +107,9 @@ option (ENABLE_UPPERCASE_API "export APIs with uppercase" OFF)
 option (ENABLE_COMPLEX_RETURN_INTEL "Enable complex_return_intel" OFF)
 option (ENABLE_TRSM_PREINVERSION "Enable TRSM preinversion" ON)
 option (ENABLE_AOCL_DYNAMIC "Enable Dynamic Multi-threading" OFF)
-option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE functionality" OFF)
+option(DISABLE_BLIS_ARCH_TYPE "Disable BLIS_ARCH_TYPE and BLIS_MODEL_TYPE functionality" OFF)
 option(RENAME_BLIS_ARCH_TYPE "Rename BLIS_ARCH_TYPE env var renamed to supplied value" BLIS_ARCH_TYPE)
+set(RENAME_BLIS_MODEL_TYPE "BLIS_MODEL_TYPE" CACHE STRING "Rename BLIS_MODEL_TYPE env var renamed to supplied value")  # STRING cache entry: the value is an env var name, not a boolean
 
 if (${AOCL_BLIS_FAMILY} STREQUAL "amdzen")
   set(REF_KERNEL_MIRRORING_PY "${CMAKE_SOURCE_DIR}/build/blis_ref_kernel_mirror.py")
@@ -181,9 +182,11 @@ endif ()
 if (ENABLE_JRIR_RR)
     message("Round robin thread method enabled")
     set(BLIS_ENABLE_JRIR_RR TRUE)
+    set(BLIS_ENABLE_JRIR_SLAB FALSE)
 elseif (ENABLE_JRIR_SLAB)
     message("SLAB thread method enabled")
     set(BLIS_ENABLE_JRIR_SLAB TRUE)
+    set(BLIS_ENABLE_JRIR_RR FALSE)
 else ()
     message("Unsupported method of thread partitioning in jr and ir loops")
 endif ()
@@ -202,18 +205,23 @@ endif ()
 
 if (ENABLE_BLAS)
     add_definitions(-DBLIS_ENABLE_BLAS)
+    set(BLIS_ENABLE_BLAS TRUE)
 else ()
     add_definitions(-DBLIS_DISABLE_BLAS)
+    set(BLIS_ENABLE_BLAS FALSE)
 endif ()
 
 if (ENABLE_CBLAS)
     add_definitions(-DBLIS_ENABLE_CBLAS)
+    set(BLIS_ENABLE_CBLAS TRUE)
     if (NOT ENABLE_BLAS)
         # Force BLAS layer when CBLAS is enabled
         add_definitions(-DBLIS_ENABLE_BLAS)
+        set(BLIS_ENABLE_BLAS TRUE)
     endif ()
 else ()
     add_definitions(-DBLIS_DISABLE_CBLAS)
+    set(BLIS_ENABLE_CBLAS FALSE)
 endif ()
 
 if (ENABLE_BLASTEST)
@@ -286,8 +294,10 @@ endif()
 
 if(DISABLE_BLIS_ARCH_TYPE)
     set(BLIS_DISABLE_BLIS_ARCH_TYPE TRUE)
+    set(BLIS_DISABLE_BLIS_MODEL_TYPE TRUE)
 else()
     set(BLIS_DISABLE_BLIS_ARCH_TYPE FALSE)
+    set(BLIS_DISABLE_BLIS_MODEL_TYPE FALSE)
 endif()
 
 if(RENAME_BLIS_ARCH_TYPE)
@@ -298,6 +308,30 @@ else()
     set(rename_blis_arch_type "BLIS_ARCH_TYPE")
 endif()
 
+# Choose the env var name used for BLIS_MODEL_TYPE; the name flag is TRUE in either case.
+set(__blis_model_type_name TRUE)
+if(NOT RENAME_BLIS_MODEL_TYPE)
+    set(rename_blis_model_type "BLIS_MODEL_TYPE")
+else()
+    set(rename_blis_model_type "${RENAME_BLIS_MODEL_TYPE}")
+endif()
+
+find_package(Doxygen)  # optional dependency: documentation is skipped when absent
+set(W_DIR "${CMAKE_CURRENT_SOURCE_DIR}/docs")
+if(NOT (DOXYGEN_FOUND))
+  message(STATUS "Doxygen not found please install and try again.")
+else()
+  # Run the doxygen located by find_package (not whatever is on PATH) and keep its exit status.
+  execute_process(COMMAND "${DOXYGEN_EXECUTABLE}" Doxyfile WORKING_DIRECTORY "${W_DIR}"
+      COMMAND_ECHO STDOUT RESULT_VARIABLE doxygen_result)
+  if(doxygen_result EQUAL 0 AND EXISTS "${W_DIR}/html/index.html")
+    message(STATUS "Documentation generated successfully, to view documentation open docs/html/index.html .")
+  else()
+    message(STATUS "Document generation failed.")
+  endif()
+endif()
+
+set(CMAKE_BUILD_TYPE ${CMAKE_CONFIGURATION_TYPES})  # NOTE(review): copies the whole configuration list; presumably a single configuration is always supplied - confirm
 
 #print configurations
 message("---cmake configurations---")
@@ -322,8 +356,9 @@ message(BLIS_ENABLE_SANDBOX : ${BLIS_ENABLE_SANDBOX})
 message(BLIS_ENABLE_SHARED : ${BLIS_ENABLE_SHARED})
 message(DISABLE_BLIS_ARCH_TYPE : ${DISABLE_BLIS_ARCH_TYPE})
 message(RENAME_BLIS_ARCH_TYPE : ${RENAME_BLIS_ARCH_TYPE})
+message(RENAME_BLIS_MODEL_TYPE : ${RENAME_BLIS_MODEL_TYPE})
 
-SET(ENABLE_SIMD_FLAGS "AVX2" CACHE STRING "Set compiler SIMD flags")
+SET(ENABLE_SIMD_FLAGS "none" CACHE STRING "Set compiler SIMD flags")
 SET_PROPERTY(CACHE ENABLE_SIMD_FLAGS PROPERTY STRINGS none SSE2 AVX AVX2)
 
 if(${ENABLE_SIMD_FLAGS} MATCHES "AVX2")
@@ -334,15 +369,6 @@ elseif(${ENABLE_SIMD_FLAGS} MATCHES "SSE2")
   add_definitions(/arch:SSE2)
 endif()
 
-if(${TARGET_ARCH} STREQUAL zen4 OR 
-   ${TARGET_ARCH} STREQUAL amdzen)
-  set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/1/bli_amaxv_zen_int_avx512.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
-  set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
-  set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
-  set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_dgemm_skx_asm_16x14.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
-  set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c PROPERTIES COMPILE_FLAGS /arch:AVX512)
-endif()
-
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W0 ")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Oi")
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /MP")
@@ -588,10 +614,37 @@ set(BLIS_VERSION_STRING ${BLIS_VERSION})
 string(TIMESTAMP BUILD_DATE "%Y%m%d")
 add_definitions(-DBLIS_VERSION_STRING="AOCL-BLIS ${BLIS_VERSION_STRING} Build ${BUILD_DATE}")
 
+# Set object libraries created in kernels directory to be added into BLIS library.
+set(OBJECT_LIBRARIES
+    $<TARGET_OBJECTS:zen_1>
+    $<TARGET_OBJECTS:zen_1f>
+    $<TARGET_OBJECTS:zen_2>
+    $<TARGET_OBJECTS:zen_3>
+    $<TARGET_OBJECTS:zen_3_sup>
+    $<TARGET_OBJECTS:haswell_1m>
+    $<TARGET_OBJECTS:haswell_3>
+    $<TARGET_OBJECTS:haswell_3sup>
+    $<TARGET_OBJECTS:haswell_3supd6x8>
+)
+# Amend the list with the zen4/skx objects; quoted so an empty TARGET_ARCH cannot break if().
+if("${TARGET_ARCH}" STREQUAL "zen4" OR
+   "${TARGET_ARCH}" STREQUAL "amdzen")
+   list(APPEND OBJECT_LIBRARIES
+      $<TARGET_OBJECTS:zen4_1>
+      $<TARGET_OBJECTS:zen4_1m>
+      $<TARGET_OBJECTS:zen4_3>
+      $<TARGET_OBJECTS:zen4_3sup>
+      $<TARGET_OBJECTS:zen4_3supd24x8>
+      $<TARGET_OBJECTS:skx_3>
+   )
+endif()
+
 if(BUILD_SHARED_LIBS)
     add_library("${PROJECT_NAME}" SHARED ${CMAKE_SOURCE_DIR}/bli_config.h
                          ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h
-                         ${headers})
+                         ${headers} 
+                         ${OBJECT_LIBRARIES}
+                         )
     if(ENABLE_OPENMP)
         target_link_libraries("${PROJECT_NAME}" PRIVATE OpenMP::OpenMP_CXX)
     endif()
@@ -601,7 +654,9 @@ endif()
 if(NOT BUILD_SHARED_LIBS)
     add_library("${PROJECT_NAME}" STATIC ${CMAKE_SOURCE_DIR}/bli_config.h
                          ${CMAKE_SOURCE_DIR}/include/${TARGET_ARCH}/blis.h
-                         ${headers})
+                         ${headers} 
+                         ${OBJECT_LIBRARIES}
+                         )
     if(ENABLE_OPENMP)
         set_target_properties("${PROJECT_NAME}" PROPERTIES LINKER_LANGUAGE C OUTPUT_NAME "${LIB_NAME}" STATIC_LIBRARY_OPTIONS "${OpenMP_libomp_LIBRARY}")
     else()
diff --git a/Makefile b/Makefile
index 1f86acc7e5..0a1a4646ad 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -273,9 +273,11 @@ BASE_OBJ_CBLAS_PATH := $(BASE_OBJ_FRAME_PATH)/compat/cblas
 ifeq ($(MK_ENABLE_CBLAS),no)
 MK_BLIS_OBJS        := $(filter-out $(BASE_OBJ_CBLAS_PATH)/%.o, $(MK_BLIS_OBJS) )
 endif
-ifeq ($(MK_ENABLE_BLAS),no)
-MK_BLIS_OBJS        := $(filter-out $(BASE_OBJ_BLAS_PATH)/%.o,  $(MK_BLIS_OBJS) )
-endif
+# Include bla_ files so that we get the *_blis_impl interfaces. Actual BLAS
+# interfaces will not be included from these files when MK_ENABLE_BLAS is no.
+##ifeq ($(MK_ENABLE_BLAS),no)
+##MK_BLIS_OBJS        := $(filter-out $(BASE_OBJ_BLAS_PATH)/%.o,  $(MK_BLIS_OBJS) )
+##endif
 
 
 
diff --git a/README.md b/README.md
index 28179306c7..ce923198e4 100644
--- a/README.md
+++ b/README.md
@@ -1,714 +1,11 @@
-![The BLIS cat is sleeping.](http://www.cs.utexas.edu/users/field/blis_cat.png)
+# AOCL-BLAS library
 
-[![Build Status](https://travis-ci.org/flame/blis.svg?branch=master)](https://travis-ci.org/flame/blis)
-[![Build Status](https://ci.appveyor.com/api/projects/status/github/flame/blis?branch=master&svg=true)](https://ci.appveyor.com/project/shpc/blis/branch/master)
+AOCL-BLAS is AMD's optimized version of BLAS targeted for AMD EPYC and Ryzen CPUs. It is developed as a forked version of BLIS (https://github.com/flame/blis), which is developed by members of the [Science of High-Performance Computing](http://shpc.oden.utexas.edu/) (SHPC) group in the [Oden Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/) at [The University of Texas at Austin](https://www.utexas.edu/) and other collaborators (including AMD). All known features and functionalities of BLIS are retained and supported in the AOCL-BLAS library. AOCL-BLAS is regularly updated with the improvements from the upstream repository.
 
-Contents
---------
+AOCL-BLAS is optimized with the SSE2, AVX2, and AVX512 instruction sets, which are enabled based on the target Zen architecture using the dynamic dispatch feature. All prominent Level 3, Level 2, and Level 1 APIs are designed and optimized with specific code paths targeting different size ranges, e.g., small, medium, and large sizes. These algorithms are designed and customized to exploit the architectural improvements of the target platform.
 
-* **[Introduction](#introduction)**
-* **[Education and Learning](#education-and-learning)**
-* **[What's New](#whats-new)**
-* **[What People Are Saying About BLIS](#what-people-are-saying-about-blis)**
-* **[Key Features](#key-features)**
-* **[How to Download BLIS](#how-to-download-blis)**
-* **[Getting Started](#getting-started)**
-* **[Documentation](#documentation)**
-* **[External Packages](#external-packages)**
-* **[Discussion](#discussion)**
-* **[Contributing](#contributing)**
-* **[Citations](#citations)**
-* **[Funding](#funding)**
+For detailed instructions on how to configure, build, install, and link against AOCL-BLAS on AMD CPUs, please refer to the AOCL User Guide located on the AMD developer [portal](https://www.amd.com/en/developer/aocl.html).
 
-Introduction
-------------
-
-BLIS is a portable software framework for instantiating high-performance
-BLAS-like dense linear algebra libraries. The framework was designed to isolate
-essential kernels of computation that, when optimized, immediately enable
-optimized implementations of most of its commonly used and computationally
-intensive operations. BLIS is written in [ISO
-C99](http://en.wikipedia.org/wiki/C99) and available under a
-[new/modified/3-clause BSD
-license](http://opensource.org/licenses/BSD-3-Clause). While BLIS exports a
-[new BLAS-like API](docs/BLISTypedAPI.md),
-it also includes a BLAS compatibility layer which gives application developers
-access to BLIS implementations via traditional [BLAS routine
-calls](http://www.netlib.org/lapack/lug/node145.html).
-An [object-based API](docs/BLISObjectAPI.md) unique to BLIS is also available.
-
-For a thorough presentation of our framework, please read our
-[ACM Transactions on Mathematical Software (TOMS)](https://toms.acm.org/)
-journal article, ["BLIS: A Framework for Rapidly Instantiating BLAS
-Functionality"](http://dl.acm.org/authorize?N91172).
-For those who just want an executive summary, please see the
-[Key Features](#key-features) section below.
-
-In a follow-up article (also in [ACM TOMS](https://toms.acm.org/)),
-["The BLIS Framework: Experiments in
-Portability"](http://dl.acm.org/authorize?N16240),
-we investigate using BLIS to instantiate level-3 BLAS implementations on a
-variety of general-purpose, low-power, and multicore architectures.
-
-An IPDPS'14 conference paper titled ["Anatomy of High-Performance Many-Threaded
-Matrix
-Multiplication"](http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf)
-systematically explores the opportunities for parallelism within the five loops
-that BLIS exposes in its matrix multiplication algorithm.
-
-For other papers related to BLIS, please see the
-[Citations section](#citations) below.
-
-It is our belief that BLIS offers substantial benefits in productivity when
-compared to conventional approaches to developing BLAS libraries, as well as a
-much-needed refinement of the BLAS interface, and thus constitutes a major
-advance in dense linear algebra computation. While BLIS remains a
-work-in-progress, we are excited to continue its development and further
-cultivate its use within the community.
-
-The BLIS framework is primarily developed and maintained by individuals in the
-[Science of High-Performance Computing](http://shpc.ices.utexas.edu/)
-(SHPC) group in the
-[Oden Institute for Computational Engineering and Sciences](https://www.oden.utexas.edu/)
-at [The University of Texas at Austin](https://www.utexas.edu/).
-Please visit the [SHPC](http://shpc.ices.utexas.edu/) website for more
-information about our research group, such as a list of
-[people](http://shpc.ices.utexas.edu/people.html)
-and [collaborators](http://shpc.ices.utexas.edu/collaborators.html),
-[funding sources](http://shpc.ices.utexas.edu/funding.html),
-[publications](http://shpc.ices.utexas.edu/publications.html),
-and [other educational projects](http://www.ulaff.net/) (such as MOOCs).
-
-Education and Learning
-----------------------
-
-Want to understand what's under the hood?
-Many of the same concepts and principles employed when developing BLIS are
-introduced and taught in a basic pedagogical setting as part of
-[LAFF-On Programming for High Performance (LAFF-On-PfHP)](http://www.ulaff.net/),
-one of several massive open online courses (MOOCs) in the
-[Linear Algebra: Foundations to Frontiers](http://www.ulaff.net/) series,
-all of which are available for free via the [edX platform](http://www.edx.org/).
-
-What's New
-----------
-
- * **Multithreaded small/skinny matrix support for sgemm now available!** Thanks to
-funding and hardware support from Oracle, we have now accelerated `gemm` for
-single-precision real matrix problems where one or two dimensions is exceedingly
-small. This work is similar to the `gemm` optimization announced last year.
-For now, we have only gathered performance results on an AMD Epyc Zen2 system, but
-we hope to publish additional graphs for other architectures in the future. You may
-find these Zen2 graphs via the [PerformanceSmall](docs/PerformanceSmall.md) document.
-
- * **BLIS awarded SIAM Activity Group on Supercomputing Best Paper Prize for 2020!**
-We are thrilled to announce that the paper that we internally refer to as the
-second BLIS paper,
-
-   "The BLIS Framework: Experiments in Portability." Field G. Van Zee, Tyler Smith, Bryan Marker, Tze Meng Low, Robert A. van de Geijn, Francisco Igual, Mikhail Smelyanskiy, Xianyi Zhang, Michael Kistler, Vernon Austel, John A. Gunnels, Lee Killough. ACM Transactions on Mathematical Software (TOMS), 42(2):12:1--12:19, 2016.
-
-   was selected for the [SIAM Activity Group on Supercomputing Best Paper Prize](https://www.siam.org/prizes-recognition/activity-group-prizes/detail/siag-sc-best-paper-prize)
-for 2020. The prize is awarded once every two years to a paper judged to be
-the most outstanding paper in the field of parallel scientific and engineering
-computing, and has only been awarded once before (in 2016) since its inception
-in 2015 (the committee did not award the prize in 2018). The prize
-[was awarded](https://www.oden.utexas.edu/about/news/ScienceHighPerfomanceComputingSIAMBestPaperPrize/)
-at the [2020 SIAM Conference on Parallel Processing for Scientific Computing](https://www.siam.org/conferences/cm/conference/pp20) in Seattle. Robert was present at
-the conference to give
-[a talk on BLIS](https://meetings.siam.org/sess/dsp_programsess.cfm?SESSIONCODE=68266) and accept the prize alongside other coauthors.
-The selection committee sought to recognize the paper, "which validates BLIS,
-a framework relying on the notion of microkernels that enables both productivity
-and high performance." Their statement continues, "The framework will continue
-having an important influence on the design and the instantiation of dense linear
-algebra libraries."
-
- * **Multithreaded small/skinny matrix support for dgemm now available!** Thanks to
-contributions made possible by our partnership with AMD, we have dramatically
-accelerated `gemm` for double-precision real matrix problems where one or two
-dimensions is exceedingly small. A natural byproduct of this optimization is
-that the traditional case of small _m = n = k_ (i.e. square matrices) is also
-accelerated, even though it was not targeted specifically. And though only
-`dgemm` was optimized for now, support for other datatypes and/or other operations
-may be implemented in the future. We've also added new graphs to the
-[PerformanceSmall](docs/PerformanceSmall.md) document to showcase multithreaded
-performance when one or more matrix dimensions are small.
-
- * **Performance comparisons now available!** We recently measured the
-performance of various level-3 operations on a variety of hardware architectures,
-as implemented within BLIS and other BLAS libraries for all four of the standard
-floating-point datatypes. The results speak for themselves! Check out our
-extensive performance graphs and background info in our new
-[Performance](docs/Performance.md) document.
-
- * **BLIS is now in Debian Unstable!** Thanks to Debian developer-maintainers
-[M. Zhou](https://github.com/cdluminate) and
-[Nico Schlömer](https://github.com/nschloe) for sponsoring our package in Debian.
-Their participation, contributions, and advocacy were key to getting BLIS into
-the second-most popular Linux distribution (behind Ubuntu, which Debian packages
-feed into). The Debian tracker page may be found
-[here](https://tracker.debian.org/pkg/blis).
-
- * **BLIS now supports mixed-datatype gemm!** The `gemm` operation may now be
-executed on operands of mixed domains and/or mixed precisions. Any combination
-of storage datatype for A, B, and C is now supported, along with a separate
-computation precision that can differ from the storage precision of A and B.
-And even the 1m method now supports mixed-precision computation.
-For more details, please see our [ACM TOMS](https://toms.acm.org/) journal
-article submission ([current
-draft](http://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf)).
-
- * **BLIS now implements the 1m method.** Let's face it: writing complex
-assembly `gemm` microkernels for a new architecture is never a priority--and
-now, it almost never needs to be. The 1m method leverages existing real domain
-`gemm` microkernels to implement all complex domain level-3 operations. For
-more details, please see our [ACM TOMS](https://toms.acm.org/) journal article
-submission ([current
-draft](http://www.cs.utexas.edu/users/flame/pubs/blis6_toms_rev2.pdf)).
-
-What People Are Saying About BLIS
----------------------------------
-
-*["I noticed a substantial increase in multithreaded performance on my own
-machine, which was extremely satisfying."](https://groups.google.com/d/msg/blis-discuss/8iu9B5KCxpA/uftpjgIsBwAJ)* ... *["[I was] happy it worked so well!"](https://groups.google.com/d/msg/blis-discuss/8iu9B5KCxpA/uftpjgIsBwAJ)* (Justin Shea)
-
-*["This is an awesome library."](https://github.com/flame/blis/issues/288#issuecomment-447488637)* ... *["I want to thank you and the blis team for your efforts."](https://github.com/flame/blis/issues/288#issuecomment-448074704)* ([@Lephar](https://github.com/Lephar))
-
-*["Any time somebody outside Intel beats MKL by a nontrivial amount, I report it to the MKL team. It is fantastic for any open-source project to get within 10% of MKL... [T]his is why Intel funds BLIS development."](https://github.com/flame/blis/issues/264#issuecomment-428673275)* ([@jeffhammond](https://github.com/jeffhammond))
-
-*["So BLIS is now a part of Elk."](https://github.com/flame/blis/issues/267#issuecomment-429303902)* ... *["We have found that zgemm applied to a 15000x15000 matrix with multi-threaded BLIS on a 32-core Ryzen 2990WX processor is about twice as fast as MKL"](https://github.com/flame/blis/issues/264#issuecomment-428373946)* ... *["I'm starting to like this a lot."](https://github.com/flame/blis/issues/264#issuecomment-428926191)* ([@jdk2016](https://github.com/jdk2016))
-
-*["I [found] BLIS because I was looking for BLAS operations on C-ordered arrays for NumPy. BLIS has that, but even better is the fact that it's developed in the open using a more modern language than Fortran."](https://github.com/flame/blis/issues/254#issuecomment-423838345)* ([@nschloe](https://github.com/nschloe))
-
-*["The specific reason to have BLIS included [in Linux distributions] is the KNL and SKX [AVX-512] BLAS support, which OpenBLAS doesn't have."](https://github.com/flame/blis/issues/210#issuecomment-393126303)* ([@loveshack](https://github.com/loveshack))
-
-*["All tests pass without errors on OpenBSD. Thanks!"](https://github.com/flame/blis/issues/202#issuecomment-389691543)* ([@ararslan](https://github.com/ararslan))
-
-*["Thank you very much for your great help!... Looking forward to benchmarking."](https://github.com/flame/blis/issues/180#issuecomment-375895449)* ([@mrader1248](https://github.com/mrader1248))
-
-*["Thanks for the beautiful work."](https://github.com/flame/blis/issues/163#issue-286575452)* ([@mmrmo](https://github.com/mmrmo))
-
-*["[M]y software currently uses BLIS for its BLAS interface..."](https://github.com/flame/blis/issues/129#issuecomment-302904805)* ([@ShadenSmith](https://github.com/ShadenSmith))
-
-*["[T]hanks so much for your work on this! Excited to test."](https://github.com/flame/blis/issues/129#issuecomment-341565071)* ... *["[On AMD Excavator], BLIS is competitive to / slightly faster than OpenBLAS for dgemms in my tests."](https://github.com/flame/blis/issues/129#issuecomment-341608673)* ([@iotamudelta](https://github.com/iotamudelta))
-
-*["BLIS provided the only viable option on KNL, whose ecosystem is at present dominated by blackbox toolchains. Thanks again. Keep on this great work."](https://github.com/flame/blis/issues/116#issuecomment-281225101)* ([@heroxbd](https://github.com/heroxbd))
-
-*["I want to definitely try this out..."](https://github.com/flame/blis/issues/12#issuecomment-48086295)* ([@ViralBShah](https://github.com/ViralBShah))
-
-Key Features
-------------
-
-BLIS offers several advantages over traditional BLAS libraries:
-
- * **Portability that doesn't impede high performance.** Portability was a top
-priority of ours when creating BLIS. With virtually no additional effort on the
-part of the developer, BLIS is configurable as a fully-functional reference
-implementation. But more importantly, the framework identifies and isolates a
-key set of computational kernels which, when optimized, immediately and
-automatically optimize performance across virtually all level-2 and level-3
-BLIS operations. In this way, the framework acts as a productivity multiplier.
-And since the optimized (non-portable) code is compartmentalized within these
-few kernels, instantiating a high-performance BLIS library on a new
-architecture is a relatively straightforward endeavor.
-
- * **Generalized matrix storage.** The BLIS framework exports interfaces that
-allow one to specify both the row stride and column stride of a matrix. This
-allows one to compute with matrices stored in column-major order, row-major
-order, or by general stride. (This latter storage format is important for those
-seeking to implement tensor contractions on multidimensional arrays.)
-Furthermore, since BLIS tracks stride information for each matrix, operands of
-different storage formats can be used within the same operation invocation. By
-contrast, BLAS requires column-major storage. And while the CBLAS interface
-supports row-major storage, it does not allow mixing storage formats.
-
- * **Rich support for the complex domain.** BLIS operations are developed and
-expressed in their most general form, which is typically in the complex domain.
-These formulations then simplify elegantly down to the real domain, with
-conjugations becoming no-ops. Unlike the BLAS, all input operands in BLIS that
-allow transposition and conjugate-transposition also support conjugation
-(without transposition), which obviates the need for thread-unsafe workarounds.
-Also, where applicable, both complex symmetric and complex Hermitian forms are
-supported. (BLAS omits some complex symmetric operations, such as `symv`,
-`syr`, and `syr2`.) Another great example of BLIS serving as a portability
-lever is its implementation of the 1m method for complex matrix multiplication,
-a novel mechanism of providing high-performance complex level-3 operations using
-only real domain microkernels. This new innovation guarantees automatic level-3
-support in the complex domain even when the kernel developers entirely forgo
-writing complex kernels.
-
- * **Advanced multithreading support.** BLIS allows multiple levels of
-symmetric multithreading for nearly all level-3 operations. (Currently, users
-may choose to obtain parallelism via either OpenMP or POSIX threads). This
-means that matrices may be partitioned in multiple dimensions simultaneously to
-attain scalable, high-performance parallelism on multicore and many-core
-architectures. The key to this innovation is a thread-specific control tree
-infrastructure which encodes information about the logical thread topology and
-allows threads to query and communicate data amongst one another. BLIS also
-employs so-called "quadratic partitioning" when computing dimension sub-ranges
-for each thread, so that arbitrary diagonal offsets of structured matrices with
-unreferenced regions are taken into account to achieve proper load balance.
-More recently, BLIS introduced a runtime abstraction to specify parallelism on
-a per-call basis, which is useful for applications that want to handle most of
-the parallelism.
-
- * **Ease of use.** The BLIS framework, and the library of routines it
-generates, are easy to use for end users, experts, and vendors alike. An
-optional BLAS compatibility layer provides application developers with
-backwards compatibility to existing BLAS-dependent codes. Or, one may adjust or
-write their application to take advantage of new BLIS functionality (such as
-generalized storage formats or additional complex operations) by calling one
-of BLIS's native APIs directly. BLIS's typed API will feel familiar to many
-veterans of BLAS since these interfaces use BLAS-like calling sequences. And
-many will find BLIS's object-based APIs a delight to use when customizing
-or writing their own BLIS operations. (Objects are relatively lightweight
-`structs` and passed by address, which helps tame function calling overhead.)
-
- * **Multilayered API, exposed kernels, and sandboxes.** The BLIS framework
-exposes its
-implementations in various layers, allowing expert developers to access exactly
-the functionality desired. This layered interface includes that of the
-lowest-level kernels, for those who wish to bypass the bulk of the framework.
-Optimizations can occur at various levels, in part thanks to exposed packing
-and unpacking facilities, which by default are highly parameterized and
-flexible. And more recently, BLIS introduced sandboxes--a way to provide
-alternative implementations of `gemm` that do not use any more of the BLIS
-infrastructure than is desired. Sandboxes provide a convenient and
-straightforward way of modifying the `gemm` implementation without disrupting
-any other level-3 operation or any other part of the framework. This works
-especially well when the developer wants to experiment with new optimizations
-or try a different algorithm.
-
- * **Functionality that grows with the community's needs.** As its name
-suggests, the BLIS framework is not a single library or static API, but rather
-a nearly-complete template for instantiating high-performance BLAS-like
-libraries. Furthermore, the framework is extensible, allowing developers to
-leverage existing components to support new operations as they are identified.
-If such operations require new kernels for optimal efficiency, the framework
-and its APIs will be adjusted and extended accordingly.
-
- * **Code re-use.** Auto-generation approaches to achieving the aforementioned
-goals tend to quickly lead to code bloat due to the multiple dimensions of
-variation supported: operation (i.e. `gemm`, `herk`, `trmm`, etc.); parameter
-case (i.e. side, [conjugate-]transposition, upper/lower storage, unit/non-unit
-diagonal); datatype (i.e. single-/double-precision real/complex); matrix
-storage (i.e. row-major, column-major, generalized); and algorithm (i.e.
-partitioning path and kernel shape). These "brute force" approaches often
-consider and optimize each operation or case combination in isolation, which is
-less than ideal when the goal is to provide entire libraries. BLIS was designed
-to be a complete framework for implementing basic linear algebra operations,
-but supporting this vast amount of functionality in a manageable way required a
-holistic design that employed careful abstractions, layering, and recycling of
-generic (highly parameterized) codes, subject to the constraint that high
-performance remain attainable.
-
- * **A foundation for mixed domain and/or mixed precision operations.** BLIS
-was designed with the hope of one day allowing computation on real and complex
-operands within the same operation. Similarly, we wanted to allow mixing
-operands' numerical domains, floating-point precisions, or both domain and
-precision, and to optionally compute in a precision different than one or both
-operands' storage precisions. This feature has been implemented for the general
-matrix multiplication (`gemm`) operation, providing 128 different possible type
-combinations, which, when combined with existing transposition, conjugation,
-and storage parameters, enables 55,296 different `gemm` use cases. For more
-details, please see the documentation on [mixed datatype](docs/MixedDatatypes.md)
-support and/or our [ACM TOMS](https://toms.acm.org/) journal paper on
-mixed-domain/mixed-precision `gemm` ([linked below](#citations)).
-
-How to Download BLIS
---------------------
-
-There are a few ways to download BLIS. We list the most common four ways below.
-We **highly recommend** using either Option 1 or 2. Otherwise, we recommend
-Option 3 (over Option 4) so your compiler can perform optimizations specific
-to your hardware.
-
-1. **Download a source repository with `git clone`.**
-Generally speaking, we prefer using `git clone` to clone a `git` repository.
-Having a repository allows the user to periodically pull in the latest changes
-and quickly rebuild BLIS whenever they wish. Also, implicit in cloning a
-repository is that the repository defaults to using the `master` branch, which
-contains the latest "stable" commits since the most recent release. (This is
-in contrast to Option 3 in which the user is opting for code that may be
-slightly out of date.)
-
-   In order to clone a `git` repository of BLIS, please obtain a repository
-URL by clicking on the green button above the file/directory listing near the
-top of this page (as rendered by GitHub). Generally speaking, it will amount
-to executing the following command in your terminal shell:
-   ```
-   git clone https://github.com/flame/blis.git
-   ```
-
-2. **Download a source repository via a zip file.**
-If you are uncomfortable with using `git` but would still like the latest
-stable commits, we recommend that you download BLIS as a zip file.
-
-   In order to download a zip file of the BLIS source distribution, please
-click on the green button above the file listing near the top of this page.
-This should reveal a link for downloading the zip file.
-
-3. **Download a source release via a tarball/zip file.**
-Alternatively, if you would like to stick to the code that is included in
-official releases, you may download either a tarball or zip file of any of
-BLIS's previous [tagged releases](https://github.com/flame/blis/releases).
-We consider this option to be less than ideal for most people since it will
-likely mean you miss out on the latest bugfix or feature commits (in contrast
-to Options 1 or 2), and you also will not be able to update your code with a
-simple `git pull` command (in contrast to Option 1).
-
-4. **Download a binary package specific to your OS.**
-While we don't recommend this as the first choice for most users, we provide
-links to community members who generously maintain BLIS packages for various
-Linux distributions such as Debian Unstable and EPEL/Fedora. Please see the
-[External Packages](#external-packages) section below for more information.
-
-Getting Started
----------------
-
-*NOTE: This section assumes you've either cloned a BLIS source code repository
-via `git`, downloaded the latest source code via a zip file, or downloaded the
-source code for a tagged version release---Options 1, 2, or 3, respectively,
-as discussed in [the previous section](#how-to-download-blis).*
-
-If you just want to build a sequential (not parallelized) version of BLIS
-in a hurry and come back and explore other topics later, you can configure
-and build BLIS as follows:
-```
-$ ./configure auto
-$ make [-j]
-```
-You can then verify your build by running BLAS- and BLIS-specific test
-drivers via `make check`:
-```
-$ make check [-j]
-```
-And if you would like to install BLIS to the directory specified to `configure`
-via the `--prefix` option, run the `install` target:
-```
-$ make install
-```
-Please read the output of `./configure --help` for a full list of configure-time
-options.
-If/when you have time, we *strongly* encourage you to read the detailed
-walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
-guide.
-
-Documentation
--------------
-
-We provide extensive documentation on the BLIS build system, APIs, test
-infrastructure, and other important topics. All documentation is formatted in
-markdown and included in the BLIS source distribution (usually in the `docs`
-directory). Slightly longer descriptions of each document may be found via in
-the project's [wiki](https://github.com/flame/blis/wiki) section.
-
-**Documents for everyone:**
-
- * **[Build System](docs/BuildSystem.md).** This document covers the basics of
-configuring and building BLIS libraries, as well as related topics.
-
- * **[Testsuite](docs/Testsuite.md).** This document describes how to run
-BLIS's highly parameterized and configurable test suite, as well as the
-included BLAS test drivers.
-
- * **[BLIS Typed API Reference](docs/BLISTypedAPI.md).** Here we document the
-so-called "typed" (or BLAS-like) API. This is the API that many users who are
-already familiar with the BLAS will likely want to use. You can find lots of
-example code for the typed API in the [examples/tapi](examples/tapi) directory
-included in the BLIS source distribution.
-
- * **[BLIS Object API Reference](docs/BLISObjectAPI.md).** Here we document
-the object API. This is API abstracts away properties of vectors and matrices
-within `obj_t` structs that can be queried with accessor functions. Many
-developers and experts prefer this API over the typed API. You can find lots of
-example code for the object API in the [examples/oapi](examples/oapi) directory
-included in the BLIS source distribution.
-
- * **[Hardware Support](docs/HardwareSupport.md).** This document maintains a
-table of supported microarchitectures.
-
- * **[Multithreading](docs/Multithreading.md).** This document describes how to
-use the multithreading features of BLIS.
-
- * **[Mixed-Datatypes](docs/MixedDatatypes.md).** This document provides an
-overview of BLIS's mixed-datatype functionality and provides a brief example
-of how to take advantage of this new code.
-
- * **[Performance](docs/Performance.md).** This document reports empirically
-measured performance of a representative set of level-3 operations on a variety
-of hardware architectures, as implemented within BLIS and other BLAS libraries
-for all four of the standard floating-point datatypes.
-
- * **[PerformanceSmall](docs/PerformanceSmall.md).** This document reports
-empirically measured performance of `gemm` on select hardware architectures
-within BLIS and other BLAS libraries when performing matrix problems where one
-or two dimensions is exceedingly small.
-
- * **[Release Notes](docs/ReleaseNotes.md).** This document tracks a summary of
-changes included with each new version of BLIS, along with contributor credits
-for key features.
-
- * **[Frequently Asked Questions](docs/FAQ.md).** If you have general questions
-about BLIS, please read this FAQ. If you can't find the answer to your question,
-please feel free to join the [blis-devel](https://groups.google.com/group/blis-devel)
-mailing list and post a question. We also have a
-[blis-discuss](https://groups.google.com/group/blis-discuss) mailing list that
-anyone can post to (even without joining).
-
-**Documents for github contributors:**
-
- * **[Contributing bug reports, feature requests, PRs, etc](CONTRIBUTING.md).**
-Interested in contributing to BLIS? Please read this document before getting
-started. It provides a general overview of how best to report bugs, propose new
-features, and offer code patches.
-
- * **[Coding Conventions](docs/CodingConventions.md).** If you are interested or
-planning on contributing code to BLIS, please read this document so that you can
-format your code in accordance with BLIS's standards.
-
-**Documents for BLIS developers:**
-
- * **[Kernels Guide](docs/KernelsHowTo.md).** If you would like to learn more
-about the types of kernels that BLIS exposes, their semantics, the operations
-that each kernel accelerates, and various implementation issues, please read
-this guide.
-
- * **[Configuration Guide](docs/ConfigurationHowTo.md).** If you would like to
-learn how to add new sub-configurations or configuration families, or are simply
-interested in learning how BLIS organizes its configurations and kernel sets,
-please read this thorough walkthrough of the configuration system.
-
- * **[Sandbox Guide](docs/Sandboxes.md).** If you are interested in learning
-about using sandboxes in BLIS--that is, providing alternative implementations
-of the `gemm` operation--please read this document.
-
-External Packages
------------------
-
-Generally speaking, we **highly recommend** building from source whenever
-possible using the latest `git` clone. (Tarballs of each
-[tagged release](https://github.com/flame/blis/releases) are also available, but
-we consider them to be less ideal since they are not as easy to upgrade as
-`git` clones.)
-
-That said, some users may prefer binary and/or source packages through their
-Linux distribution. Thanks to generous involvement/contributions from our
-community members, the following BLIS packages are now available:
-
- * **Debian**. [M. Zhou](https://github.com/cdluminate) has volunteered to
-sponsor and maintain BLIS packages within the Debian Linux distribution. The
-Debian package tracker can be found [here](https://tracker.debian.org/pkg/blis).
-(Also, thanks to [Nico Schlömer](https://github.com/nschloe) for previously
-volunteering his time to set up a standalone PPA.)
-
- * **Gentoo**. [M. Zhou](https://github.com/cdluminate) also maintains the
-[BLIS package](https://packages.gentoo.org/packages/sci-libs/blis) entry for
-[Gentoo](https://www.gentoo.org/), a Linux distribution known for its
-source-based [portage](https://wiki.gentoo.org/wiki/Portage) package manager
-and distribution system.
-
- * **EPEL/Fedora**. There are official BLIS packages in Fedora and EPEL (for
-RHEL7+ and compatible distributions) with versions for 64-bit integers, OpenMP,
-and pthreads, and shims which can be dynamically linked instead of reference
-BLAS. (NOTE: For architectures other than intel64, amd64, and maybe arm64, the
-performance of packaged BLIS will be low because it uses unoptimized generic
-kernels; for those architectures, [OpenBLAS](https://github.com/xianyi/OpenBLAS)
-may be a better solution.) [Dave
-Love](https://github.com/loveshack) provides additional packages for EPEL6 in a
-[Fedora Copr](https://copr.fedorainfracloud.org/coprs/loveshack/blis/), and
-possibly versions more recent than the official repo for other EPEL/Fedora
-releases. The source packages may build on other rpm-based distributions.
-
- * **OpenSuSE**. The copr referred to above has rpms for some OpenSuSE releases;
-the source rpms may build for others.
-
- * **GNU Guix**. Guix has BLIS packages, provides builds only for the generic
-target and some specific x86_64 micro-architectures.
-
- * **Conda**. conda channel [conda-forge](https://github.com/conda-forge/blis-feedstock)
-has Linux, OSX and Windows binary packages for x86_64.
-
-Discussion
-----------
-
-You can keep in touch with developers and other users of the project by joining
-one of the following mailing lists:
-
- * [blis-devel](https://groups.google.com/group/blis-devel): Please join and
-post to this mailing list if you are a BLIS developer, or if you are trying
-to use BLIS beyond simply linking to it as a BLAS library.
-**Note:** Most of the interesting discussions happen here; don't be afraid to
-join! If you would like to submit a bug report, or discuss a possible bug,
-please consider opening a [new issue](https://github.com/flame/blis/issues) on
-github.
-
- * [blis-discuss](https://groups.google.com/group/blis-discuss): Please join and
-post to this mailing list if you have general questions or feedback regarding
-BLIS. Application developers (end users) may wish to post here, unless they
-have bug reports, in which case they should open a
-[new issue](https://github.com/flame/blis/issues) on github.
-
-Contributing
-------------
-
-For information on how to contribute to our project, including preferred
-[coding conventions](docs/CodingConventions.md), please refer to the
-[CONTRIBUTING](CONTRIBUTING.md) file at the top-level of the BLIS source
-distribution.
-
-Citations
----------
-
-For those of you looking for the appropriate article to cite regarding BLIS, we
-recommend citing our
-[first ACM TOMS journal paper]( https://dl.acm.org/doi/10.1145/2764454?cid=81314495332)
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis1_toms_rev3.pdf)):
-
-```
-@article{BLIS1,
-   author      = {Field G. {V}an~{Z}ee and Robert A. {v}an~{d}e~{G}eijn},
-   title       = {{BLIS}: A Framework for Rapidly Instantiating {BLAS} Functionality},
-   journal     = {ACM Transactions on Mathematical Software},
-   volume      = {41},
-   number      = {3},
-   pages       = {14:1--14:33},
-   month       = {June},
-   year        = {2015},
-   issue_date  = {June 2015},
-   url         = {http://doi.acm.org/10.1145/2764454},
-}
-```
-
-You may also cite the
-[second ACM TOMS journal paper]( https://dl.acm.org/doi/10.1145/2755561?cid=81314495332)
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis2_toms_rev3.pdf)):
-
-```
-@article{BLIS2,
-   author      = {Field G. {V}an~{Z}ee and Tyler Smith and Francisco D. Igual and
-                  Mikhail Smelyanskiy and Xianyi Zhang and Michael Kistler and Vernon Austel and
-                  John Gunnels and Tze Meng Low and Bryan Marker and Lee Killough and
-                  Robert A. {v}an~{d}e~{G}eijn},
-   title       = {The {BLIS} Framework: Experiments in Portability},
-   journal     = {ACM Transactions on Mathematical Software},
-   volume      = {42},
-   number      = {2},
-   pages       = {12:1--12:19},
-   month       = {June},
-   year        = {2016},
-   issue_date  = {June 2016},
-   url         = {http://doi.acm.org/10.1145/2755561},
-}
-```
-
-We also have a third paper, submitted to IPDPS 2014, on achieving
-[multithreaded parallelism in BLIS](https://dl.acm.org/doi/10.1109/IPDPS.2014.110)
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf)):
-
-```
-@inproceedings{BLIS3,
-   author      = {Tyler M. Smith and Robert A. {v}an~{d}e~{G}eijn and Mikhail Smelyanskiy and
-                  Jeff R. Hammond and Field G. {V}an~{Z}ee},
-   title       = {Anatomy of High-Performance Many-Threaded Matrix Multiplication},
-   booktitle   = {28th IEEE International Parallel \& Distributed Processing Symposium
-                  (IPDPS 2014)},
-   year        = {2014},
-   url         = {https://doi.org/10.1109/IPDPS.2014.110},
-}
-```
-
-A fourth paper, submitted to ACM TOMS, also exists, which proposes an
-[analytical model](https://dl.acm.org/doi/10.1145/2925987)
-for determining blocksize parameters in BLIS
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf)):
-
-```
-@article{BLIS4,
-   author      = {Tze Meng Low and Francisco D. Igual and Tyler M. Smith and
-                  Enrique S. Quintana-Ort\'{\i}},
-   title       = {Analytical Modeling Is Enough for High-Performance {BLIS}},
-   journal     = {ACM Transactions on Mathematical Software},
-   volume      = {43},
-   number      = {2},
-   pages       = {12:1--12:18},
-   month       = {August},
-   year        = {2016},
-   issue_date  = {August 2016},
-   url         = {http://doi.acm.org/10.1145/2925987},
-}
-```
-
-A fifth paper, submitted to ACM TOMS, begins the study of so-called
-[induced methods for complex matrix multiplication]( https://dl.acm.org/doi/10.1145/3086466?cid=81314495332)
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis5_toms_rev2.pdf)):
-
-```
-@article{BLIS5,
-   author      = {Field G. {V}an~{Z}ee and Tyler Smith},
-   title       = {Implementing High-performance Complex Matrix Multiplication via the 3m and 4m Methods},
-   journal     = {ACM Transactions on Mathematical Software},
-   volume      = {44},
-   number      = {1},
-   pages       = {7:1--7:36},
-   month       = {July},
-   year        = {2017},
-   issue_date  = {July 2017},
-   url         = {http://doi.acm.org/10.1145/3086466},
-}
-```
-
-A sixth paper, submitted to ACM TOMS, revisits the topic of the previous
-article and derives a
-[superior induced method](https://epubs.siam.org/doi/10.1137/19M1282040)
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis6_sisc_rev3.pdf)):
-
-```
-@article{BLIS6,
-   author      = {Field G. {V}an~{Z}ee},
-   title       = {Implementing High-Performance Complex Matrix Multiplication via the 1m Method},
-   journal     = {SIAM Journal on Scientific Computing},
-   volume      = {42},
-   number      = {5},
-   pages       = {C221--C244},
-   month       = {September}
-   year        = {2020},
-   issue_date  = {September 2020},
-   url         = {https://doi.org/10.1137/19M1282040}
-}
-```
-
-A seventh paper, submitted to ACM TOMS, explores the implementation of `gemm` for
-[mixed-domain and/or mixed-precision](https://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf) operands
-([unofficial backup link](https://www.cs.utexas.edu/users/flame/pubs/blis7_toms_rev0.pdf)):
-
-```
-@article{BLIS7,
-   author      = {Field G. {V}an~{Z}ee and Devangi N. Parikh and Robert A. van~de~{G}eijn},
-   title       = {Supporting Mixed-domain Mixed-precision Matrix Multiplication
-within the BLIS Framework},
-   journal     = {ACM Transactions on Mathematical Software},
-   note        = {submitted}
-}
-```
-
-Funding
--------
-
-This project and its associated research were partially sponsored by grants from
-[Microsoft](https://www.microsoft.com/),
-[Intel](https://www.intel.com/),
-[Texas Instruments](https://www.ti.com/),
-[AMD](https://www.amd.com/),
-[HPE](https://www.hpe.com/),
-[Oracle](https://www.oracle.com/),
-[Huawei](https://www.huawei.com/),
-and
-[Facebook](https://www.facebook.com/),
-as well as grants from the
-[National Science Foundation](https://www.nsf.gov/) (Awards
-CCF-0917167, ACI-1148125/1340293, CCF-1320112, and ACI-1550493).
-
-_Any opinions, findings and conclusions or recommendations expressed in this
-material are those of the author(s) and do not necessarily reflect the views of
-the National Science Foundation (NSF)._
+The upstream repository (https://github.com/flame/blis) contains further information on BLIS, including background information on BLIS design, usage examples, and a complete BLIS API reference.
 
+AOCL-BLAS is developed and maintained by AMD. You can contact us by email at toolchainsupport@amd.com. You can also raise issues or suggestions on the GitHub repository at https://github.com/amd/blis/issues.
diff --git a/addon/aocl_gemm/aocl_gemm.h b/addon/aocl_gemm/aocl_gemm.h
index 4e971d932a..44de4ac658 100644
--- a/addon/aocl_gemm/aocl_gemm.h
+++ b/addon/aocl_gemm/aocl_gemm.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -37,5 +37,18 @@
 
 #include "aocl_gemm_post_ops.h"
 #include "aocl_gemm_interface_apis.h"
+#include "aocl_util_interface_apis.h"
+#include "aocl_bf16_type.h"
+#include "lpgemm_config.h"
+#include "lpgemm_post_ops.h"
+#include "lpgemm_kernels.h"
+#include "lpgemm_utils_kernels.h"
+#include "lpgemm_packb_bf16.h"
+#include "lpgemm_packb_s16.h"
+#include "lpgemm_packa.h"
+#include "lpgemm_packb.h"
+#include "lpgemm_packa_s8.h"
+#include "lpgemm_packb_s8.h"
+#include "lpgemm_packb_s8s16.h"
 
 #endif // BLIS_ADDON_LPGEMM
diff --git a/addon/aocl_gemm/aocl_gemm_bf16_utils.c b/addon/aocl_gemm/aocl_gemm_bf16_utils.c
index 7af08b751b..fd9d3be1f7 100644
--- a/addon/aocl_gemm/aocl_gemm_bf16_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16_utils.c
@@ -1,126 +1,136 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "aocl_gemm_interface_apis.h"
-#include "lpgemm_types.h"
-#include "lpgemm_config.h"
-#include "lpgemm_utils.h"
-#include "lpgemm_reorder_bf16.h"
-
-AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
-{
-	if ( ( k <= 0 ) || ( n <= 0 ) )
-	{
-		return 0; // Error.
-	}
-
-	// Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it.
-	if ( bli_cpuid_is_avx512_bf16_supported() == FALSE )
-	{
-		printf(" AVX512_BF16 ISA not supported by processor, cannot perform lpgemm.\n");
-		return 0; // Error.
-	}
-
-	/* Initialize BLIS. */
-	bli_init_auto();
-
-	// Set MC, NC, KC, NR, MR.
-	aocl_lpgemm_init_global_cntx();
-
-	AOCL_MATRIX_TYPE input_mat_type;
-	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
-
-	if ( input_mat_type == A_MATRIX )
-	{
-		return 0; // A reorder not supported.
-	}
-
-	// Extra space since packing does width in multiples of 16. The bf16
-	// instruction can be used as long as atleast one zmm register can be fully
-	// loaded; and since k_dim needs to be atleast 2, having n_dim atleast 16
-	// should give 2x16=32 elements, enough for 1 zmm register.The padding is
-	// not rounded to NR (=64), since that would result in memory wastage.
-	dim_t n_reorder = make_multiple_of_n( n, 16 );
-
-	// Extra space since packing does length in multiples of 2.
-	dim_t k_reorder = make_multiple_of_n( k, 2 );
-
-	siz_t size_req = sizeof( int16_t ) * k_reorder * n_reorder;
-
-	return size_req;
-}
-
-AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
-{
-	if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
-	     ( k <= 0 ) || ( n <= 0 ) || ( ldb < n ) )
-	{
-		return; // Error.
-	}
-
-	// Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it.
-	if ( bli_cpuid_is_avx512_bf16_supported() == FALSE )
-	{
-		printf(" AVX512_BF16 ISA not supported by processor, cannot perform lpgemm.\n");
-		return; // Error.
-	}
-
-	/* Initialize BLIS. */
-	bli_init_auto();
-
-	// Set MC, NC, KC, NR, MR.
-	aocl_lpgemm_init_global_cntx();
-
-	AOCL_MATRIX_TYPE input_mat_type;
-	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
-
-	if ( input_mat_type == A_MATRIX )
-	{
-		return; // A reorder not supported.
-	}
-
-	// Create dummy b_reorder obj.
-	lpgemm_obj_t b_reorder;
-	b_reorder.storage.aligned_buffer = reorder_buf_addr;
-
-	// Create dummy original b obj;
-	lpgemm_obj_t b;
-	b.storage.aligned_buffer = ( void* )input_buf_addr;
-	b.rs = ldb;
-	b.width = n;
-	b.length = k;
-
-	reorderb_nr64_bf16bf16f32of32( &b, &b_reorder );
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils.h"
+#include "lpgemm_reorder_bf16.h"
+
+AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32)
+{
+	if ( ( k <= 0 ) || ( n <= 0 ) )
+	{
+		return 0; // Error.
+	}
+
+	// Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it.
+	if ( bli_cpuid_is_avx512bf16_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_BF16 ISA not supported by processor, "
+				"cannot perform bf16bf16f32 gemm.", __FILE__, __LINE__ );
+		return 0; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
+
+	if ( input_mat_type == A_MATRIX )
+	{
+		return 0; // A reorder not supported.
+	}
+
+	// Extra space since packing does width in multiples of 16. The bf16
+	// instruction can be used as long as at least one zmm register can be fully
+	// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
+	// should give 2x16=32 elements, enough for 1 zmm register. The padding is
+	// not rounded to NR (=64), since that would result in memory wastage.
+	dim_t n_reorder = make_multiple_of_n( n, 16 );
+
+	// Extra space since packing does length in multiples of 2.
+	dim_t k_reorder = make_multiple_of_n( k, 2 );
+
+	siz_t size_req = sizeof( int16_t ) * k_reorder * n_reorder;
+
+	return size_req;
+}
+
+AOCL_GEMM_REORDER(bfloat16, bf16bf16f32of32)
+{
+	if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
+	     ( k <= 0 ) || ( n <= 0 ) || ( ldb < n ) )
+	{
+		return; // Error.
+	}
+
+	// Check if avx512_bf16 ISA is supported, lpgemm matmul only works with it.
+	if ( bli_cpuid_is_avx512bf16_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_BF16 ISA not supported by processor, "
+				"cannot perform bf16bf16f32 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
+
+	if ( input_mat_type == A_MATRIX )
+	{
+		return; // A reorder not supported.
+	}
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_membrk_rntm_set_membrk( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
+
+	// Create dummy b_reorder obj.
+	lpgemm_obj_t b_reorder;
+	b_reorder.storage.aligned_buffer = reorder_buf_addr;
+
+	// Create dummy original b obj;
+	lpgemm_obj_t b;
+	b.storage.aligned_buffer = ( void* )input_buf_addr;
+	b.rs = ldb;
+	b.width = n;
+	b.length = k;
+
+	reorderb_nr64_bf16bf16f32of32( &b, &b_reorder, &rntm_g, lcntx_g );
+}
diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c
index fedf3a43c5..0e0f93e191 100644
--- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32obf16.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -46,10 +46,24 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 	trans_t blis_transa;
 	trans_t blis_transb;
 
+	// There is this use case where lpgemm will be compiled using gcc9.4
+	// (where bf16 ISA is not supported), but deployed on a zen4+ system
+	// (which supports bf16 ISA). Here the bf16 kernels will be concealed
+	// and not compiled, and subsequently this api should error out and
+	// return early, even if bf16 ISA is supported by machine.
+#if defined( BLIS_GCC ) && ( __GNUC__ < 10 )
+	{
+		bli_print_msg("bf16bf16f32obf16 compiled using a compiler not "
+				"supporting BF16 ISA.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+#endif
+
 	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
-	if ( bli_cpuid_is_avx512_bf16_supported() == FALSE )
+	if ( bli_cpuid_is_avx512bf16_supported() == FALSE )
 	{
-		printf(" AVX512_BF16 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX512_BF16 ISA not supported by processor, "
+				"cannot perform bf16bf16f32 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -158,6 +172,8 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 	bli_rntm_init_from_global( &rntm_g );
 	bli_membrk_rntm_set_membrk( &rntm_g );
 
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
+
 #ifdef BLIS_ENABLE_OPENMP
 	// Swapping inputs to induce row major computation for column major inputs.
 	if ( is_column_major == TRUE )
@@ -169,7 +185,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 		  a, rs_a, cs_a, mtag_a,
 		  ( float* )c, rs_c, cs_c,
 		  alpha, beta,
-		  &rntm_g,
+		  &rntm_g, lcntx_g,
 		  post_op_list, TRUE
 		);
 	}
@@ -182,7 +198,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 		  b, rs_b, cs_b, mtag_b,
 		  ( float* )c, rs_c, cs_c,
 		  alpha, beta,
-		  &rntm_g,
+		  &rntm_g, lcntx_g,
 		  post_op_list, TRUE
 		);
 	}
@@ -197,7 +213,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 		  a, rs_a, cs_a, mtag_a,
 		  ( float* )c, rs_c, cs_c,
 		  alpha, beta,
-		  &rntm_g,
+		  &rntm_g, lcntx_g,
 		  post_op_list, TRUE
 		);
 	}
@@ -210,7 +226,7 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 		  b, rs_b, cs_b, mtag_b,
 		  ( float* )c, rs_c, cs_c,
 		  alpha, beta,
-		  &rntm_g,
+		  &rntm_g, lcntx_g,
 		  post_op_list, TRUE
 		);
 	}
diff --git a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c
index 8f87f4dff3..ca8b160220 100644
--- a/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c
+++ b/addon/aocl_gemm/aocl_gemm_bf16bf16f32of32.c
@@ -1,218 +1,241 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "aocl_gemm_interface_apis.h"
-#include "lpgemm_types.h"
-#include "lpgemm_post_ops.h"
-#include "lpgemm_thread_decor_openmp.h"
-#include "lpgemm_5loop_interface_apis.h"
-#include "lpgemm_config.h"
-#include "lpgemm_utils.h"
-
-AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
-{
-	trans_t blis_transa;
-	trans_t blis_transb;
-
-	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
-	if ( bli_cpuid_is_avx512_bf16_supported() == FALSE )
-	{
-		printf(" AVX512_BF16 ISA not supported by processor, cannot perform lpgemm.\n");
-		return; // Error.
-	}
-
-	/* Initialize BLIS. */
-	bli_init_auto();
-
-	// Set MC, NC, KC, NR, MR.
-	aocl_lpgemm_init_global_cntx();
-
-	// Null check for pointers.
-	if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
-	{
-		return; // Error.
-	}
-
-	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
-	bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
-	bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
-
-	/* Perform BLAS parameter checking. */
-	// Transpose not supported.
-	if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
-	     ( blis_transb != BLIS_NO_TRANSPOSE ) )
-	{
-		return; // Error.
-	}
-
-	// Sanitize order input.
-	char order_use =
-			( ( order == 'r' ) || ( order == 'R' ) ||
-			  ( order == 'c' ) || ( order == 'C' ) ) ?
-			order : 'r';
-
-	bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
-	bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
-
-	// Row major input expected with leading dimensions >= row stride.
-	if ( ( is_row_major == TRUE ) &&
-		 ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
-	{
-		return; // Error.
-	}
-	// Column major input expected with leading dimensions >= column stride.
-	else if ( ( is_column_major == TRUE ) &&
-			  ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
-	{
-		return; // Error.
-	}
-
-	// Check if dimensions are valid.
-	if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
-	     ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
-	{
-		return; // Error.
-	}
-
-	const inc_t rs_a = lda;
-	const inc_t cs_a = 1;
-	const inc_t rs_b = ldb;
-	const inc_t cs_b = 1;
-	const inc_t rs_c = ldc;
-	const inc_t cs_c = 1;
-
-	AOCL_MEMORY_TAG mtag_a;
-	AOCL_MEMORY_TAG mtag_b;
-
-	bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
-	bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
-
-	// B matrix needs to be packed in a certain format in order to be loaded
-	// and used in bf16 instrution. As such the mtag_b always needs to be either
-	// packed or reordered. B matrix as it is (unpacked) cannot be used, and
-	// the mtag_b is set to packed to enable runtime packing.
-	if ( ( is_row_major == TRUE ) && ( mtag_b == UNPACKED ) )
-	{
-		mtag_b = PACK;
-	}
-	// Inputs swapped in column major, A becomes B from kernel point of view.
-	else if ( ( is_column_major == TRUE ) && ( mtag_a == UNPACKED ) )
-	{
-		mtag_a = PACK;
-	}
-
-	// Only unpacked A supported now.
-	if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) )
-	{
-		return; // Error.
-	}
-	// Inputs swapped in column major, B becomes A from kernel point of view.
-	else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) )
-	{
-		return; // Error.
-	}
-
-	// Convert post op struct to post op linked list format.
-	lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
-	lpgemm_translate_to_post_ops_list
-	(
-	  post_op_unparsed, post_op_list,
-	  ( void* )c, ( void* )( &order_use )
-	);
-
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_g;
-	bli_rntm_init_from_global( &rntm_g );
-	bli_membrk_rntm_set_membrk( &rntm_g );
-
-#ifdef BLIS_ENABLE_OPENMP
-	// Swapping inputs to induce row major computation for column major inputs.
-	if ( is_column_major == TRUE )
-	{
-		lpgemm_bf16bf16f32of32_openmp_thread_decorator
-		(
-		  n, m, k,
-		  b, rs_b, cs_b, mtag_b,
-		  a, rs_a, cs_a, mtag_a,
-		  c, rs_c, cs_c,
-		  alpha, beta,
-		  &rntm_g,
-		  post_op_list, FALSE
-		);
-	}
-	else
-	{
-		lpgemm_bf16bf16f32of32_openmp_thread_decorator
-		(
-		  m, n, k,
-		  a, rs_a, cs_a, mtag_a,
-		  b, rs_b, cs_b, mtag_b,
-		  c, rs_c, cs_c,
-		  alpha, beta,
-		  &rntm_g,
-		  post_op_list, FALSE
-		);
-	}
-#else
-	// Swapping inputs to induce row major computation for column major inputs.
-	if ( is_column_major == TRUE )
-	{
-		lpgemm_bf16bf16f32of32_thread_decorator
-		(
-		  n, m, k,
-		  b, rs_b, cs_b, mtag_b,
-		  a, rs_a, cs_a, mtag_a,
-		  c, rs_c, cs_c,
-		  alpha, beta,
-		  &rntm_g,
-		  post_op_list, FALSE
-		);
-	}
-	else
-	{
-		lpgemm_bf16bf16f32of32_thread_decorator
-		(
-		  m, n, k,
-		  a, rs_a, cs_a, mtag_a,
-		  b, rs_b, cs_b, mtag_b,
-		  c, rs_c, cs_c,
-		  alpha, beta,
-		  &rntm_g,
-		  post_op_list, FALSE
-		);
-	}
-#endif
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_post_ops.h"
+#include "lpgemm_thread_decor_openmp.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils.h"
+
+AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32)
+{
+	trans_t blis_transa;
+	trans_t blis_transb;
+
+	// There is this use case where lpgemm will be compiled using gcc9.4
+	// (where bf16 ISA is not supported), but deployed on a zen4+ system
+	// (which supports bf16 ISA). Here the bf16 kernels will be concealed
+	// and not compiled, and subsequently this api should error out and
+	// return early, even if bf16 ISA is supported by machine.
+#if defined( BLIS_GCC ) && ( __GNUC__ < 10 )
+	{
+		bli_print_msg("bf16bf16f32of32 compiled using a compiler not "
+				"supporting BF16 ISA.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+#endif
+
+	// Check if avx512_bf16 ISA is supported, lpgemm bf16 matmul only works with it.
+	if ( bli_cpuid_is_avx512bf16_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_BF16 ISA not supported by processor, "
+				"cannot perform bf16bf16f32 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	// Null check for pointers.
+	if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
+	{
+		return; // Error.
+	}
+
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
+	bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
+	bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
+
+	/* Perform BLAS parameter checking. */
+	// Transpose not supported.
+	if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
+	     ( blis_transb != BLIS_NO_TRANSPOSE ) )
+	{
+		return; // Error.
+	}
+
+	// Sanitize order input.
+	char order_use =
+			( ( order == 'r' ) || ( order == 'R' ) ||
+			  ( order == 'c' ) || ( order == 'C' ) ) ?
+			order : 'r';
+
+	bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
+	bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
+
+	// Row major input expected with leading dimensions >= row stride.
+	if ( ( is_row_major == TRUE ) &&
+		 ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
+	{
+		return; // Error.
+	}
+	// Column major input expected with leading dimensions >= column stride.
+	else if ( ( is_column_major == TRUE ) &&
+			  ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
+	{
+		return; // Error.
+	}
+
+	// Check if dimensions are valid.
+	if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
+	     ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	// The strides are set assuming a row major kernel.
+	const inc_t rs_a = lda;
+	const inc_t cs_a = 1;
+	const inc_t rs_b = ldb;
+	const inc_t cs_b = 1;
+	const inc_t rs_c = ldc;
+	const inc_t cs_c = 1;
+
+	AOCL_MEMORY_TAG mtag_a;
+	AOCL_MEMORY_TAG mtag_b;
+
+	bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
+	bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
+
+	if ( ( is_column_major == TRUE ) && ( mtag_b == REORDERED ) )
+	{
+		// Reorder not supported with column major inputs.
+		return;
+	}
+
+	// B matrix needs to be packed in a certain format in order to be loaded
+	// and used in bf16 instruction. As such the mtag_b always needs to be either
+	// packed or reordered. B matrix as it is (unpacked) cannot be used, and
+	// the mtag_b is set to packed to enable runtime packing.
+	if ( ( is_row_major == TRUE ) && ( mtag_b == UNPACKED ) )
+	{
+		mtag_b = PACK;
+	}
+	// Inputs swapped in column major, A becomes B from kernel point of view.
+	else if ( ( is_column_major == TRUE ) && ( mtag_a == UNPACKED ) )
+	{
+		mtag_a = PACK;
+	}
+
+	// Only unpacked A supported now.
+	if ( ( is_row_major == TRUE ) && ( mtag_a != UNPACKED ) )
+	{
+		return; // Error.
+	}
+	// Inputs swapped in column major, B becomes A from kernel point of view.
+	else if ( ( is_column_major == TRUE ) && ( mtag_b != UNPACKED ) )
+	{
+		return; // Error.
+	}
+
+	// Convert post op struct to post op linked list format.
+	lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
+	lpgemm_translate_to_post_ops_list
+	(
+	  post_op_unparsed, post_op_list,
+	  ( void* )c, ( void* )( &order_use )
+	);
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_membrk_rntm_set_membrk( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( BF16BF16F32OF32 );
+
+#ifdef BLIS_ENABLE_OPENMP
+	// Swapping inputs to induce row major computation for column major inputs.
+	if ( is_column_major == TRUE )
+	{
+		lpgemm_bf16bf16f32of32_openmp_thread_decorator
+		(
+		  n, m, k,
+		  b, rs_b, cs_b, mtag_b,
+		  a, rs_a, cs_a, mtag_a,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
+	else
+	{
+		lpgemm_bf16bf16f32of32_openmp_thread_decorator
+		(
+		  m, n, k,
+		  a, rs_a, cs_a, mtag_a,
+		  b, rs_b, cs_b, mtag_b,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
+#else
+	// Swapping inputs to induce row major computation for column major inputs.
+	if ( is_column_major == TRUE )
+	{
+		lpgemm_bf16bf16f32of32_thread_decorator
+		(
+		  n, m, k,
+		  b, rs_b, cs_b, mtag_b,
+		  a, rs_a, cs_a, mtag_a,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
+	else
+	{
+		lpgemm_bf16bf16f32of32_thread_decorator
+		(
+		  m, n, k,
+		  a, rs_a, cs_a, mtag_a,
+		  b, rs_b, cs_b, mtag_b,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
+#endif
+}
diff --git a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c
index 8366f746cb..f3eed1aa65 100644
--- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c
+++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -37,6 +37,7 @@
 #include "lpgemm_types.h"
 #include "lpgemm_post_ops.h"
 #include "lpgemm_thread_decor_openmp.h"
+#include "lpgemm_config.h"
 #include "lpgemm_utils.h"
 #include "lpgemm_5loop_interface_apis.h"
 
@@ -45,16 +46,20 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
 	trans_t blis_transa;
 	trans_t blis_transb;
 
-	// Check if avx ISA is supported, lpgemm fp32 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm fp32 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform f32f32f32 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
 	/* Initialize BLIS. */
 	bli_init_auto();
 
+	// Initialize lpgemm context.
+	aocl_lpgemm_init_global_cntx();
+
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
 	AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(s), transa, transb, m, n, k,\
 	      (void*)&alpha, lda, ldb, (void*)&beta, ldc);
@@ -86,16 +91,20 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
 			( ( order == 'r' ) || ( order == 'R' ) ||
 			  ( order == 'c' ) || ( order == 'C' ) ) ?
 			order : 'r';
-	if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
+
+	bool is_row_major = ( ( order_use == 'r' ) || ( order_use == 'R' ) );
+	bool is_column_major = ( ( order_use == 'c' ) || ( order_use == 'C' ) );
+
+	// Row major input expected with leading dimensions >= row stride.
+	if ( ( is_row_major == TRUE ) &&
+		 ( ( lda < k ) || ( ldb < n ) || ( ldc < n ) ) )
 	{
-		return; // Only row major supported.
+		return; // Error.
 	}
-
-	// Row major input expected with leading dimensions equal to row stride.
-	if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
+	// Column major input expected with leading dimensions >= column stride.
+	else if ( ( is_column_major == TRUE ) &&
+			  ( ( lda < m ) || ( ldb < k ) || ( ldc < m ) ) )
 	{
-		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
-						"Column major and general stride not supported.");
 		return; // Error.
 	}
 
@@ -108,6 +117,7 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
 		return; // Error.
 	}
 
+	// The strides are set assuming a row major kernel.
 	const inc_t rs_a = lda;
 	const inc_t cs_a = 1;
 	const inc_t rs_b = ldb;
@@ -121,11 +131,38 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
 	bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
 	bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
 
-	// Only unreordered A supported now.
-	if ( mtag_a != UNPACKED )
+	if ( ( is_column_major == TRUE ) && ( mtag_b == REORDERED ) )
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
+					"Reordered B matrix not supported in column major case.");
+		return;
+	}
+
+	// By default enable packing for B matrix. Before the 5 loop, based on
+	// the input dimensions, the smart threading logic will adjust it
+	// (disable/enable) accordingly.
+	if ( ( is_row_major == TRUE ) && ( mtag_b == UNPACKED ) )
+	{
+		mtag_b = PACK;
+	}
+	// Inputs swapped in column major, A becomes B from kernel point of view.
+	else if ( ( is_column_major == TRUE ) && ( mtag_a == UNPACKED ) )
+	{
+		mtag_a = PACK;
+	}
+
+	// Reordered A not supported now.
+	if ( ( is_row_major == TRUE ) && ( mtag_a == REORDERED ) )
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
+				"A matrix reordering not supported for row major inputs.");
+		return; // Error.
+	}
+	// Inputs swapped in column major, B becomes A from kernel point of view.
+	else if ( ( is_column_major == TRUE ) && ( mtag_b == REORDERED ) )
 	{
 		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, \
-						"A matrix packing/reordering not supported.");
+				"B matrix reordering not supported for column major inputs.");
 		return; // Error.
 	}
 
@@ -143,31 +180,71 @@ AOCL_GEMM_MATMUL(float,float,float,float,f32f32f32of32)
 	bli_rntm_init_from_global( &rntm_g );
 	bli_membrk_rntm_set_membrk( &rntm_g );
 
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( F32F32F32OF32 );
+
 #ifdef BLIS_ENABLE_OPENMP
-	lpgemm_f32f32f32of32_openmp_thread_decorator
-	(
-	  m, n, k,
-	  a, rs_a, cs_a, mtag_a,
-	  b, rs_b, cs_b, mtag_b,
-	  c, rs_c, cs_c,
-	  alpha, beta,
-	  &rntm_g,
-	  post_op_list, FALSE
-	);
+	// The lpgemm_cntx_t argument will be NULL for f32 since it still uses
+	// BLIS cntx_t internally. It's a workaround for now and will be replaced
+	// with lpgemm_cntx_t eventually.
+	// Swapping inputs to induce row major computation for column major inputs.
+	if ( is_column_major == TRUE )
+	{
+		lpgemm_f32f32f32of32_openmp_thread_decorator
+		(
+		  n, m, k,
+		  b, rs_b, cs_b, mtag_b,
+		  a, rs_a, cs_a, mtag_a,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
+	else
+	{
+		lpgemm_f32f32f32of32_openmp_thread_decorator
+		(
+		  m, n, k,
+		  a, rs_a, cs_a, mtag_a,
+		  b, rs_b, cs_b, mtag_b,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
 #else
-	// Setting pack A by default for non open mp case.
+	// Setting pack A and B by default for non open mp case.
 	bli_rntm_set_pack_a( 1, &rntm_g );
+	bli_rntm_set_pack_b( 1, &rntm_g );
 
-	lpgemm_f32f32f32of32_thread_decorator
-	(
-	  m, n, k,
-	  a, rs_a, cs_a, mtag_a,
-	  b, rs_b, cs_b, mtag_b,
-	  c, rs_c, cs_c,
-	  alpha, beta,
-	  &rntm_g,
-	  post_op_list, FALSE
-	);
+	// Swapping inputs to induce row major computation for column major inputs.
+	if ( is_column_major == TRUE )
+	{
+		lpgemm_f32f32f32of32_thread_decorator
+		(
+		  n, m, k,
+		  b, rs_b, cs_b, mtag_b,
+		  a, rs_a, cs_a, mtag_a,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
+	else
+	{
+		lpgemm_f32f32f32of32_thread_decorator
+		(
+		  m, n, k,
+		  a, rs_a, cs_a, mtag_a,
+		  b, rs_b, cs_b, mtag_b,
+		  c, rs_c, cs_c,
+		  alpha, beta,
+		  &rntm_g, lcntx_g,
+		  post_op_list, FALSE
+		);
+	}
 #endif
 
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
diff --git a/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c b/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c
index 948c1383de..2116e418af 100644
--- a/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_f32f32f32of32_utils.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,6 +34,7 @@
 
 #include "blis.h"
 #include "aocl_gemm_interface_apis.h"
+#include "lpgemm_config.h"
 #include "lpgemm_utils.h"
 
 AOCL_GEMM_GET_REORDER_BUF_SIZE(f32f32f32of32)
@@ -43,16 +44,20 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(f32f32f32of32)
 		return 0; // Error.
 	}
 
-	// Check if avx ISA is supported, lpgemm fp32 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm fp32 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform f32f32f32 gemm.", __FILE__, __LINE__ );
 		return 0; // Error.
 	}
 
 	/* Initialize BLIS. */
 	bli_init_auto();
 
+	// Initialize lpgemm context.
+	aocl_lpgemm_init_global_cntx();
+
 	// Query the global cntx.
 	cntx_t* cntx = bli_gks_query_cntx();
 
@@ -85,16 +90,20 @@ AOCL_GEMM_REORDER(float,f32f32f32of32)
 		return; // Error.
 	}
 
-	// Check if avx ISA is supported, lpgemm fp32 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm fp32 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform f32f32f32 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
 	/* Initialize BLIS. */
 	bli_init_auto();
 
+	// Initialize lpgemm context.
+	aocl_lpgemm_init_global_cntx();
+
 	// Query the global cntx.
 	cntx_t* cntx = bli_gks_query_cntx();
 
@@ -122,7 +131,7 @@ AOCL_GEMM_REORDER(float,f32f32f32of32)
 	float* restrict kappa_cast = &one_local;
 
 	// Set the schema to "row stored column panels" to indicate packing to
-	// conventional column-stored row panels.
+	// conventional row-stored column panels.
 	pack_t schema = BLIS_PACKED_COL_PANELS;
 	trans_t transc = BLIS_NO_TRANSPOSE;
 	conj_t conjc = bli_extract_conj( transc );
diff --git a/addon/aocl_gemm/aocl_gemm_interface_apis.h b/addon/aocl_gemm/aocl_gemm_interface_apis.h
index 40101cbe6a..718c0c3de2 100644
--- a/addon/aocl_gemm/aocl_gemm_interface_apis.h
+++ b/addon/aocl_gemm/aocl_gemm_interface_apis.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -51,6 +51,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(f32f32f32of32);
 AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s32os32);
 AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s16os16);
 AOCL_GEMM_GET_REORDER_BUF_SIZE(bf16bf16f32of32);
+AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s32os32);
+AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s16os16);
 
 // Performs reordering of input matrix. Reordering is the process of packing
 // the entire matrix upfront, so that the benefits of packed matrix is obtained
@@ -70,6 +72,8 @@ AOCL_GEMM_REORDER(float,f32f32f32of32);
 AOCL_GEMM_REORDER(int8_t,u8s8s32os32);
 AOCL_GEMM_REORDER(int8_t,u8s8s16os16);
 AOCL_GEMM_REORDER(bfloat16,bf16bf16f32of32);
+AOCL_GEMM_REORDER(int8_t,s8s8s32os32);
+AOCL_GEMM_REORDER(int8_t,s8s8s16os16);
 
 // Only supports matrices in row major format. This api can perform gemm with
 // both normal as well as reordered B matrix as opposesd to sgemm (only
@@ -103,5 +107,9 @@ AOCL_GEMM_MATMUL(bfloat16,bfloat16,float,float,bf16bf16f32of32);
 AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8);
 AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8);
 AOCL_GEMM_MATMUL(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16);
+AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32);
+AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8);
+AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16);
+AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8);
 
 #endif // AOCL_GEMM_INTERFACE_H
diff --git a/addon/aocl_gemm/aocl_gemm_post_ops.h b/addon/aocl_gemm/aocl_gemm_post_ops.h
index 86034598ac..70084e741a 100644
--- a/addon/aocl_gemm/aocl_gemm_post_ops.h
+++ b/addon/aocl_gemm/aocl_gemm_post_ops.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,6 +41,9 @@ typedef enum
 {
 	RELU = 0,
 	PRELU = 1,
+	GELU_TANH = 2,
+	GELU_ERF = 3,
+	CLIP = 4,
 } AOCL_ELT_ALGO_TYPE;
 
 typedef enum
@@ -81,7 +84,7 @@ typedef struct
 typedef struct
 {
 	aocl_post_op_sum sum;
-	aocl_post_op_eltwise eltwise;
+	aocl_post_op_eltwise* eltwise; //Multiple eltwise allowed.
 	aocl_post_op_bias bias;
 
 	// eg: seq_length = 2
diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c
new file mode 100644
index 0000000000..ca5ee12fc2
--- /dev/null
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os16.c
@@ -0,0 +1,170 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_config.h"
+#include "lpgemm_thread_decor_openmp.h"
+#include "lpgemm_post_ops.h"
+#include "lpgemm_utils_s8.h"
+
+AOCL_GEMM_MATMUL(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
+{
+	trans_t blis_transa;
+	trans_t blis_transb;
+
+	// Check if AVX2 ISA is supported, lpgemm s8s8s16os16 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform s8s8s16 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	// Null check for pointers.
+	if ((a == NULL) || (b == NULL) || (c == NULL))
+	{
+		return; // Error.
+	}
+
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
+	bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
+	bli_param_map_netlib_to_blis_trans(transb, &blis_transb);
+
+	/* Perform BLAS parameter checking. */
+	// Transpose not supported.
+	if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
+		 ( blis_transb != BLIS_NO_TRANSPOSE ) )
+	{
+		return; // Error.
+	}
+
+	// Sanitize order input.
+	char order_use =
+			( ( order == 'r' ) || ( order == 'R' ) ||
+			  ( order == 'c' ) || ( order == 'C' ) ) ?
+			order : 'r';
+	if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
+	{
+		return; // Only row major supported.
+	}
+
+	// Row major input expected with leading dimensions equal to row stride.
+	if ((lda != k) || (ldb != n) || (ldc != n))
+	{
+		return; // Error.
+	}
+
+	// Check if dimensions are valid.
+	if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
+	{
+		return; // Error.
+	}
+
+	const inc_t rs_a = lda;
+	const inc_t cs_a = 1;
+	const inc_t rs_b = ldb;
+	const inc_t cs_b = 1;
+	const inc_t rs_c = ldc;
+	const inc_t cs_c = 1;
+
+	AOCL_MEMORY_TAG mtag_a;
+	AOCL_MEMORY_TAG mtag_b;
+
+	bli_param_map_char_to_lpmtag(mem_format_a, &mtag_a);
+	bli_param_map_char_to_lpmtag(mem_format_b, &mtag_b);
+
+	// B matrix needs to be packed in a certain format in order to be loaded
+	// and used in VNNI instruction. As such the mtag_b always needs to be either
+	// packed or reordered. B matrix as it is (unpacked) cannot be used, and
+	// the mtag_b is set to packed to enable runtime packing.
+	if (mtag_b == UNPACKED)
+	{
+		mtag_b = PACK;
+	}
+
+	// Only unpacked A supported now.
+	if (mtag_a != UNPACKED)
+	{
+		return; // Error.
+	}
+
+	// Convert post op struct to post op linked list format.
+	lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
+	lpgemm_translate_to_post_ops_list
+	(
+	  post_op_unparsed, post_op_list,
+	  ( void* )c, ( void* )( &order_use )
+	);
+
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global(&rntm_g);
+	bli_membrk_rntm_set_membrk(&rntm_g);
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
+
+#ifdef BLIS_ENABLE_OPENMP
+	lpgemm_s8s8s16o16_openmp_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, FALSE
+	);
+#else
+	lpgemm_s8s8s16o16_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, FALSE
+	);
+#endif
+}
diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os16_utils.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os16_utils.c
new file mode 100644
index 0000000000..92a2663944
--- /dev/null
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os16_utils.c
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils_s8.h"
+#include "lpgemm_reorder_s8s16.h"
+
+AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s16os16) // Bytes needed for the reordered B buffer; 0 on any error.
+{
+	if ((k <= 0) || (n <= 0))
+	{
+		return 0; // Error: non-positive dimension.
+	}
+
+	// Check if AVX2 ISA is supported (avx2fma3 cpuid query), lpgemm s8s8s16os16 only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform s8s8s16 gemm.", __FILE__, __LINE__ );
+		return 0; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type(mat_type, &input_mat_type);
+
+	if (input_mat_type == A_MATRIX)
+	{
+		return 0; // A reorder not supported.
+	}
+
+	// Extra space since packing does width in multiples of 16. The vpmaddubsw
+	// instruction can be used as long as at least one ymm register can be fully
+	// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
+	// should give 2x16=32 elements, enough for 1 ymm register. The padding is not
+	// rounded to NR, to avoid memory wastage. NOTE(review): NR was stated as 16; reorder is "nr32" - confirm.
+	dim_t n_reorder = make_multiple_of_n(n, 16);
+
+	// Extra space since packing does length in multiples of 2.
+	dim_t k_reorder = make_multiple_of_n(k, 2);
+
+	// Extra memory of n_reorder * sizeof( int16_t ) to store sum of every column of B matrix buffer
+    siz_t size_req = sizeof(int8_t) * k_reorder * n_reorder + ( n_reorder * sizeof( int16_t ));
+
+	return size_req;
+}
+
+AOCL_GEMM_REORDER(int8_t,s8s8s16os16) // Validates inputs, then hands B to aocl_reorderb_nr32_s8s8s16o16.
+{
+	if ((input_buf_addr == NULL) || (reorder_buf_addr == NULL) ||
+		(k <= 0) || (n <= 0) || (ldb < n))
+	{
+		return; // Error: null buffer, non-positive dimension, or ldb smaller than n.
+	}
+
+	// Check if AVX2 ISA is supported (avx2fma3 cpuid query), lpgemm s8s8s16os16 only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform s8s8s16 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type(mat_type, &input_mat_type);
+
+	if (input_mat_type == A_MATRIX)
+	{
+		return; // A reorder not supported.
+	}
+
+	// Initialize a local runtime object from the global settings, then call
+	// bli_membrk_rntm_set_membrk on it; it is passed to the reorder routine below.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global(&rntm_g);
+	bli_membrk_rntm_set_membrk(&rntm_g);
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 );
+
+	// Destination descriptor: only the buffer pointer is set; other fields stay uninitialized.
+	lpgemm_obj_t b_reorder;
+	b_reorder.storage.aligned_buffer = reorder_buf_addr;
+
+	// Source B descriptor: buffer, row stride (ldb), width (n) and length (k).
+	lpgemm_obj_t b;
+	b.storage.aligned_buffer = (void *)input_buf_addr;
+	b.rs = ldb;
+	b.width = n;
+	b.length = k;
+
+	aocl_reorderb_nr32_s8s8s16o16( &b, &b_reorder, &rntm_g, lcntx_g );
+}
diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c
new file mode 100644
index 0000000000..a036612c82
--- /dev/null
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s16os8.c
@@ -0,0 +1,170 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_config.h"
+#include "lpgemm_thread_decor_openmp.h"
+#include "lpgemm_post_ops.h"
+#include "lpgemm_utils_s8.h"
+
+AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int16_t,s8s8s16os8) // Validates arguments, then dispatches to the s8s8s16o16 thread decorator.
+{
+	trans_t blis_transa;
+	trans_t blis_transb;
+
+	// Check if AVX2 ISA is supported, lpgemm s8s8s16os8 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform s8s8s16 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	// Null check for pointers.
+	if ((a == NULL) || (b == NULL) || (c == NULL))
+	{
+		return; // Error.
+	}
+
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
+	bli_param_map_netlib_to_blis_trans(transa, &blis_transa);
+	bli_param_map_netlib_to_blis_trans(transb, &blis_transb);
+
+	/* Perform BLAS parameter checking. */
+	// Transpose not supported.
+	if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
+		 ( blis_transb != BLIS_NO_TRANSPOSE ) )
+	{
+		return; // Error.
+	}
+
+	// Sanitize order input: anything other than r/R/c/C falls back to 'r'.
+	char order_use =
+			( ( order == 'r' ) || ( order == 'R' ) ||
+			  ( order == 'c' ) || ( order == 'C' ) ) ?
+			order : 'r';
+	if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
+	{
+		return; // Only row major supported.
+	}
+
+	// Row major input expected; leading dimensions must match exactly (lda == k, ldb == n, ldc == n).
+	if ((lda != k) || (ldb != n) || (ldc != n))
+	{
+		return; // Error.
+	}
+
+	// Check if dimensions are valid (all strictly positive).
+	if ((m <= 0) || (n <= 0) || (k <= 0) || (lda <= 0) || (ldb <= 0) || (ldc <= 0))
+	{
+		return; // Error.
+	}
+
+	const inc_t rs_a = lda;
+	const inc_t cs_a = 1;
+	const inc_t rs_b = ldb;
+	const inc_t cs_b = 1;
+	const inc_t rs_c = ldc;
+	const inc_t cs_c = 1;
+
+	AOCL_MEMORY_TAG mtag_a;
+	AOCL_MEMORY_TAG mtag_b;
+
+	bli_param_map_char_to_lpmtag(mem_format_a, &mtag_a);
+	bli_param_map_char_to_lpmtag(mem_format_b, &mtag_b);
+
+	// B matrix needs to be packed in a certain format in order to be loaded
+	// and used in VNNI instruction. As such the mtag_b always needs to be either
+	// packed or reordered. B matrix as it is (unpacked) cannot be used, and
+	// the mtag_b is set to packed to enable runtime packing.
+	if (mtag_b == UNPACKED)
+	{
+		mtag_b = PACK;
+	}
+
+	// Only unpacked A supported now.
+	if (mtag_a != UNPACKED)
+	{
+		return; // Error.
+	}
+
+	// Convert post op struct to post op linked list format.
+	lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
+	lpgemm_translate_to_post_ops_list
+	(
+	  post_op_unparsed, post_op_list,
+	  ( void* )c, ( void* )( &order_use )
+	);
+
+	// Initialize a local runtime object from the global settings, then call
+	// bli_membrk_rntm_set_membrk on it; it is passed to the thread decorator below.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global(&rntm_g);
+	bli_membrk_rntm_set_membrk(&rntm_g);
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S16OS16 ); // Uses the S8S8S16OS16 context id.
+
+#ifdef BLIS_ENABLE_OPENMP
+	lpgemm_s8s8s16o16_openmp_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  ( int16_t* )c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, TRUE // NOTE(review): presumably requests int8 (downscaled) output - confirm.
+	);
+#else
+	lpgemm_s8s8s16o16_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  ( int16_t* )c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, TRUE // NOTE(review): presumably requests int8 (downscaled) output - confirm.
+	);
+#endif
+}
diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c
new file mode 100644
index 0000000000..b9ddecdba5
--- /dev/null
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os32.c
@@ -0,0 +1,171 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_post_ops.h"
+#include "lpgemm_thread_decor_openmp.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils_s8.h"
+
+AOCL_GEMM_MATMUL(int8_t,int8_t,int32_t,int32_t,s8s8s32os32) // Validates arguments, then dispatches to the s8s8s32o32 thread decorator.
+{
+	trans_t blis_transa;
+	trans_t blis_transb;
+
+	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
+	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform s8s8s32 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	// Null check for pointers.
+	if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
+	{
+		return; // Error.
+	}
+
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
+	bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
+	bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
+
+	/* Perform BLAS parameter checking. */
+	// Transpose not supported.
+	if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
+	     ( blis_transb != BLIS_NO_TRANSPOSE ) )
+	{
+		return; // Error.
+	}
+
+	// Sanitize order input: anything other than r/R/c/C falls back to 'r'.
+	char order_use =
+			( ( order == 'r' ) || ( order == 'R' ) ||
+			  ( order == 'c' ) || ( order == 'C' ) ) ?
+			order : 'r';
+	if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
+	{
+		return; // Only row major supported.
+	}
+
+	// Row major input expected; leading dimensions must match exactly (lda == k, ldb == n, ldc == n).
+	if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
+	{
+		return; // Error.
+	}
+
+	// Check if dimensions are valid (all strictly positive).
+	if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
+	     ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	const inc_t rs_a = lda;
+	const inc_t cs_a = 1;
+	const inc_t rs_b = ldb;
+	const inc_t cs_b = 1;
+	const inc_t rs_c = ldc;
+	const inc_t cs_c = 1;
+
+	AOCL_MEMORY_TAG mtag_a;
+	AOCL_MEMORY_TAG mtag_b;
+
+	bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
+	bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
+
+	// B matrix needs to be packed in a certain format in order to be loaded
+	// and used in VNNI instruction. As such the mtag_b always needs to be either
+	// packed or reordered. B matrix as it is (unpacked) cannot be used, and
+	// the mtag_b is set to packed to enable runtime packing.
+	if ( mtag_b == UNPACKED )
+	{
+		mtag_b = PACK;
+	}
+
+	// Only unpacked A supported now.
+	if ( mtag_a != UNPACKED )
+	{
+		return; // Error.
+	}
+
+	// Convert post op struct to post op linked list format.
+	lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
+	lpgemm_translate_to_post_ops_list
+	(
+	  post_op_unparsed, post_op_list,
+	  ( void* )c, ( void* )( &order_use )
+	);
+
+	// Initialize a local runtime object from the global settings, then call
+	// bli_membrk_rntm_set_membrk on it; it is passed to the thread decorator below.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_membrk_rntm_set_membrk( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
+
+#ifdef BLIS_ENABLE_OPENMP
+	lpgemm_s8s8s32o32_openmp_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, FALSE
+	);
+#else
+	lpgemm_s8s8s32o32_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, FALSE
+	);
+#endif
+}
diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os32_utils.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os32_utils.c
new file mode 100644
index 0000000000..4c41d8e184
--- /dev/null
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os32_utils.c
@@ -0,0 +1,137 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils_s8.h"
+#include "lpgemm_reorder_s8.h"
+
+AOCL_GEMM_GET_REORDER_BUF_SIZE(s8s8s32os32) // Bytes needed for the reordered B buffer; 0 on any error.
+{
+	if ( ( k <= 0 ) || ( n <= 0 ) )
+	{
+		return 0; // Error: non-positive dimension.
+	}
+
+	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
+	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform s8s8s32 gemm.", __FILE__, __LINE__ );
+		return 0; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
+
+	if ( input_mat_type == A_MATRIX )
+	{
+		return 0; // A reorder not supported.
+	}
+
+	// Extra space since packing does width in multiples of 16. The vnni
+	// instruction can be used as long as at least one zmm register can be fully
+	// loaded; and since k_dim needs to be at least 4, having n_dim at least 16
+	// should give 4x16=64 elements, enough for 1 zmm register. The padding is
+	// not rounded to NR (=64), since that would result in memory wastage.
+	dim_t n_reorder = make_multiple_of_n( n, 16 );
+
+	// Extra space since packing does length in multiples of 4.
+	dim_t k_reorder = make_multiple_of_n( k, 4 );
+
+	// Extra memory of n_reorder * sizeof( int32_t ) to store sum of every column of B matrix buffer
+	siz_t size_req = sizeof( int8_t ) * k_reorder * n_reorder + ( n_reorder * sizeof( int32_t ) );
+
+	return size_req;
+}
+
+AOCL_GEMM_REORDER(int8_t,s8s8s32os32) // Validates inputs, then hands B to reorderb_nr64_s8s8s32o32.
+{
+	if ( ( input_buf_addr == NULL ) || ( reorder_buf_addr == NULL ) ||
+	     ( k <= 0 ) || ( n <= 0 ) || ( ldb < n ) )
+	{
+		return; // Error: null buffer, non-positive dimension, or ldb smaller than n.
+	}
+
+	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
+	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform s8s8s32 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	AOCL_MATRIX_TYPE input_mat_type;
+	bli_param_map_char_to_lpmat_type( mat_type, &input_mat_type );
+
+	if ( input_mat_type == A_MATRIX )
+	{
+		return; // A reorder not supported.
+	}
+
+	// Initialize a local runtime object from the global settings, then call
+	// bli_membrk_rntm_set_membrk on it; it is passed to the reorder routine below.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_membrk_rntm_set_membrk( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 );
+
+	// Destination descriptor: only the buffer pointer is set; other fields stay uninitialized.
+	lpgemm_obj_t b_reorder;
+	b_reorder.storage.aligned_buffer = reorder_buf_addr;
+
+	// Source B descriptor: buffer, row stride (ldb), width (n) and length (k).
+	lpgemm_obj_t b;
+	b.storage.aligned_buffer = ( void* )input_buf_addr;
+	b.rs = ldb;
+	b.width = n;
+	b.length = k;
+
+	reorderb_nr64_s8s8s32o32( &b, &b_reorder, &rntm_g, lcntx_g );
+}
diff --git a/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c
new file mode 100644
index 0000000000..7abc392a4e
--- /dev/null
+++ b/addon/aocl_gemm/aocl_gemm_s8s8s32os8.c
@@ -0,0 +1,171 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_gemm_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_post_ops.h"
+#include "lpgemm_thread_decor_openmp.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils_s8.h"
+
+AOCL_GEMM_MATMUL(int8_t,int8_t,int8_t,int32_t,s8s8s32os8) // Validates arguments, then dispatches to the s8s8s32o32 thread decorator.
+{
+	trans_t blis_transa;
+	trans_t blis_transb;
+
+	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
+	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
+	{
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform s8s8s32 gemm.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Set MC, NC, KC, NR, MR.
+	aocl_lpgemm_init_global_cntx();
+
+	// Null check for pointers.
+	if ( ( a == NULL ) || ( b == NULL ) || ( c == NULL ) )
+	{
+		return; // Error.
+	}
+
+	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
+	bli_param_map_netlib_to_blis_trans( transa, &blis_transa );
+	bli_param_map_netlib_to_blis_trans( transb, &blis_transb );
+
+	/* Perform BLAS parameter checking. */
+	// Transpose not supported.
+	if ( ( blis_transa != BLIS_NO_TRANSPOSE ) ||
+	     ( blis_transb != BLIS_NO_TRANSPOSE ) )
+	{
+		return; // Error.
+	}
+
+	// Sanitize order input: anything other than r/R/c/C falls back to 'r'.
+	char order_use =
+			( ( order == 'r' ) || ( order == 'R' ) ||
+			  ( order == 'c' ) || ( order == 'C' ) ) ?
+			order : 'r';
+	if ( ( order_use != 'r' ) && ( order_use != 'R' ) )
+	{
+		return; // Only row major supported.
+	}
+
+	// Row major input expected; leading dimensions must match exactly (lda == k, ldb == n, ldc == n).
+	if ( ( lda != k ) || ( ldb != n ) || ( ldc != n ) )
+	{
+		return; // Error.
+	}
+
+	// Check if dimensions are valid (all strictly positive).
+	if ( ( m <= 0) || ( n <= 0 ) || ( k <= 0 ) ||
+	     ( lda <= 0 ) || ( ldb <= 0 ) || ( ldc <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	const inc_t rs_a = lda;
+	const inc_t cs_a = 1;
+	const inc_t rs_b = ldb;
+	const inc_t cs_b = 1;
+	const inc_t rs_c = ldc;
+	const inc_t cs_c = 1;
+
+	AOCL_MEMORY_TAG mtag_a;
+	AOCL_MEMORY_TAG mtag_b;
+
+	bli_param_map_char_to_lpmtag( mem_format_a, &mtag_a );
+	bli_param_map_char_to_lpmtag( mem_format_b, &mtag_b );
+
+	// B matrix needs to be packed in a certain format in order to be loaded
+	// and used in VNNI instruction. As such the mtag_b always needs to be either
+	// packed or reordered. B matrix as it is (unpacked) cannot be used, and
+	// the mtag_b is set to packed to enable runtime packing.
+	if ( mtag_b == UNPACKED )
+	{
+		mtag_b = PACK;
+	}
+
+	// Only unpacked A supported now.
+	if ( mtag_a != UNPACKED )
+	{
+		return; // Error.
+	}
+
+	// Convert post op struct to post op linked list format.
+	lpgemm_post_op post_op_list[AOCL_MAX_POST_OPS];
+	lpgemm_translate_to_post_ops_list
+	(
+	  post_op_unparsed, post_op_list,
+	  ( void* )c, ( void* )( &order_use )
+	);
+
+	// Initialize a local runtime object from the global settings, then call
+	// bli_membrk_rntm_set_membrk on it; it is passed to the thread decorator below.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_membrk_rntm_set_membrk( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( S8S8S32OS32 ); // Uses the S8S8S32OS32 context id.
+
+#ifdef BLIS_ENABLE_OPENMP
+	lpgemm_s8s8s32o32_openmp_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  ( int32_t* )c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, TRUE // NOTE(review): presumably requests int8 (downscaled) output - confirm.
+	);
+#else
+	lpgemm_s8s8s32o32_thread_decorator
+	(
+	  m, n, k,
+	  a, rs_a, cs_a, mtag_a,
+	  b, rs_b, cs_b, mtag_b,
+	  ( int32_t* )c, rs_c, cs_c,
+	  alpha, beta,
+	  &rntm_g, lcntx_g,
+	  post_op_list, TRUE // NOTE(review): presumably requests int8 (downscaled) output - confirm.
+	);
+#endif
+}
diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c
index 1c6b0899ad..f851a283d5 100644
--- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -46,10 +46,11 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
 	trans_t blis_transa;
 	trans_t blis_transb;
 
-	// Check if avx ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform u8s8s16 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -141,6 +142,8 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
 	bli_rntm_init_from_global(&rntm_g);
 	bli_membrk_rntm_set_membrk(&rntm_g);
 
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
+
 #ifdef BLIS_ENABLE_OPENMP
 	lpgemm_u8s8s16o16_openmp_thread_decorator
 	(
@@ -149,7 +152,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
 	  b, rs_b, cs_b, mtag_b,
 	  c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, FALSE
 	);
 #else
@@ -160,7 +163,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
 	  b, rs_b, cs_b, mtag_b,
 	  c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, FALSE
 	);
 #endif
diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c
index 5cadd206d5..98d8828f22 100644
--- a/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os16_utils.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -46,10 +46,11 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s16os16)
 		return 0; // Error.
 	}
 
-	// Check if avx ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform u8s8s16 gemm.", __FILE__, __LINE__ );
 		return 0; // Error.
 	}
 
@@ -68,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s16os16)
 	}
 
 	// Extra space since packing does width in multiples of 16. The vpmaddubsw
-	// instruction can be used as long as atleast one ymm register can be fully
-	// loaded; and since k_dim needs to be at least 2, having n_dim atleast 16
+	// instruction can be used as long as at least one ymm register can be fully
+	// loaded; and since k_dim needs to be at least 2, having n_dim at least 16
 	// should give 2x16=32 elements, enough for 1 ymm register.The padding is
 	// not rounded to NR (=16), since that would result in memory wastage.
 	dim_t n_reorder = make_multiple_of_n(n, 16);
@@ -90,10 +91,11 @@ AOCL_GEMM_REORDER(int8_t,u8s8s16os16)
 		return; // Error.
 	}
 
-	// Check if avx ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform u8s8s16 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -111,6 +113,14 @@ AOCL_GEMM_REORDER(int8_t,u8s8s16os16)
 		return; // A reorder not supported.
 	}
 
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global(&rntm_g);
+	bli_membrk_rntm_set_membrk(&rntm_g);
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
+
 	// Create dummy b_reorder obj.
 	lpgemm_obj_t b_reorder;
 	b_reorder.storage.aligned_buffer = reorder_buf_addr;
@@ -122,5 +132,5 @@ AOCL_GEMM_REORDER(int8_t,u8s8s16os16)
 	b.width = n;
 	b.length = k;
 
-	aocl_reorderb_nr32_u8s8s16o16(&b, &b_reorder);
+	aocl_reorderb_nr32_u8s8s16o16( &b, &b_reorder, &rntm_g, lcntx_g );
 }
diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c
index fed10c1e01..c4ca0ac572 100644
--- a/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s16os8.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -46,10 +46,11 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
 	trans_t blis_transa;
 	trans_t blis_transb;
 
-	// Check if avx ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
-	if ( bli_cpuid_is_avx_supported() == FALSE )
+	// Check if AVX2 ISA is supported, lpgemm u8s8s16os16 matmul only works with it.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
 	{
-		printf(" AVX2 ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX2 ISA not supported by processor, "
+				"cannot perform u8s8s16 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -141,6 +142,8 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
 	bli_rntm_init_from_global(&rntm_g);
 	bli_membrk_rntm_set_membrk(&rntm_g);
 
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S16OS16 );
+
 #ifdef BLIS_ENABLE_OPENMP
 	lpgemm_u8s8s16o16_openmp_thread_decorator
 	(
@@ -149,7 +152,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
 	  b, rs_b, cs_b, mtag_b,
 	  ( int16_t* )c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, TRUE
 	);
 #else
@@ -160,7 +163,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int16_t,u8s8s16os8)
 	  b, rs_b, cs_b, mtag_b,
 	  ( int16_t* )c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, TRUE
 	);
 #endif
diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c
index 39fd49bca4..5580001d69 100644
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -49,7 +49,8 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
 	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
 	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
 	{
-		printf(" AVX512_VNNI ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform u8s8s32 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -142,6 +143,8 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
 	bli_rntm_init_from_global( &rntm_g );
 	bli_membrk_rntm_set_membrk( &rntm_g );
 
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
+
 #ifdef BLIS_ENABLE_OPENMP
 	lpgemm_u8s8s32o32_openmp_thread_decorator
 	(
@@ -150,7 +153,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
 	  b, rs_b, cs_b, mtag_b,
 	  c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, FALSE
 	);
 #else
@@ -161,7 +164,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
 	  b, rs_b, cs_b, mtag_b,
 	  c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, FALSE
 	);
 #endif
diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c
index 11f9f6937a..20f0b322d9 100644
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os32_utils.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -49,7 +49,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s32os32)
 	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
 	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
 	{
-		printf(" AVX512_VNNI ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform u8s8s32 gemm.", __FILE__, __LINE__ );
 		return 0; // Error.
 	}
 
@@ -68,8 +69,8 @@ AOCL_GEMM_GET_REORDER_BUF_SIZE(u8s8s32os32)
 	}
 
 	// Extra space since packing does width in multiples of 16. The vnni
-	// instruction can be used as long as atleast one zmm register can be fully
-	// loaded; and since k_dim needs to be atleast 4, having n_dim atleast 16
+	// instruction can be used as long as at least one zmm register can be fully
+	// loaded; and since k_dim needs to be at least 4, having n_dim at least 16
 	// should give 4x16=64 elements, enough for 1 zmm register.The padding is
 	// not rounded to NR (=64), since that would result in memory wastage.
 	dim_t n_reorder = make_multiple_of_n( n, 16 );
@@ -93,7 +94,8 @@ AOCL_GEMM_REORDER(int8_t,u8s8s32os32)
 	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
 	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
 	{
-		printf(" AVX512_VNNI ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform u8s8s32 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -111,6 +113,14 @@ AOCL_GEMM_REORDER(int8_t,u8s8s32os32)
 		return; // A reorder not supported.
 	}
 
+	// Initialize a local runtime with global settings if necessary. Note
+	// that in the case that a runtime is passed in, we make a local copy.
+	rntm_t rntm_g;
+	bli_rntm_init_from_global( &rntm_g );
+	bli_membrk_rntm_set_membrk( &rntm_g );
+
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
+
 	// Create dummy b_reorder obj.
 	lpgemm_obj_t b_reorder;
 	b_reorder.storage.aligned_buffer = reorder_buf_addr;
@@ -122,5 +132,5 @@ AOCL_GEMM_REORDER(int8_t,u8s8s32os32)
 	b.width = n;
 	b.length = k;
 
-	reorderb_nr64_u8s8s32o32( &b, &b_reorder );
+	reorderb_nr64_u8s8s32o32( &b, &b_reorder, &rntm_g, lcntx_g );
 }
diff --git a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c
index e4a4ce3f2d..55f062ee8f 100644
--- a/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c
+++ b/addon/aocl_gemm/aocl_gemm_u8s8s32os8.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -49,7 +49,8 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 	// Check if avx512_vnni ISA is supported, lpgemm matmul only works with it.
 	if ( bli_cpuid_is_avx512vnni_supported() == FALSE )
 	{
-		printf(" AVX512_VNNI ISA not supported by processor, cannot perform lpgemm.\n");
+		bli_print_msg(" AVX512_VNNI ISA not supported by processor, "
+				"cannot perform u8s8s32 gemm.", __FILE__, __LINE__ );
 		return; // Error.
 	}
 
@@ -142,6 +143,8 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 	bli_rntm_init_from_global( &rntm_g );
 	bli_membrk_rntm_set_membrk( &rntm_g );
 
+	lpgemm_cntx_t* lcntx_g = lpgemm_get_global_cntx_obj( U8S8S32OS32 );
+
 #ifdef BLIS_ENABLE_OPENMP
 	lpgemm_u8s8s32o32_openmp_thread_decorator
 	(
@@ -150,7 +153,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 	  b, rs_b, cs_b, mtag_b,
 	  ( int32_t* )c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, TRUE
 	);
 #else
@@ -161,7 +164,7 @@ AOCL_GEMM_MATMUL(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 	  b, rs_b, cs_b, mtag_b,
 	  ( int32_t* )c, rs_c, cs_c,
 	  alpha, beta,
-	  &rntm_g,
+	  &rntm_g, lcntx_g,
 	  post_op_list, TRUE
 	);
 #endif
diff --git a/addon/aocl_gemm/aocl_util_interface_apis.h b/addon/aocl_gemm/aocl_util_interface_apis.h
new file mode 100644
index 0000000000..d2983b8a64
--- /dev/null
+++ b/addon/aocl_gemm/aocl_util_interface_apis.h
@@ -0,0 +1,50 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef AOCL_UTIL_INTERFACE_H
+#define AOCL_UTIL_INTERFACE_H
+
+#define AOCL_UTIL_L1_OP(V_type,OP_type) /* signature of aocl_<OP_type>( n, x, incx ) */ \
+BLIS_EXPORT_ADDON void aocl_ ## OP_type \
+     ( \
+       const dim_t n, \
+       V_type*     x, \
+       const inc_t incx \
+     ) \
+
+AOCL_UTIL_L1_OP(float,gelu_tanh_f32); // declares aocl_gelu_tanh_f32
+AOCL_UTIL_L1_OP(float,gelu_erf_f32); // declares aocl_gelu_erf_f32
+AOCL_UTIL_L1_OP(float,softmax_f32); // declares aocl_softmax_f32
+
+#endif //AOCL_UTIL_INTERFACE_H
diff --git a/addon/aocl_gemm/aocl_util_l1_ops.c b/addon/aocl_gemm/aocl_util_l1_ops.c
new file mode 100644
index 0000000000..11a4b83078
--- /dev/null
+++ b/addon/aocl_gemm/aocl_util_l1_ops.c
@@ -0,0 +1,114 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "aocl_util_interface_apis.h"
+#include "lpgemm_types.h"
+#include "lpgemm_config.h"
+#include "lpgemm_utils_kernels.h"
+
+AOCL_UTIL_L1_OP(float,gelu_tanh_f32)
+{
+	// Bail out early when the CPU does not report AVX2/FMA3 support.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, AOCL GEMM "
+					"utility l1 operations not supported.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Populate the global lpgemm and util contexts (kernel pointers, block sizes).
+	aocl_lpgemm_init_global_cntx();
+
+	if ( ( n <= 0 ) || ( x == NULL ) || ( incx <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	lpgemm_util_cntx_t* lutil_cntx_g = lpgemm_util_get_global_cntx_obj( F32_GELU_TANH ); // context of this op
+	( ( lpgemm_util_l1_op_f32_kernel_t )lutil_cntx_g->kern_fun_ptr )( n, x, incx ); // call the registered kernel
+}
+
+AOCL_UTIL_L1_OP(float,gelu_erf_f32)
+{
+	// Bail out early when the CPU does not report AVX2/FMA3 support.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, AOCL GEMM "
+					"utility l1 operations not supported.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Populate the global lpgemm and util contexts (kernel pointers, block sizes).
+	aocl_lpgemm_init_global_cntx();
+
+	if ( ( n <= 0 ) || ( x == NULL ) || ( incx <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	lpgemm_util_cntx_t* lutil_cntx_g = lpgemm_util_get_global_cntx_obj( F32_GELU_ERF ); // context of this op
+	( ( lpgemm_util_l1_op_f32_kernel_t )lutil_cntx_g->kern_fun_ptr )( n, x, incx ); // call the registered kernel
+}
+
+AOCL_UTIL_L1_OP(float,softmax_f32)
+{
+	// Bail out early when the CPU does not report AVX2/FMA3 support.
+	if ( bli_cpuid_is_avx2fma3_supported() == FALSE )
+	{
+		bli_print_msg(" AVX2 ISA not supported by processor, AOCL GEMM "
+					"utility l1 operations not supported.", __FILE__, __LINE__ );
+		return; // Error.
+	}
+
+	/* Initialize BLIS. */
+	bli_init_auto();
+
+	// Populate the global lpgemm and util contexts (kernel pointers, block sizes).
+	aocl_lpgemm_init_global_cntx();
+
+	if ( ( n <= 0 ) || ( x == NULL ) || ( incx <= 0 ) )
+	{
+		return; // Error.
+	}
+
+	lpgemm_util_cntx_t* lutil_cntx_g = lpgemm_util_get_global_cntx_obj( F32_SOFTMAX ); // context of this op
+	( ( lpgemm_util_l1_op_f32_kernel_t )lutil_cntx_g->kern_fun_ptr )( n, x, incx ); // call the registered kernel
+}
diff --git a/addon/aocl_gemm/config/lpgemm_blksz_map.h b/addon/aocl_gemm/config/lpgemm_blksz_map.h
new file mode 100644
index 0000000000..9991a3eb70
--- /dev/null
+++ b/addon/aocl_gemm/config/lpgemm_blksz_map.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_BLKSZ_MAP_H
+#define LPGEMM_BLKSZ_MAP_H
+
+// Each XMACRO entry has the format ID,MC,NC,KC,MR,NR,PACKA_RS,PACKA_CS,PACKB_RS,PACKB_CS:
+// ID = one of the AOCL_OPERATION_TYPE enum values; the rest are block sizes and pack strides.
+
+#define LPGEMM_BLKSZ_MAP_ZEN4 /* one XMACRO row per operation type; expanded by the includer's XMACRO */ \
+	XMACRO(U8S8S16OS16, 252, 2048, 2048, 6, 32, 0, 0, 2*32, 32) \
+	XMACRO(U8S8S32OS32, 144, 1024, 2048, 6, 64, 4, 24, 4*64, 64) \
+	XMACRO(BF16BF16F32OF32, 144, 1024, 2048, 6, 64, 0, 0, 2*64, 64/2) \
+	XMACRO(S8S8S32OS32, 144, 1024, 2048, 6, 64, 4, 24, 4*64, 64) \
+  XMACRO(S8S8S16OS16, 252, 2048, 2048, 6, 32, 0, 0, 2*32, 32) \
+
+#define LPGEMM_BLKSZ_MAP_ZEN /* one XMACRO row per operation type; currently the same values as the ZEN4 table */ \
+	XMACRO(U8S8S16OS16, 252, 2048, 2048, 6, 32, 0, 0, 2*32, 32) \
+	XMACRO(U8S8S32OS32, 144, 1024, 2048, 6, 64, 4, 24, 4*64, 64) \
+	XMACRO(BF16BF16F32OF32, 144, 1024, 2048, 6, 64, 0, 0, 2*64, 64/2) \
+	XMACRO(S8S8S32OS32, 144, 1024, 2048, 6, 64, 4, 24, 4*64, 64) \
+  XMACRO(S8S8S16OS16, 252, 2048, 2048, 6, 32, 0, 0, 2*32, 32) \
+
+#endif //LPGEMM_BLKSZ_MAP_H
diff --git a/addon/aocl_gemm/config/lpgemm_config.c b/addon/aocl_gemm/config/lpgemm_config.c
new file mode 100644
index 0000000000..0dad8c88a7
--- /dev/null
+++ b/addon/aocl_gemm/config/lpgemm_config.c
@@ -0,0 +1,297 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "lpgemm_config.h"
+#include "lpgemm_func_map.h"
+#include "lpgemm_blksz_map.h"
+#include "lpgemm_kernels.h"
+#include "lpgemm_packb_bf16.h"
+#include "lpgemm_packb_s16.h"
+#include "lpgemm_packa.h"
+#include "lpgemm_packb.h"
+#include "lpgemm_packa_s8.h"
+#include "lpgemm_packb_s8.h"
+#include "lpgemm_packb_s8s16.h"
+
+static lpgemm_cntx_t global_cntx_t_list[AOCL_OPERATION_TYPE_LEN] \
+					__attribute__((aligned(64))); // One context per AOCL_OPERATION_TYPE id.
+static lpgemm_util_cntx_t global_util_cntx_t_list[AOCL_UTIL_OPERATION_TYPE_LEN] \
+					__attribute__((aligned(64))); // One context per util op id (post-ops like utils).
+
+static bli_pthread_once_t once_check_lpgemm_func_map_init = BLIS_PTHREAD_ONCE_INIT; // once-guard for lpgemm_cntx_init_map
+
+static void _lpgemm_util_cntx_init_func_map() // fills kern_fun_ptr of the util contexts
+{
+#define UMACRO(ID,FUNC_PTR) global_util_cntx_t_list[ID].kern_fun_ptr = FUNC_PTR;
+	// Reset every util op to NULL first so that an unmapped op stays NULL.
+	global_util_cntx_t_list[F32_GELU_TANH].kern_fun_ptr = NULL;
+	global_util_cntx_t_list[F32_GELU_ERF].kern_fun_ptr = NULL;
+	global_util_cntx_t_list[F32_SOFTMAX].kern_fun_ptr = NULL;
+	// Kernel dispatch object factory: the first ISA check that passes selects the map.
+	if ( bli_cpuid_is_avx512bf16_supported() == TRUE )
+	{
+#ifdef BLIS_KERNELS_ZEN4
+		LPGEMM_UTIL_KERN_FUNC_MAP_AVX512_VNNI_BF16
+#endif
+	}
+	else if ( bli_cpuid_is_avx512vnni_supported() == TRUE )
+	{
+#ifdef BLIS_KERNELS_ZEN4
+		LPGEMM_UTIL_KERN_FUNC_MAP_AVX512_VNNI
+#endif
+	}
+	else if ( bli_cpuid_is_avx2fma3_supported() == TRUE )
+	{
+#ifdef BLIS_KERNELS_ZEN3
+		LPGEMM_UTIL_KERN_FUNC_MAP_AVX2
+#endif
+	}
+
+#undef UMACRO
+}
+
+static void _lpgemm_cntx_init_func_map() // fills kernel / packA / packB pointers of global_cntx_t_list
+{
+#define KMACRO(ID,FUNC_PTR) global_cntx_t_list[ID].kern_fun_ptr = FUNC_PTR;
+#define PAMACRO(ID,FUNC_PTR) global_cntx_t_list[ID].packa_fun_ptr = FUNC_PTR;
+#define PBMACRO(ID,FUNC_PTR) global_cntx_t_list[ID].packb_fun_ptr = FUNC_PTR;
+
+	//TODO: Default initialize with reference kernels so that kernel pointer
+	// will be valid even in case none of the zen optimized kernels are
+	// available. This scenario could happen if the addon was built using
+	// a different arch config (eg: skx).
+
+	global_cntx_t_list[U8S8S16OS16].kern_fun_ptr = NULL;
+	global_cntx_t_list[U8S8S32OS32].kern_fun_ptr = NULL;
+	global_cntx_t_list[F32F32F32OF32].kern_fun_ptr = NULL;
+	global_cntx_t_list[BF16BF16F32OF32].kern_fun_ptr = NULL;
+
+	// Kernel dispatch object factory: the first ISA check that passes selects the maps.
+	if ( bli_cpuid_is_avx512bf16_supported() == TRUE )
+	{
+#ifdef BLIS_KERNELS_ZEN4
+		LPGEMM_KERN_FUNC_MAP_AVX512_VNNI_BF16
+		LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI_BF16
+		LPGEMM_PACKB_FUNC_MAP_AVX512_VNNI_BF16
+#endif
+	}
+	else if ( bli_cpuid_is_avx512vnni_supported() == TRUE )
+	{
+#ifdef BLIS_KERNELS_ZEN4
+		LPGEMM_KERN_FUNC_MAP_AVX512_VNNI
+		LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI
+		LPGEMM_PACKB_FUNC_MAP_AVX512_VNNI
+#endif
+	}
+	else if ( bli_cpuid_is_avx2fma3_supported() == TRUE )
+	{
+#ifdef BLIS_KERNELS_ZEN3
+		LPGEMM_KERN_FUNC_MAP_AVX2
+		LPGEMM_PACKA_FUNC_MAP_AVX2
+		LPGEMM_PACKB_FUNC_MAP_AVX2
+#endif
+	}
+	// If built with a config not supporting zen3/zen4/amdzen, error out since
+	// reference kernels are not available. Only the F32F32F32OF32 slot is probed.
+	if ( global_cntx_t_list[F32F32F32OF32].kern_fun_ptr == NULL )
+	{
+		bli_print_msg( "AOCL_GEMM is not compiled using correct Zen config."
+				" Compile using zen3/zen4/amdzen config.",
+				__FILE__, __LINE__ );
+		bli_abort();
+	}
+
+#undef PBMACRO
+#undef PAMACRO
+#undef KMACRO
+}
+
+BLIS_INLINE void lpgemm_set_block_sizes_global_cntx // stores the blocking parameters of op_type
+     (
+       AOCL_OPERATION_TYPE op_type,
+       dim_t MC,
+       dim_t NC,
+       dim_t KC,
+       dim_t MR,
+       dim_t NR
+     )
+{
+	global_cntx_t_list[op_type].blksz.MC = MC; // op_type indexes the global context table
+	global_cntx_t_list[op_type].blksz.NC = NC;
+	global_cntx_t_list[op_type].blksz.KC = KC;
+	global_cntx_t_list[op_type].blksz.MR = MR;
+	global_cntx_t_list[op_type].blksz.NR = NR;
+}
+
+BLIS_INLINE void lpgemm_set_pack_strides_global_cntx // stores the packA / packB strides of op_type
+     (
+       AOCL_OPERATION_TYPE op_type,
+       dim_t packa_rs,
+       dim_t packa_cs,
+       dim_t packb_rs,
+       dim_t packb_cs
+     )
+{
+	global_cntx_t_list[op_type].pack_s.packa_rs = packa_rs; // op_type indexes the global context table
+	global_cntx_t_list[op_type].pack_s.packa_cs = packa_cs;
+	global_cntx_t_list[op_type].pack_s.packb_rs = packb_rs;
+	global_cntx_t_list[op_type].pack_s.packb_cs = packb_cs;
+}
+
+static void _lpgemm_cntx_init_blksz_map() // loads block sizes and pack strides into global_cntx_t_list
+{
+#define XMACRO(ID,MC,NC,KC,MR,NR,PACKA_RS,PACKA_CS,PACKB_RS,PACKB_CS) \
+	lpgemm_set_block_sizes_global_cntx(ID, MC, NC, KC, MR, NR); \
+	lpgemm_set_pack_strides_global_cntx(ID, PACKA_RS, PACKA_CS, PACKB_RS, PACKB_CS);
+
+	// Ideally the blocksize needs to be set based on arch id. However
+	// since this code is also expected to work on other vendor machines,
+	// the blocksize for a particular version of zen id is generalized
+	// for all machines that support the ISA supported by that particular
+	// zen id.
+	if ( bli_cpuid_is_avx512vnni_supported() == TRUE )
+	{
+		LPGEMM_BLKSZ_MAP_ZEN4
+	}
+	else if ( bli_cpuid_is_avx2fma3_supported() == TRUE )
+	{
+		LPGEMM_BLKSZ_MAP_ZEN
+	}
+	else // neither ISA reported: same table as the AVX2 branch
+	{
+		LPGEMM_BLKSZ_MAP_ZEN
+	}
+
+#undef XMACRO
+}
+
+static void lpgemm_cntx_init_map() // init routine handed to bli_pthread_once
+{
+	_lpgemm_cntx_init_func_map(); // kernel and pack function pointers; aborts when no f32 kernel is mapped
+	_lpgemm_cntx_init_blksz_map(); // block sizes and pack strides
+	_lpgemm_util_cntx_init_func_map(); // util (l1 op) kernel pointers
+}
+
+// Initializes the global lpgemm and util contexts through bli_pthread_once.
+void aocl_lpgemm_init_global_cntx()
+{
+	bli_pthread_once
+	(
+	  &once_check_lpgemm_func_map_init,
+	  lpgemm_cntx_init_map
+	);
+}
+
+lpgemm_cntx_t* lpgemm_get_global_cntx_obj( AOCL_OPERATION_TYPE op )
+{
+	return &global_cntx_t_list[op]; // pointer into the static table; op is not range checked
+}
+
+lpgemm_util_cntx_t* lpgemm_util_get_global_cntx_obj( AOCL_UTIL_OPERATION_TYPE op )
+{
+	return &global_util_cntx_t_list[op]; // pointer into the static util table; op is not range checked
+}
+
+dim_t lpgemm_get_block_size_MC_global_cntx( AOCL_OPERATION_TYPE op_type )
+{
+	return global_cntx_t_list[op_type].blksz.MC; // MC stored for op_type
+}
+
+dim_t lpgemm_get_block_size_NC_global_cntx( AOCL_OPERATION_TYPE op_type )
+{
+	return global_cntx_t_list[op_type].blksz.NC; // NC stored for op_type
+}
+
+dim_t lpgemm_get_block_size_KC_global_cntx( AOCL_OPERATION_TYPE op_type )
+{
+	return global_cntx_t_list[op_type].blksz.KC; // KC stored for op_type
+}
+
+dim_t lpgemm_get_block_size_NR_global_cntx( AOCL_OPERATION_TYPE op_type )
+{
+	return global_cntx_t_list[op_type].blksz.NR; // NR stored for op_type
+}
+
+dim_t lpgemm_get_block_size_MR_global_cntx( AOCL_OPERATION_TYPE op_type )
+{
+	return global_cntx_t_list[op_type].blksz.MR; // MR stored for op_type
+}
+
+void lpgemm_get_packa_strides( lpgemm_cntx_t* lcntx, dim_t* rs, dim_t* cs )
+{
+	*rs = lcntx->pack_s.packa_rs; // out: packA row stride held by lcntx
+	*cs = lcntx->pack_s.packa_cs; // out: packA column stride held by lcntx
+}
+
+void lpgemm_get_packb_strides( lpgemm_cntx_t* lcntx, dim_t* rs, dim_t* cs )
+{
+	*rs = lcntx->pack_s.packb_rs; // out: packB row stride held by lcntx
+	*cs = lcntx->pack_s.packb_cs; // out: packB column stride held by lcntx
+}
+
+void lpgemm_mod_block_size_s16 // shrinks *NC / *KC for small n / k; m and *MC are not used
+     (
+       dim_t m,
+       dim_t n,
+       dim_t k,
+       dim_t* MC,
+       dim_t* NC,
+       dim_t* KC
+     )
+{
+	const dim_t range[4] = {1024, 512, 256, 128}; // candidate block sizes, descending
+
+	if (n < *NC)
+	{
+		for (dim_t i = 0; i < 4; ++i)
+		{
+			if (n <= range[i])
+			{
+				*NC = range[i]; // no break: the last hit is the smallest candidate >= n
+			}
+		}
+	}
+
+	if (k < *KC)
+	{
+		for (dim_t i = 0; i < 4; ++i)
+		{
+			if (k <= range[i])
+			{
+				*KC = range[i]; // no break: the last hit is the smallest candidate >= k
+			}
+		}
+	}
+}
diff --git a/addon/aocl_gemm/frame/lpgemm_config.h b/addon/aocl_gemm/config/lpgemm_config.h
similarity index 75%
rename from addon/aocl_gemm/frame/lpgemm_config.h
rename to addon/aocl_gemm/config/lpgemm_config.h
index 7e7f3bb2ad..91863e416a 100644
--- a/addon/aocl_gemm/frame/lpgemm_config.h
+++ b/addon/aocl_gemm/config/lpgemm_config.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -38,10 +38,15 @@
 #include "lpgemm_types.h"
 
 // equals to number of ops in enum AOCL_OPERATION_TYPE.
-extern lpgemm_cntx_t lpgemm_global_cntx_t_list[4];
+extern lpgemm_cntx_t lpgemm_global_cntx_t_list[AOCL_OPERATION_TYPE_LEN]; // one lpgemm context per operation type
+extern lpgemm_util_cntx_t lpgemm_util_global_cntx_t_list[AOCL_UTIL_OPERATION_TYPE_LEN]; // one util context per util operation type
 
 void aocl_lpgemm_init_global_cntx();
 
+lpgemm_cntx_t* lpgemm_get_global_cntx_obj( AOCL_OPERATION_TYPE op );
+
+lpgemm_util_cntx_t* lpgemm_util_get_global_cntx_obj( AOCL_UTIL_OPERATION_TYPE op );
+
 dim_t lpgemm_get_block_size_MC_global_cntx( AOCL_OPERATION_TYPE op_type );
 
 dim_t lpgemm_get_block_size_NC_global_cntx( AOCL_OPERATION_TYPE op_type );
@@ -52,4 +57,18 @@ dim_t lpgemm_get_block_size_NR_global_cntx( AOCL_OPERATION_TYPE op_type );
 
 dim_t lpgemm_get_block_size_MR_global_cntx( AOCL_OPERATION_TYPE op_type );
 
+void lpgemm_get_packa_strides( lpgemm_cntx_t* lcntx, dim_t* rs, dim_t* cs );
+
+void lpgemm_get_packb_strides( lpgemm_cntx_t* lcntx, dim_t* rs, dim_t* cs );
+
+void lpgemm_mod_block_size_s16
+     (
+       dim_t  m,
+       dim_t  n,
+       dim_t  k,
+       dim_t* MC,
+       dim_t* NC,
+       dim_t* KC
+     );
+
 #endif //LPGEMM_CONFIG_H
diff --git a/addon/aocl_gemm/config/lpgemm_func_map.h b/addon/aocl_gemm/config/lpgemm_func_map.h
new file mode 100644
index 0000000000..864f84aef2
--- /dev/null
+++ b/addon/aocl_gemm/config/lpgemm_func_map.h
@@ -0,0 +1,159 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_FUNC_MAP_H
+#define LPGEMM_FUNC_MAP_H
+
+// The XMACRO follows the format ID,FUNC_PTR:
+// ID = One of the AOCL_OPERATION_TYPE enum.
+// FUNC_PTR = Kernel associated with the AOCL_OPERATION_TYPE.
+// It is to be noted that the main macros are defined for combinations
+// of ISA types, and in case a kernel is not implemented for a particular
+// ISA combination, the reference kernel should be set as FUNC_PTR.
+// TODO: Add reference kernels for BF16/VNNI kernels for ISA combinations
+// that are not supported.
+
+// Genoa
+#define LPGEMM_KERN_FUNC_MAP_AVX512_VNNI_BF16 \
+	KMACRO(U8S8S16OS16, lpgemm_rowvar_u8s8s16o16_6x32) \
+	KMACRO(U8S8S32OS32, lpgemm_rowvar_u8s8s32o32_6x64) \
+	KMACRO(F32F32F32OF32, lpgemm_rowvar_f32f32f32of32_avx512_6x64m) \
+	KMACRO(BF16BF16F32OF32, lpgemm_rowvar_bf16bf16f32of32_6x64) \
+	KMACRO(S8S8S32OS32, lpgemm_rowvar_s8s8s32os32_6x64) \
+	KMACRO(S8S8S16OS16, lpgemm_rowvar_s8s8s16o16_6x32) \
+
+#define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI_BF16 \
+	PAMACRO(U8S8S16OS16, NULL) \
+	PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
+	PAMACRO(BF16BF16F32OF32, NULL) \
+	PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
+	PAMACRO(S8S8S16OS16, NULL) \
+
+#define LPGEMM_PACKB_FUNC_MAP_AVX512_VNNI_BF16 \
+	PBMACRO(U8S8S16OS16, packb_nr32_u8s8s16o16) \
+	PBMACRO(U8S8S32OS32, packb_nr64_u8s8s32o32) \
+	PBMACRO(BF16BF16F32OF32, packb_nr64_bf16bf16f32of32) \
+	PBMACRO(S8S8S32OS32, packb_nr64_s8s8s32os32) \
+	PBMACRO(S8S8S16OS16, packb_nr32_s8s8s16o16) \
+
+#define LPGEMM_UTIL_KERN_FUNC_MAP_AVX512_VNNI_BF16 \
+	UMACRO(F32_GELU_TANH, lpgemm_util_f32_gelu_tanh_avx512_kernel) \
+	UMACRO(F32_GELU_ERF, lpgemm_util_f32_gelu_erf_avx512_kernel) \
+	UMACRO(F32_SOFTMAX, lpgemm_util_f32_softmax_avx512_kernel) \
+
+// Icelake
+#define LPGEMM_KERN_FUNC_MAP_AVX512_VNNI \
+	KMACRO(U8S8S16OS16, lpgemm_rowvar_u8s8s16o16_6x32) \
+	KMACRO(U8S8S32OS32, lpgemm_rowvar_u8s8s32o32_6x64) \
+	KMACRO(F32F32F32OF32, lpgemm_rowvar_f32f32f32of32_avx512_6x64m) \
+	KMACRO(BF16BF16F32OF32, lpgemm_rowvar_bf16bf16f32of32_6x64) \
+	KMACRO(S8S8S32OS32, lpgemm_rowvar_s8s8s32os32_6x64) \
+	KMACRO(S8S8S16OS16, lpgemm_rowvar_s8s8s16o16_6x32) \
+
+#define LPGEMM_PACKA_FUNC_MAP_AVX512_VNNI \
+	PAMACRO(U8S8S16OS16, NULL) \
+	PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
+	PAMACRO(BF16BF16F32OF32, NULL) \
+	PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
+	PAMACRO(S8S8S16OS16, NULL) \
+
+#define LPGEMM_PACKB_FUNC_MAP_AVX512_VNNI \
+	PBMACRO(U8S8S16OS16, packb_nr32_u8s8s16o16) \
+	PBMACRO(U8S8S32OS32, packb_nr64_u8s8s32o32) \
+	PBMACRO(BF16BF16F32OF32, packb_nr64_bf16bf16f32of32) \
+	PBMACRO(S8S8S32OS32, packb_nr64_s8s8s32os32) \
+	PBMACRO(S8S8S16OS16, packb_nr32_s8s8s16o16) \
+
+#define LPGEMM_UTIL_KERN_FUNC_MAP_AVX512_VNNI \
+	UMACRO(F32_GELU_TANH, lpgemm_util_f32_gelu_tanh_avx512_kernel) \
+	UMACRO(F32_GELU_ERF, lpgemm_util_f32_gelu_erf_avx512_kernel) \
+	UMACRO(F32_SOFTMAX, lpgemm_util_f32_softmax_avx512_kernel) \
+
+// Skylake
+#define LPGEMM_KERN_FUNC_MAP_AVX512 \
+	KMACRO(U8S8S16OS16, lpgemm_rowvar_u8s8s16o16_6x32) \
+	KMACRO(U8S8S32OS32, lpgemm_rowvar_u8s8s32o32_6x64) \
+	KMACRO(F32F32F32OF32, lpgemm_rowvar_f32f32f32of32_avx512_6x64m) \
+	KMACRO(BF16BF16F32OF32, lpgemm_rowvar_bf16bf16f32of32_6x64) \
+	KMACRO(S8S8S32OS32, lpgemm_rowvar_s8s8s32os32_6x64) \
+	KMACRO(S8S8S16OS16, lpgemm_rowvar_s8s8s16o16_6x32) \
+
+#define LPGEMM_PACKA_FUNC_MAP_AVX512 \
+	PAMACRO(U8S8S16OS16, NULL) \
+	PAMACRO(U8S8S32OS32, packa_k64_u8s8s32o32) \
+	PAMACRO(BF16BF16F32OF32, NULL) \
+	PAMACRO(S8S8S32OS32, packa_k64_s8s8s32os32) \
+	PAMACRO(S8S8S16OS16, NULL) \
+
+#define LPGEMM_PACKB_FUNC_MAP_AVX512 \
+	PBMACRO(U8S8S16OS16, packb_nr32_u8s8s16o16) \
+	PBMACRO(U8S8S32OS32, packb_nr64_u8s8s32o32) \
+	PBMACRO(BF16BF16F32OF32, packb_nr64_bf16bf16f32of32) \
+	PBMACRO(S8S8S32OS32, packb_nr64_s8s8s32os32) \
+	PBMACRO(S8S8S16OS16, packb_nr32_s8s8s16o16) \
+
+#define LPGEMM_UTIL_KERN_FUNC_MAP_AVX512 \
+	UMACRO(F32_GELU_TANH, lpgemm_util_f32_gelu_tanh_avx512_kernel) \
+	UMACRO(F32_GELU_ERF, lpgemm_util_f32_gelu_erf_avx512_kernel) \
+	UMACRO(F32_SOFTMAX, lpgemm_util_f32_softmax_avx512_kernel) \
+
+// Milan, Haswell
+#define LPGEMM_KERN_FUNC_MAP_AVX2 \
+	KMACRO(U8S8S16OS16, lpgemm_rowvar_u8s8s16o16_6x32) \
+	KMACRO(U8S8S32OS32, NULL) \
+	KMACRO(F32F32F32OF32, lpgemm_rowvar_f32f32f32of32_6x16m) \
+	KMACRO(BF16BF16F32OF32, NULL) \
+	KMACRO(S8S8S32OS32, NULL) \
+	KMACRO(S8S8S16OS16, lpgemm_rowvar_s8s8s16o16_6x32) \
+
+#define LPGEMM_PACKA_FUNC_MAP_AVX2 \
+	PAMACRO(U8S8S16OS16, NULL) \
+	PAMACRO(U8S8S32OS32, NULL) \
+	PAMACRO(BF16BF16F32OF32, NULL) \
+	PAMACRO(S8S8S32OS32, NULL) \
+	PAMACRO(S8S8S16OS16, NULL) \
+
+#define LPGEMM_PACKB_FUNC_MAP_AVX2 \
+	PBMACRO(U8S8S16OS16, packb_nr32_u8s8s16o16) \
+	PBMACRO(U8S8S32OS32, NULL) \
+	PBMACRO(BF16BF16F32OF32, NULL) \
+	PBMACRO(S8S8S32OS32, NULL) \
+	PBMACRO(S8S8S16OS16, packb_nr32_s8s8s16o16) \
+
+#define LPGEMM_UTIL_KERN_FUNC_MAP_AVX2 \
+	UMACRO(F32_GELU_TANH, lpgemm_util_f32_gelu_tanh_avx2_kernel) \
+	UMACRO(F32_GELU_ERF, lpgemm_util_f32_gelu_erf_avx2_kernel) \
+	UMACRO(F32_SOFTMAX, lpgemm_util_f32_softmax_avx2_kernel) \
+
+#endif //LPGEMM_FUNC_MAP_H
diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
index 5db523f987..1ece1db727 100644
--- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
+++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_bf16.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,14 +40,36 @@
 #include "lpgemm_thrinfo_utils.h"
 #include "lpgemm_config.h"
 
+// Kernel function prototypes
+typedef void (*lpgemm_rowvar_bf16)
+     (
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const bfloat16*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const bfloat16*,
+       const dim_t,
+       const dim_t,
+       float*,
+       const dim_t,
+       const dim_t,
+       const float,
+       const float,
+       lpgemm_post_op*,
+       lpgemm_post_op_attr
+     );
+
 // B should always be packed.
 LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 {
-	dim_t NC = lpgemm_get_block_size_NC_global_cntx( BF16BF16F32OF32 );
-	dim_t KC = lpgemm_get_block_size_KC_global_cntx( BF16BF16F32OF32 );
-	dim_t MC = lpgemm_get_block_size_MC_global_cntx( BF16BF16F32OF32 );
-	dim_t NR = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 );
-	dim_t MR = lpgemm_get_block_size_MR_global_cntx( BF16BF16F32OF32 );
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t MC = lcntx->blksz.MC;
+	dim_t NR = lcntx->blksz.NR;
+	dim_t MR = lcntx->blksz.MR;
 
 	const int16_t* a_use = NULL;
 	dim_t cs_a_use = cs_a;
@@ -80,9 +102,22 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 	dim_t k_updated = k;
 	k_updated += (k_updated & 0x1);
 
-	// Is required to decide whether to apply post ops or not.
+	// To decide whether to apply post ops or not.
 	bool is_last_k = FALSE;
 
+	// To decide whether to use original bf16 C or temp buffer for beta scale.
+	bool is_first_k = FALSE;
+
+	lpgemm_post_op_attr post_ops_attr;
+	if ( c_downscale == TRUE )
+	{
+		post_ops_attr.buf_downscale = c;
+	}
+	else
+	{
+		post_ops_attr.buf_downscale = NULL;
+	}
+
 	// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
 	thrinfo_t thread_jc;
 	thrinfo_t thread_ic;
@@ -102,7 +137,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 
 		dim_t jc_cur_loop = jc;
 		dim_t jc_cur_loop_rem = 0;
-		dim_t n_sub_updated;
+		dim_t n_sub_updated = 0;
 
 		if ( mtag_b == REORDERED )
 		{
@@ -121,45 +156,24 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 		// Temp accumulaton buffer for C allocation.
 		else if ( c_downscale == TRUE )
 		{
-			mem_scale_c_size_req = sizeof( float ) * nc0 * ( ic_end - ic_start );
-
-			lpgemm_alloc_mem_panel
-			(
-			  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
-			  &mem_scale_c, rntm
-			);
+			// Buffer memory is only required if output needs to be
+			// persisted across iterations of the pc/KC loop.
+			// It was observed that the locks used while checking out
+			// a buffer from memory pool had an impact on performance
+			// and is better to not checkout if k <= KC.
+			if ( k > KC )
+			{
+				mem_scale_c_size_req = sizeof( float ) * nc0 * ( ic_end - ic_start );
 
-			temp_scal_c_buffer_bf16 = bli_mem_buffer( &mem_scale_c );
+				lpgemm_alloc_mem_panel
+				(
+			  	 mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
+			  	 &mem_scale_c, rntm
+				);
 
-			c_use_jc = ( float* )temp_scal_c_buffer_bf16;
+				temp_scal_c_buffer_bf16 = bli_mem_buffer( &mem_scale_c );
 
-			if ( beta != 0 )
-			{
-				dim_t i_temp = 0;
-				dim_t j_temp = 0;
-				int32_t temp_conv_buf = 0;
-				// Upscale out C to temporary C matrix.
-				for ( dim_t i_dscale = ic_start; i_dscale < ic_end; ++i_dscale )
-				{
-					j_temp = 0;
-					for ( dim_t j_dscale = jc; j_dscale < ( jc + nc0 ); ++j_dscale )
-					{
-						// Implemented with the idea sizeof(float)=4.
-						temp_conv_buf = 0;
-						temp_conv_buf = *( ( int16_t* )( ( bfloat16* )c +
-										( rs_c * i_dscale ) + j_dscale ) );
-
-						// Add 16 bits in the fractional part.
-						temp_conv_buf = temp_conv_buf << 16;
-
-						// Store the bytes in float format.
-						*( temp_scal_c_buffer_bf16 + ( nc0 * i_temp ) + j_temp )
-								= *( ( float* )( &temp_conv_buf ) );
-
-						j_temp++;
-					}
-					i_temp++;
-				}
+				c_use_jc = ( float* )temp_scal_c_buffer_bf16;
 			}
 
 			// The temp c buffer stride is modified as opposed to original C matrix.
@@ -171,6 +185,13 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 			float beta0 = ( pc == 0 ) ? beta : 1;
 			dim_t kc0 = bli_min( ( k - pc ), KC );
 
+			// No parallelization in k dim, k always starts at 0.
+			is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_first_k = is_first_k;
+
+			is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_last_k = is_last_k;
+
 			// kc0 needs to be a multiple of 2 so that it can be
 			// used with dpbf16_ps instruction. Padding is added in
 			// cases this condition is not satisfied, and therefore
@@ -179,8 +200,6 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 			dim_t kc0_updated = kc0;
 			kc0_updated += (kc0_updated & 0x1);
 
-			is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
-
 			if ( mtag_b == PACK )
 			{
 				// Pack B chunks are based on jc work id.
@@ -235,8 +254,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 				if ( ( jc_packb_end > jc_packb_start ) &&
 					 ( jc_packb_start < ( jc + nc0 ) ) )
 				{
-#ifdef BLIS_KERNELS_ZEN4
-					packb_nr64_bf16bf16f32of32
+					( ( packb_bf16 )lcntx->packb_fun_ptr )
 					(
 					  pack_b_buffer_bf16 + ( jc_packb_start * kc0_updated ),
 					  ( b + ( rs_b * pc ) + ( cs_b * jc ) +
@@ -244,11 +262,10 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 					  ( jc_packb_end - jc_packb_start ), kc0,
 					  &rs_b_use, &cs_b_use
 					);
-#endif
 				}
 				else
 				{
-					get_packb_nr64_bf16bf16f32of32_strides( &rs_b_use, &cs_b_use );
+					lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
 				}
 
 				// All threads in work group should wait till B matrix packing
@@ -271,7 +288,7 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 						( n_sub_updated * pc ) +
 						( jc_cur_loop_rem * kc0_updated );
 
-				get_packb_nr64_bf16bf16f32of32_strides( &rs_b_use, &cs_b_use );
+				lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
 			}
 
 			for ( dim_t ic = ic_start; ic < ic_end; ic += MC )
@@ -304,30 +321,21 @@ LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32)
 				{
 					dim_t nr0 = bli_min( ( nc0 - jr ), NR );
 
-#ifdef BLIS_KERNELS_ZEN4
+					// Post ops meta attributes.
+					post_ops_attr.post_op_c_i = ic;
+					post_ops_attr.post_op_c_j = ( jc + jr );
+					post_ops_attr.rs_c_downscale = rs_c_downscale;
+
 					// Reorder/Packed B, Reorder/Packed/Unpacked A call.
-					lpgemm_rowvar_bf16bf16f32of32_6x64 
+					( ( lpgemm_rowvar_bf16 )lcntx->kern_fun_ptr )
 					(
 					  mc0, nr0, kc0,
 					  a_use, rs_a, cs_a_use, a_block_stride,
 					  ( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use,
 					  ( c_use_ic + jr ), rs_c_use, 1,
 					  alpha, beta0,
-					  is_last_k, ic, ( jc + jr ), post_op_list, rs_c_downscale
+					  post_op_list, post_ops_attr
 					);
-#else
-					// Silence compiler warnings.
-					( void )b_use;
-					( void )a_block_stride;
-					( void )rs_c_downscale;
-					( void )is_last_k;
-					( void )c_use_ic;
-					( void )a_use;
-					( void )beta0;
-					( void )nr0;
-					( void )mc0;
-					( void )cs_a_use;
-#endif
 				}
 			}
 		}
diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c
index 5bb217facd..b90d339664 100644
--- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c
+++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.c
@@ -1,180 +1,169 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "lpgemm_utils.h"
-#include "lpgemm_reorder_bf16.h"
-#include "lpgemm_packb_bf16.h"
-#include "lpgemm_config.h"
-#include "aocl_bf16_type.h"
-
-void reorderb_nr64_bf16bf16f32of32
-  (
-    lpgemm_obj_t *b,
-    lpgemm_obj_t *b_reorder
-  )
-{   
-	dim_t NC = lpgemm_get_block_size_NC_global_cntx( BF16BF16F32OF32 );
-	dim_t NR = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 );
-	dim_t KC = lpgemm_get_block_size_KC_global_cntx( BF16BF16F32OF32 );
-
-	// Extracting the matrix properties from the lpgemm object
-	dim_t rs_b = b->rs;
-	dim_t n = b->width;
-	dim_t k = b->length;
-
-	dim_t rs_b_reorder;
-	dim_t cs_b_reorder;
-
-	// k needs to be a multiple of 2 so that it can be used with dpbf
-	// instruction. Padding is added in cases this condition is not
-	// satisfied, and therefore the k offset used for packed/reordered
-	// buffer needs to be updated.
-	dim_t k_updated = k;
-	k_updated += (k_updated & 0x1);
-
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_g;
-	bli_rntm_init_from_global( &rntm_g );
-
-	dim_t n_threads = bli_rntm_num_threads( &rntm_g );
-	n_threads = ( n_threads > 0 ) ? n_threads : 1;
-
-#ifdef BLIS_ENABLE_OPENMP
-	_Pragma( "omp parallel num_threads(n_threads)" )
-	{
-		// Initialise a local thrinfo obj for work split across threads.
-		thrinfo_t thread_jc;
-		bli_thrinfo_set_n_way( n_threads, &thread_jc );
-		bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
-#else
-	{
-		// Initialise a local thrinfo obj for work split across threads.
-		thrinfo_t thread_jc;
-		bli_thrinfo_set_n_way( 1, &thread_jc );
-		bli_thrinfo_set_work_id( 0, &thread_jc );
-#endif
-		// Compute the JC loop thread range for the current thread.
-		dim_t jc_start, jc_end;
-		bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
-
-		for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
-		{
-			dim_t nc0 = bli_min( ( jc_end - jc ), NC );
-
-			dim_t jc_cur_loop = jc;
-			dim_t jc_cur_loop_rem = 0;
-			dim_t n_sub_updated;
-
-			get_B_panel_reordered_start_offset_width
-			(
-			  jc, n, NC, 16,
-			  &jc_cur_loop, &jc_cur_loop_rem,
-			  &nc0, &n_sub_updated
-			);
-
-			for ( dim_t pc = 0; pc < k; pc += KC )
-			{
-				dim_t kc0 = bli_min( ( k - pc ), KC );
-
-				// k needs to be a multiple of 2 so that it can be used with dpbf
-				// instruction. Padding is added in cases this condition is not
-				// satisfied, and therefore the k offset used for packed/reordered
-				// buffer needs to be updated.
-				dim_t kc0_updated = kc0;
-				kc0_updated += (kc0_updated & 0x1);
-
-				// The offsets are calculated in such a way that it resembles
-				// the reorder buffer traversal in single threaded reordering.
-				// The panel boundaries (KCxNC) remain as it is accessed in
-				// single thread, and as a consequence a thread with jc_start
-				// inside the panel cannot consider NC range for reorder. It
-				// has to work with NC' < NC, and the offset is calulated using
-				// prev NC panels spanning k dim + cur NC panel spaning pc loop
-				// cur iteration + (NC - NC') spanning current kc0 (<= KC).
-				//
-				//Eg: Consider the following reordered buffer diagram:
-				//          t1              t2
-				//          |               |
-				//          |           |..NC..|
-				//          |           |      |
-				//          |.NC. |.NC. |NC'|NC"
-				//     pc=0-+-----+-----+---+--+
-				//        KC|     |     |   |  |
-				//          |  1  |  3  |   5  |
-				//    pc=KC-+-----+-----+---st-+
-				//        KC|     |     |   |  |
-				//          |  2  |  4  | 6 | 7|
-				// pc=k=2KC-+-----+-----+---+--+
-				//          |jc=0 |jc=NC|jc=2NC|
-				//
-				// The numbers 1,2..6,7 denotes the order in which reordered
-				// KCxNC blocks are stored in memory, ie: block 1 followed by 2
-				// followed by 3, etc. Given two threads t1 and t2, and t2 needs
-				// to acces point st in the reorder buffer to write the data:
-				// The offset calulation logic will be:
-				// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
-				// n_sub_updated = NC, k = 2KC, kc0_updated = KC
-				//
-				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
-				//    + ( n_sub_updated * pc ) <traverse block 5>
-				//    + ( NC' * kc0_updated)   <traverse block 6>
-#ifdef BLIS_KERNELS_ZEN4
-				// B should always be packed.
-				packb_nr64_bf16bf16f32of32
-				(
-				  ( ( ( bfloat16* )b_reorder->storage.aligned_buffer ) +
-					( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
-					( jc_cur_loop_rem * kc0_updated ) ),
-				  ( ( ( bfloat16* )b->storage.aligned_buffer ) +
-					( rs_b * pc ) + jc ),
-				  rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
-				);
-#else
-				// Silence compiler warnings.
-				rs_b_reorder = 0;
-				cs_b_reorder = 0;
-				( void )rs_b;
-#endif
-			}
-
-			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
-		}
-	}
-
-	b_reorder->rs = rs_b_reorder;
-	b_reorder->cs = cs_b_reorder;
-	b_reorder->mtag = REORDERED;
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "lpgemm_utils.h"
+#include "lpgemm_reorder_bf16.h"
+#include "lpgemm_packb_bf16.h"
+#include "lpgemm_config.h"
+#include "aocl_bf16_type.h"
+
+// Reorders B into b_reorder's buffer in KCxNC panels via lcntx->packb_fun_ptr,
+// splitting the n dimension across the rntm thread count; tags it REORDERED.
+void reorderb_nr64_bf16bf16f32of32
+     (
+       lpgemm_obj_t*  b,
+       lpgemm_obj_t*  b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
+     )
+{
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
+
+	// Extracting the matrix properties from the lpgemm object
+	dim_t rs_b = b->rs;
+	dim_t n = b->width;
+	dim_t k = b->length;
+
+	// Zero-init: these stay unwritten when n or k is 0 (loops never run).
+	dim_t rs_b_reorder = 0, cs_b_reorder = 0;
+
+	// k must be a multiple of 2 for the dpbf instruction; odd k is padded,
+	// so the k offset used for the packed/reordered buffer is rounded up.
+	dim_t k_updated = k;
+	k_updated += (k_updated & 0x1);
+
+	dim_t n_threads = bli_rntm_num_threads( rntm );
+	n_threads = ( n_threads > 0 ) ? n_threads : 1;
+
+#ifdef BLIS_ENABLE_OPENMP
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( n_threads, &thread_jc );
+		bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
+#else
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( 1, &thread_jc );
+		bli_thrinfo_set_work_id( 0, &thread_jc );
+#endif
+		// Compute the JC loop thread range for the current thread.
+		dim_t jc_start, jc_end;
+		bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+
+		for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+		{
+			dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+
+			dim_t jc_cur_loop = jc;
+			dim_t jc_cur_loop_rem = 0;
+			dim_t n_sub_updated = 0;
+
+			get_B_panel_reordered_start_offset_width
+			(
+			  jc, n, NC, 16,
+			  &jc_cur_loop, &jc_cur_loop_rem,
+			  &nc0, &n_sub_updated
+			);
+
+			for ( dim_t pc = 0; pc < k; pc += KC )
+			{
+				dim_t kc0 = bli_min( ( k - pc ), KC );
+
+				// k needs to be a multiple of 2 so that it can be used with dpbf
+				// instruction. Padding is added in cases this condition is not
+				// satisfied, and therefore the k offset used for packed/reordered
+				// buffer needs to be updated.
+				dim_t kc0_updated = kc0;
+				kc0_updated += (kc0_updated & 0x1);
+
+				// The offsets are calculated in such a way that it resembles
+				// the reorder buffer traversal in single threaded reordering.
+				// The panel boundaries (KCxNC) remain as it is accessed in
+				// single thread, and as a consequence a thread with jc_start
+				// inside the panel cannot consider NC range for reorder. It
+				// has to work with NC' < NC, and the offset is calculated using
+				// prev NC panels spanning k dim + cur NC panel spanning pc loop
+				// cur iteration + (NC - NC') spanning current kc0 (<= KC).
+				//
+				//Eg: Consider the following reordered buffer diagram:
+				//          t1              t2
+				//          |               |
+				//          |           |..NC..|
+				//          |           |      |
+				//          |.NC. |.NC. |NC'|NC"
+				//     pc=0-+-----+-----+---+--+
+				//        KC|     |     |   |  |
+				//          |  1  |  3  |   5  |
+				//    pc=KC-+-----+-----+---st-+
+				//        KC|     |     |   |  |
+				//          |  2  |  4  | 6 | 7|
+				// pc=k=2KC-+-----+-----+---+--+
+				//          |jc=0 |jc=NC|jc=2NC|
+				//
+				// The numbers 1,2..6,7 denotes the order in which reordered
+				// KCxNC blocks are stored in memory, ie: block 1 followed by 2
+				// followed by 3, etc. Given two threads t1 and t2, and t2 needs
+				// to access point st in the reorder buffer to write the data:
+				// The offset calculation logic will be:
+				// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
+				// n_sub_updated = NC, k = 2KC, kc0_updated = KC
+				//
+				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
+				//    + ( n_sub_updated * pc ) <traverse block 5>
+				//    + ( NC' * kc0_updated)   <traverse block 6>
+				( ( packb_bf16 )lcntx->packb_fun_ptr )
+				(
+				  ( ( ( bfloat16* )b_reorder->storage.aligned_buffer ) +
+					( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
+					( jc_cur_loop_rem * kc0_updated ) ),
+				  ( ( ( bfloat16* )b->storage.aligned_buffer ) +
+					( rs_b * pc ) + jc ),
+				  rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
+				);
+			}
+
+			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+		}
+	}
+
+	b_reorder->rs = rs_b_reorder;
+	b_reorder->cs = cs_b_reorder;
+	b_reorder->mtag = REORDERED;
+}
diff --git a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h
index c1b83c1b75..42c8cb9ef6 100644
--- a/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h
+++ b/addon/aocl_gemm/frame/bf16bf16f32/lpgemm_reorder_bf16.h
@@ -1,46 +1,48 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef LPGEMM_REORDER_BF16_H
-#define LPGEMM_REORDER_BF16_H
-
-#include "lpgemm_types.h"
-
-void reorderb_nr64_bf16bf16f32of32    
-  (
-    lpgemm_obj_t *b,
-    lpgemm_obj_t *b_reorder
-  );
-
-#endif // LPGEMM_REORDER_H
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_REORDER_BF16_H
+#define LPGEMM_REORDER_BF16_H
+
+#include "lpgemm_types.h"
+
+void reorderb_nr64_bf16bf16f32of32
+     (
+       lpgemm_obj_t * b,
+       lpgemm_obj_t * b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
+     );
+
+#endif // LPGEMM_REORDER_BF16_H
diff --git a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
index 6242ceebe8..1864d78330 100644
--- a/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
+++ b/addon/aocl_gemm/frame/f32f32f32/lpgemm_f32f32f32.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -37,6 +37,29 @@
 #include "lpgemm_types.h"
 #include "lpgemm_utils.h"
 #include "lpgemm_thrinfo_utils.h"
+#include "lpgemm_kernels.h"
+
+// Kernel function prototypes
+typedef void (*lpgemm_rowvar_f32)
+     (
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const float*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const float*,
+       const dim_t,
+       const dim_t,
+       float*,
+       const dim_t,
+       const dim_t,
+       const float,
+       const float,
+       lpgemm_post_op*,
+       lpgemm_post_op_attr
+     );
 
 void lpgemm_pack_a_f32f32f32of32
      (
@@ -51,197 +74,338 @@ void lpgemm_pack_a_f32f32f32of32
        cntx_t*      cntx
      );
 
+void lpgemm_pack_b_f32f32f32of32
+     (
+       const float* input_buf_addr_b,
+       float*       reorder_buf_addr_b,
+       const dim_t  n,
+       const dim_t  k,
+       const dim_t  rs_b,
+       const dim_t  cs_b,
+       const dim_t  ps_p,
+       const dim_t  NR,
+       cntx_t*      cntx
+     );
+
 LPGEMM_5LOOP(float,float,float,f32f32f32of32)
 {
-	// Query the global cntx.
-	cntx_t* cntx = bli_gks_query_cntx();
-
-	num_t dt = BLIS_FLOAT;
-
-	// Query the context for various blocksizes.
-	const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
-	const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
-	const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
-	const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
-	const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
-
-	// Strides are updated based on matrix packing/reordering.
-	const float* a_use = NULL;
-	dim_t rs_a_use = rs_a;
-	dim_t cs_a_use = cs_a;
-
-	const float* b_use = NULL;
-	dim_t rs_b_use = rs_b;
-	dim_t cs_b_use = cs_b;
-
-	float* c_use_jc = NULL;
-	float* c_use_ic = NULL;
-
-	// Only supporting row major with unit column strided C for now.
-	const dim_t cs_c_use = 1;
-
-	/* Compute partitioning step values for each matrix of each loop. */
-	inc_t ps_a_use;
-	inc_t ps_b_use;
-	auxinfo_t aux;
-
-	// Check if packing of A is required.
-	bool should_pack_A = bli_rntm_pack_a( rntm );
-
-	// Pack buffer for A.
-	float* pack_a_buffer_f32f32f32of32;
-	mem_t mem_a = BLIS_MEM_INITIALIZER;
-	siz_t mem_a_size_req = 0;
-
-	float one_local = *PASTEMAC(s,1);
-
-	trans_t transc = BLIS_NO_TRANSPOSE;
-	conj_t conjc = bli_extract_conj( transc );
-
-	// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
-	thrinfo_t thread_jc;
-	thrinfo_t thread_ic;
-
-	lpgemm_gen_thrinfo( thread, &thread_jc, &thread_ic );
-
-	// Compute the JC loop thread range for the current thread.
-	dim_t jc_start, jc_end;
-	bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
-
-	for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
-	{
-		dim_t nc0 = bli_min( ( jc_end - jc ), NC );
-		c_use_jc = c + jc;
-
-		dim_t jc_cur_loop = jc;
-		dim_t jc_cur_loop_rem = 0;
-		dim_t n_sub_updated;
-
-		if ( mtag_b == REORDERED )
-		{
-			get_B_panel_reordered_start_offset_width
-			(
-			  jc, n, NC, NR,
-			  &jc_cur_loop, &jc_cur_loop_rem,
-			  &nc0, &n_sub_updated
-			);
-		}
-
-		for ( dim_t pc = 0; pc < k; pc += KC )
-		{
-			float beta0 = ( pc == 0 ) ? beta : one_local;
-			dim_t kc0 = bli_min( ( k - pc ), KC );
-
-			if ( mtag_b == REORDERED )
-			{
-				// In multi-threaded scenarios, an extra offset into a given
-				// packed B panel is required, since the jc loop split can
-				// result in per thread start offset inside the panel, instead
-				// of panel boundaries.
-				b_use = b + ( jc_cur_loop * k ) +
-						( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 );
-
-				rs_b_use = NR;
-				cs_b_use = 1;
-				ps_b_use = kc0;
-			}
-			else
-			{
-				b_use = b + ( pc * rs_b ) + ( jc * cs_b );
-				ps_b_use = 1;
-			}
-
-			dim_t ic_start, ic_end;
-			bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
-
-			for ( dim_t ic = ic_start; ic < ic_end; ic += MC )
-			{
-				dim_t mc0 = bli_min( ( ic_end - ic ), MC );
-				c_use_ic = c_use_jc + ( rs_c * ic );
-
-				if ( mtag_a == REORDERED )
-				{
-					// Extra space since packing does width in multiples of MR.
-					const dim_t m_updated = ( ( m + MR - 1 ) / MR ) * MR;
-					a_use = a + ( pc * m_updated ) + ( kc0 * ic );
-
-					rs_a_use = 1;
-					cs_a_use = MR;
-					ps_a_use = MR * kc0;
-				}
-				else if ( should_pack_A == TRUE )
-				{
-					// Extra space since packing does width in multiples of MR.
-					const dim_t mc0_updated = ( ( mc0 + MR - 1 ) / MR ) * MR;
-					mem_a_size_req = sizeof( float ) * mc0_updated * kc0;
-
-					lpgemm_alloc_mem_panel
-					(
-					  mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK,
-					  &mem_a, rntm
-					);
-					pack_a_buffer_f32f32f32of32 = ( float* )bli_mem_buffer( &mem_a );
-
-					rs_a_use = 1;
-					cs_a_use = MR;
-					ps_a_use = MR * kc0;
-
-					lpgemm_pack_a_f32f32f32of32
-					(
-					  ( a + ( rs_a * ic ) + pc ),
-					  pack_a_buffer_f32f32f32of32,
-					  mc0, kc0,
-					  rs_a, cs_a, ps_a_use, MR,
-					  cntx
-					);
-
-					a_use = pack_a_buffer_f32f32f32of32;
-				}
-				else
-				{
-					a_use = a + ( rs_a * ic ) + pc;
-					ps_a_use = MR * rs_a;
-				}
-
-				// Embed the panel stride of A within the auxinfo_t object. The
-				// millikernel will query and use this to iterate through
-				// micropanels of A (if needed).
+    // Query the global cntx.
+    cntx_t* cntx = bli_gks_query_cntx();
+
+    num_t dt = BLIS_FLOAT;
+
+    // Query the context for various blocksizes.
+    const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
+    const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
+    const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
+    const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
+    const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
+
+    /* TODO: Based on the context, the 6x64m or 6x16m kernel will be picked here. */
+    
+    // Strides are updated based on matrix packing/reordering.
+    const float* a_use = NULL;
+    dim_t rs_a_use = rs_a;
+    dim_t cs_a_use = cs_a;
+
+    const float* b_use = NULL;
+    dim_t rs_b_use = rs_b;
+    dim_t cs_b_use = cs_b;
+
+    float* c_use_jc = NULL;
+    float* c_use_ic = NULL;
+
+    dim_t rs_c_downscale = rs_c;
+
+    // Only supporting row major with unit column strided C for now.
+    const dim_t cs_c_use = 1;
+
+    /* Compute partitioning step values for each matrix of each loop. */
+    inc_t ps_a_use;
+    inc_t ps_b_use;
+    auxinfo_t aux;
+
+    // Check if packing of A is required.
+    bool should_pack_A = bli_rntm_pack_a( rntm );
+
+    // Pack buffer for A.
+    float* pack_a_buffer_f32f32f32of32;
+    mem_t mem_a = BLIS_MEM_INITIALIZER;
+    siz_t mem_a_size_req = 0;
+
+    // Check if packing of B is required.
+    bool should_pack_B = bli_rntm_pack_b( rntm );
+
+    // Pack buffer for B.
+    float* pack_b_buffer_f32f32f32of32;
+    mem_t mem_b = BLIS_MEM_INITIALIZER;
+    siz_t mem_b_size_req = 0;
+
+    float one_local = *PASTEMAC(s,1);
+
+    // To decide whether to apply post ops or not.
+    bool is_last_k = FALSE;
+
+    // To decide whether to use the original C buffer or a temp buffer for beta scale.
+    bool is_first_k = FALSE;
+
+    lpgemm_post_op_attr post_ops_attr;
+    if ( c_downscale == TRUE )
+    {
+        post_ops_attr.buf_downscale = c;
+    }
+    else
+    {
+        post_ops_attr.buf_downscale = NULL;
+    }
+
+    // Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
+    thrinfo_t thread_jc;
+    thrinfo_t thread_ic;
+
+    lpgemm_gen_thrinfo( thread, &thread_jc, &thread_ic );
+
+    // Compute the JC loop thread range for the current thread.
+    dim_t jc_start, jc_end;
+    bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+
+    for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+    {
+        dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+        c_use_jc = c + jc;
+
+        dim_t jc_cur_loop = jc;
+        dim_t jc_cur_loop_rem = 0;
+        dim_t n_sub_updated = 0;
+
+        if ( mtag_b == REORDERED )
+        {
+            get_B_panel_reordered_start_offset_width
+            (
+              jc, n, NC, NR,
+              &jc_cur_loop, &jc_cur_loop_rem,
+              &nc0, &n_sub_updated
+            );
+        }
+
+        for ( dim_t pc = 0; pc < k; pc += KC )
+        {
+            float beta0 = ( pc == 0 ) ? beta : one_local;
+            dim_t kc0 = bli_min( ( k - pc ), KC );
+
+            // No parallelization in k dim, k always starts at 0.
+            is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE );
+            post_ops_attr.is_first_k = is_first_k;
+
+            is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
+            post_ops_attr.is_last_k = is_last_k;
+
+            if ( ( mtag_b == PACK ) && ( should_pack_B == TRUE ) )
+            {
+                // Pack B chunks are based on jc work id.
+                dim_t jc_work_id = bli_thread_work_id( &thread_jc );
+
+                // Using child thrinfo (thread_ic) tid to decide chief thread
+                // per B matrix chunk (jc work id group)
+                if ( bli_thread_am_ochief( &thread_ic ) )
+                {
+                    // nc0 needs to be a multiple of NR since this gives maximum
+                    // vectorization. Packing B always results in buffers with width
+                    // which is a multiple of NR. Subsequently the nc0 offsets used
+                    // for packed/reordered buffers need to be updated.
+                    dim_t nc0_updated = make_multiple_of_n( nc0, NR );
+                    mem_b_size_req = sizeof( float ) * nc0_updated * kc0;
+
+                    lpgemm_alloc_mem_panel
+                    (
+                      mem_b_size_req, BLIS_BUFFER_FOR_B_PANEL,
+                      &mem_b, rntm
+                    );
+
+                    thread->comm[jc_work_id].sent_object = bli_mem_buffer(&mem_b);
+                }
+
+                // All threads in work group should wait till chief thread has
+                // finished allocating the packing buffers.
+                bli_thrcomm_barrier
+                (
+                  bli_thread_ocomm_id( &thread_ic ),
+                  &thread->comm[jc_work_id]
+                );
+
+                pack_b_buffer_f32f32f32of32 =
+                              ( float* ) thread->comm[jc_work_id].sent_object;
+                // Set the strides for pack buffer.
+                rs_b_use = NR;
+                cs_b_use = 1;
+                ps_b_use = kc0;
+
+                // Compute the B panel per thread loop range for parallel
+                // packing using ic_ways number of threads. Since at most
+                // ic_ways threads can be used, the thread_ic attributes are
+                // used to split the loop range.
+                dim_t jc_packb_start, jc_packb_end;
+                bli_thread_range_sub
+                (
+                  &thread_ic, nc0, NR, FALSE,
+                  &jc_packb_start, &jc_packb_end
+                );
+
+                // Ensure thread ranges are valid, especially in cases where the
+                // number of threads available for parallelization is greater than
+                // the number of B panel NR chunks.
+                if ( ( jc_packb_end > jc_packb_start ) &&
+                     ( jc_packb_start < ( jc + nc0 ) ) )
+                {
+                    lpgemm_pack_b_f32f32f32of32
+                    (
+                      ( b + ( rs_b * pc ) + ( cs_b * jc ) + ( cs_b * jc_packb_start ) ),
+                      pack_b_buffer_f32f32f32of32 + ( jc_packb_start * kc0 ),
+                      ( jc_packb_end - jc_packb_start ), kc0,
+                      rs_b, cs_b, ( NR * ps_b_use ), NR,
+                      cntx
+                    );
+                }
+
+                // All threads in work group should wait till B matrix packing
+                // is completed by the participating threads.
+                bli_thrcomm_barrier
+                (
+                  bli_thread_ocomm_id( &thread_ic ),
+                  &thread->comm[jc_work_id]
+                );
+                b_use = pack_b_buffer_f32f32f32of32;
+            }
+            else if ( mtag_b == REORDERED )
+            {
+                // In multi-threaded scenarios, an extra offset into a given
+                // packed B panel is required, since the jc loop split can
+                // result in per thread start offset inside the panel, instead
+                // of panel boundaries.
+                b_use = b + ( jc_cur_loop * k ) +
+                        ( n_sub_updated * pc ) + ( jc_cur_loop_rem * kc0 );
+
+                rs_b_use = NR;
+                cs_b_use = 1;
+                ps_b_use = kc0;
+            }
+            else
+            {
+                b_use = b + ( pc * rs_b ) + ( jc * cs_b );
+                ps_b_use = 1;
+            }
+
+            dim_t ic_start, ic_end;
+            bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
+
+            for ( dim_t ic = ic_start; ic < ic_end; ic += MC )
+            {
+                dim_t mc0 = bli_min( ( ic_end - ic ), MC );
+                c_use_ic = c_use_jc + ( rs_c * ic );
+
+                if ( mtag_a == REORDERED )
+                {
+                    // Extra space since packing does width in multiples of MR.
+                    const dim_t m_updated = ( ( m + MR - 1 ) / MR ) * MR;
+                    a_use = a + ( pc * m_updated ) + ( kc0 * ic );
+
+                    rs_a_use = 1;
+                    cs_a_use = MR;
+                    ps_a_use = MR * kc0;
+                }
+                else if ( should_pack_A == TRUE )
+                {
+                    // Extra space since packing does width in multiples of MR.
+                    const dim_t mc0_updated = ( ( mc0 + MR - 1 ) / MR ) * MR;
+                    mem_a_size_req = sizeof( float ) * mc0_updated * kc0;
+
+                    lpgemm_alloc_mem_panel
+                    (
+                      mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK,
+                      &mem_a, rntm
+                    );
+                    pack_a_buffer_f32f32f32of32 = ( float* )bli_mem_buffer( &mem_a );
+
+                    rs_a_use = 1;
+                    cs_a_use = MR;
+                    ps_a_use = MR * kc0;
+
+                    lpgemm_pack_a_f32f32f32of32
+                    (
+                      ( a + ( rs_a * ic ) + pc ),
+                      pack_a_buffer_f32f32f32of32,
+                      mc0, kc0,
+                      rs_a, cs_a, ps_a_use, MR,
+                      cntx
+                    );
+
+                    a_use = pack_a_buffer_f32f32f32of32;
+                }
+                else
+                {
+                    a_use = a + ( rs_a * ic ) + pc;
+                    ps_a_use = MR * rs_a;
+                }
+
+                // Embed the panel stride of A within the auxinfo_t object. The
+                // millikernel will query and use this to iterate through
+                // micropanels of A (if needed).
                 bli_auxinfo_set_ps_a( ps_a_use, &aux );
 
-				for ( dim_t jr = 0; jr < nc0; jr += NR )
-				{
-					dim_t nr0 = bli_min( ( nc0 - jr ), NR );
-
-					// Reordered/unpacked B, reordered/unpacked A.
-					bli_sgemmsup_rv_zen_asm_6x16m
-					(
-					  conjc,
-					  conjc,
-					  mc0, nr0, kc0,
-					  &alpha,
-					  ( float* )a_use, rs_a_use, cs_a_use,
-					  ( float* )( b_use + ( jr * ps_b_use ) ), rs_b_use, cs_b_use,
-					  &beta0,
-					  ( c_use_ic + jr ), rs_c, cs_c_use,
-					  &aux, cntx
-					);
-				}
-			}
-		}
-		if ( mtag_b == REORDERED )
-		{
-			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
-		}
-	}
-
-	// Release pack buffers.
-	if ( should_pack_A == TRUE )
-	{
-		if ( bli_mem_is_alloc( &mem_a ) )
-		{
-			bli_membrk_release( rntm, &mem_a );
-		}
-	}
+                for ( dim_t jr = 0; jr < nc0; jr += NR )
+                {
+                    dim_t nr0 = bli_min( ( nc0 - jr ), NR );
+
+                    // Post ops meta attributes.
+                    post_ops_attr.post_op_c_i = ic;
+                    post_ops_attr.post_op_c_j = ( jc + jr );
+                    post_ops_attr.rs_c_downscale = rs_c_downscale;
+
+                    // Reordered/unpacked B, reordered/unpacked A.
+                    ( ( lpgemm_rowvar_f32 )lcntx->kern_fun_ptr )
+                    (
+                      mc0, nr0, kc0,
+                      ( float* )a_use, rs_a_use, cs_a_use, ps_a_use,
+                      ( float* )( b_use + ( jr * ps_b_use ) ), rs_b_use, cs_b_use,
+                      ( c_use_ic + jr ), rs_c, cs_c_use,
+                      alpha , beta0,
+                      post_op_list, post_ops_attr
+                    );
+                }
+            }
+        }
+        if ( mtag_b == REORDERED )
+        {
+            adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+        }
+    }
+
+    // Release pack buffers.
+    if ( mtag_b == PACK )
+    {
+        // All threads in work group should wait till B matrix usage is
+        // completed by the participating threads.
+        bli_thrcomm_barrier
+        (
+          bli_thread_ocomm_id( &thread_jc ),
+          &thread->comm[bli_thread_work_id( &thread_jc)]
+        );
+
+        if ( bli_thread_am_ochief( &thread_ic ) )
+        {
+            if ( bli_mem_is_alloc( &mem_b ) )
+            {
+                bli_membrk_release( rntm, &mem_b );
+            }
+        }
+    }
+    if ( should_pack_A == TRUE )
+    {
+        if ( bli_mem_is_alloc( &mem_a ) )
+        {
+            bli_membrk_release( rntm, &mem_a );
+        }
+    }
 }
 
 void lpgemm_pack_a_f32f32f32of32
@@ -257,44 +421,99 @@ void lpgemm_pack_a_f32f32f32of32
        cntx_t*      cntx
      )
 {
-	float one_local  = *PASTEMAC(s,1);
-	float* restrict kappa_cast = &one_local;
-
-	// Set the schema to "column stored row panels" to indicate packing to conventional
-	// column-stored row panels.
-	pack_t schema = BLIS_PACKED_ROW_PANELS;
-	trans_t transc = BLIS_NO_TRANSPOSE;
-	conj_t conjc = bli_extract_conj( transc );
-
-	// Compute the total number of iterations we'll need.
-	dim_t m_iter = ( m + MR - 1 ) / MR;
-
-	inc_t cs_p = MR;
-
-	float* p_temp = reorder_buf_addr_a;
-	dim_t ir, it;
-	// Iterate over every logical micropanel in the source matrix.
-	for ( ir = 0, it = 0; it < m_iter; ir += MR, it += 1 )
-	{
-		dim_t panel_dim_i = bli_min( MR, m - ir );
-
-		const float* a_use = input_buf_addr_a + ( ir * rs_a );
-		float* p_use = p_temp;
-
-		PASTEMAC(s,packm_cxk)
-		(
-		  conjc,
-		  schema,
-		  panel_dim_i,
-		  MR,
-		  k,
-		  k,
-		  kappa_cast,
-		  ( float* )a_use, rs_a, cs_a,
-		  p_use, cs_p,
-		  cntx
-		);
-
-		p_temp += ps_p;
-	}
+    float one_local  = *PASTEMAC(s,1);
+    float* restrict kappa_cast = &one_local;
+
+    // Set the schema to "column stored row panels" to indicate packing to conventional
+    // column-stored row panels.
+    pack_t schema = BLIS_PACKED_ROW_PANELS;
+    trans_t transc = BLIS_NO_TRANSPOSE;
+    conj_t conjc = bli_extract_conj( transc );
+    // Compute the total number of iterations we'll need.
+    dim_t m_iter = ( m + MR - 1 ) / MR;
+
+    inc_t cs_p = MR;
+
+    float* p_temp = reorder_buf_addr_a;
+
+    dim_t ir, it;
+    // Iterate over every logical micropanel in the source matrix.
+    for ( ir = 0, it = 0; it < m_iter; ir += MR, it += 1 )
+    {
+        dim_t panel_dim_i = bli_min( MR, m - ir );
+
+        const float* a_use = input_buf_addr_a + ( ir * rs_a );
+        float* p_use = p_temp;
+
+        PASTEMAC(s,packm_cxk)
+        (
+          conjc,
+          schema,
+          panel_dim_i,
+          MR,
+          k,
+          k,
+          kappa_cast,
+          ( float* )a_use, rs_a, cs_a,
+          p_use, cs_p,
+          cntx
+        );
+
+        p_temp += ps_p;
+    }
+}
+
+void lpgemm_pack_b_f32f32f32of32
+     (
+       const float* input_buf_addr_b,
+       float*       reorder_buf_addr_b,
+       const dim_t  n,
+       const dim_t  k,
+       const dim_t  rs_b,
+       const dim_t  cs_b,
+       const dim_t  ps_p,
+       const dim_t  NR,
+       cntx_t*      cntx
+     )
+{
+    float one_local  = *PASTEMAC(s,1);
+    float* restrict kappa_cast = &one_local;
+
+    // Set the schema to "row stored column panels" to indicate packing to
+    // conventional row-stored column panels.
+    pack_t schema = BLIS_PACKED_COL_PANELS;
+    trans_t transc = BLIS_NO_TRANSPOSE;
+    conj_t conjc = bli_extract_conj( transc );
+    // Compute the total number of iterations we'll need.
+    dim_t n_iter = ( n + NR - 1 ) / NR;
+
+    inc_t rs_p = NR;
+
+    float* p_temp = reorder_buf_addr_b;
+
+    dim_t jr, it;
+    // Iterate over every logical micropanel in the source matrix.
+    for ( jr = 0, it = 0; it < n_iter; jr += NR, it += 1 )
+    {
+        dim_t panel_dim_i = bli_min( NR, n - jr );
+
+        const float* b_use = input_buf_addr_b + ( jr * cs_b );
+        float* p_use = p_temp;
+
+        PASTEMAC(s,packm_cxk)
+        (
+          conjc,
+          schema,
+          panel_dim_i,
+          NR,
+          k,
+          k,
+          kappa_cast,
+          ( float* )b_use, cs_b, rs_b,
+          p_use, rs_p,
+          cntx
+        );
+
+        p_temp += ps_p;
+    }
 }
diff --git a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
index 45328669de..62fc678faa 100644
--- a/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
+++ b/addon/aocl_gemm/frame/lpgemm_5loop_interface_apis.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -56,10 +56,11 @@ void lpgemm_rowvar_ ## LP_SFX \
        C_type*               c, \
        const dim_t           rs_c, \
        const dim_t           cs_c, \
-       C_type                alpha, \
-       C_type                beta, \
+       const C_type          alpha, \
+       const C_type          beta, \
        rntm_t*               rntm, \
        lpgemm_thrinfo_t*     thread, \
+       lpgemm_cntx_t*        lcntx, \
        lpgemm_post_op*       post_op_list, \
        bool                  c_downscale \
      ) \
@@ -68,4 +69,6 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32);
 LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16);
 LPGEMM_5LOOP(float,float,float,f32f32f32of32);
 LPGEMM_5LOOP(bfloat16,bfloat16,float,bf16bf16f32of32);
+LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32);
+LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16);
 #endif // LPGEMM_5LOOP_INTF_H
diff --git a/addon/aocl_gemm/frame/lpgemm_config.c b/addon/aocl_gemm/frame/lpgemm_config.c
deleted file mode 100644
index 901ec087d2..0000000000
--- a/addon/aocl_gemm/frame/lpgemm_config.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-#include "lpgemm_config.h"
-
-lpgemm_cntx_t global_cntx_t_list[4]; //Only one op type supported now.
-
-BLIS_INLINE void lpgemm_set_block_sizes_global_cntx
-     (
-       AOCL_OPERATION_TYPE op_type,
-       dim_t MC,
-       dim_t NC,
-       dim_t KC,
-       dim_t NR,
-       dim_t MR
-     )
-{
-	global_cntx_t_list[op_type].blksz.MC = MC;
-	global_cntx_t_list[op_type].blksz.NC = NC;
-	global_cntx_t_list[op_type].blksz.KC = KC;
-	global_cntx_t_list[op_type].blksz.NR = NR;
-	global_cntx_t_list[op_type].blksz.MR = MR;
-}
-
-// Sets default block sizes for lpgemm. Currently only u8s8s32 supported.
-// Thread safety is not considered now since the block sizes are not expected
-// to be configurable from application.
-void aocl_lpgemm_init_global_cntx()
-{
-    lpgemm_set_block_sizes_global_cntx( U8S8S32OS32, 144, 1024, 2048, 64, 6 );
-    lpgemm_set_block_sizes_global_cntx( U8S8S16OS16, 144, 1024, 1024, 32, 6 );
-    lpgemm_set_block_sizes_global_cntx( BF16BF16F32OF32, 144, 1024, 2048, 64, 6 );
-}
-
-dim_t lpgemm_get_block_size_MC_global_cntx( AOCL_OPERATION_TYPE op_type )
-{
-	return global_cntx_t_list[op_type].blksz.MC;
-}
-
-dim_t lpgemm_get_block_size_NC_global_cntx( AOCL_OPERATION_TYPE op_type )
-{
-	return global_cntx_t_list[op_type].blksz.NC;
-}
-
-dim_t lpgemm_get_block_size_KC_global_cntx( AOCL_OPERATION_TYPE op_type )
-{
-	return global_cntx_t_list[op_type].blksz.KC;
-}
-
-dim_t lpgemm_get_block_size_NR_global_cntx( AOCL_OPERATION_TYPE op_type )
-{
-	return global_cntx_t_list[op_type].blksz.NR;
-}
-
-dim_t lpgemm_get_block_size_MR_global_cntx( AOCL_OPERATION_TYPE op_type )
-{
-	return global_cntx_t_list[op_type].blksz.MR;
-}
diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.c b/addon/aocl_gemm/frame/lpgemm_post_ops.c
index 63fb25765f..fffe14c0f8 100644
--- a/addon/aocl_gemm/frame/lpgemm_post_ops.c
+++ b/addon/aocl_gemm/frame/lpgemm_post_ops.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -83,6 +83,7 @@ void lpgemm_translate_to_post_ops_list
 		return; //Error, seq length exceeds max post ops permitted.
 	}
 
+	dim_t e_i = 0; //Multiple eltwise supported.
 	for ( dim_t i = 0; i < post_op_unparsed->seq_length; ++i )
 	{
 		// Dispatcher code
@@ -103,7 +104,7 @@ void lpgemm_translate_to_post_ops_list
 					{
 						LPGEMM_POST_OP_CODE tmp_code = POST_OPS_DISABLE;
 						// Eltwise algo dispatcher.
-						switch ( post_op_unparsed->eltwise.algo.algo_type )
+						switch ( ( post_op_unparsed->eltwise + e_i )->algo.algo_type )
 						{
 							case RELU:
 									tmp_code = POST_OPS_RELU;
@@ -111,6 +112,15 @@ void lpgemm_translate_to_post_ops_list
 							case PRELU:
 									tmp_code = POST_OPS_RELU_SCALE;
 									break;
+							case GELU_TANH:
+									tmp_code = POST_OPS_GELU_TANH;
+									break;
+							case GELU_ERF:
+									tmp_code = POST_OPS_GELU_ERF;
+									break;
+							case CLIP:
+									tmp_code = POST_OPS_CLIP;
+									break;
 							default:
 									break;
 						}
@@ -118,11 +128,12 @@ void lpgemm_translate_to_post_ops_list
 						(
 						  ( post_op_list + i ), tmp_code,
 						  NULL,
-						  post_op_unparsed->eltwise.algo.alpha,
-						  post_op_unparsed->eltwise.algo.beta,
-						  post_op_unparsed->eltwise.scale_factor,
-						  post_op_unparsed->eltwise.is_power_of_2
+						  ( post_op_unparsed->eltwise + e_i )->algo.alpha,
+						  ( post_op_unparsed->eltwise + e_i )->algo.beta,
+						  ( post_op_unparsed->eltwise + e_i )->scale_factor,
+						  ( post_op_unparsed->eltwise + e_i )->is_power_of_2
 						);
+						e_i += 1;
 					}
 					break;
 			case BIAS:
diff --git a/addon/aocl_gemm/frame/lpgemm_post_ops.h b/addon/aocl_gemm/frame/lpgemm_post_ops.h
index 3932daf602..7509e57a39 100644
--- a/addon/aocl_gemm/frame/lpgemm_post_ops.h
+++ b/addon/aocl_gemm/frame/lpgemm_post_ops.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,8 +41,11 @@ typedef enum
 	POST_OPS_BIAS = 1,
 	POST_OPS_RELU = 2,
 	POST_OPS_RELU_SCALE = 3,
-	POST_OPS_DOWNSCALE = 4,
-	POST_OPS_SUM = 5,
+	POST_OPS_GELU_TANH = 4,
+	POST_OPS_GELU_ERF = 5,
+	POST_OPS_CLIP = 6,
+	POST_OPS_DOWNSCALE = 7,
+	POST_OPS_SUM = 8,
 } LPGEMM_POST_OP_CODE;
 
 // Used as an internal structure.
@@ -57,6 +60,21 @@ typedef struct lpgemm_post_op_t
 	struct lpgemm_post_op_t* next;
 } lpgemm_post_op;
 
+// Used as an internal structure.
+typedef struct lpgemm_post_op_attr_t
+{
+	dim_t post_op_c_i;
+	dim_t post_op_c_j;
+	dim_t rs_c_downscale;
+	dim_t cs_c_downscale;
+	void* buf_downscale;
+	bool is_first_k;
+	bool is_last_k;
+	dim_t b_sum_offset;
+	int32_t* b_col_sum_vec;
+	int16_t* b_col_sum_vec_s16;
+} lpgemm_post_op_attr;
+
 void lpgemm_translate_to_post_ops_list
      (
        aocl_post_op*   post_op_unparsed,
@@ -66,7 +84,7 @@ void lpgemm_translate_to_post_ops_list
      );
 
 #define POST_OP_LABEL_LASTK_SAFE_JUMP \
-		if ( ( is_last_k == TRUE ) && ( post_ops_list_temp != NULL ) ) \
+		if ( ( post_ops_attr.is_last_k == TRUE ) && ( post_ops_list_temp != NULL ) ) \
 		{ \
 			goto *post_ops_labels[post_ops_list_temp->op_code]; \
 		} \
diff --git a/addon/aocl_gemm/frame/lpgemm_types.h b/addon/aocl_gemm/frame/lpgemm_types.h
index aebd485d0d..b700c03878 100644
--- a/addon/aocl_gemm/frame/lpgemm_types.h
+++ b/addon/aocl_gemm/frame/lpgemm_types.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -47,9 +47,20 @@ typedef enum
 {
 	U8S8S16OS16 = 0, // uint8_t - A, int8_t - B, int16_t - C
 	U8S8S32OS32 = 1, // uint8_t - A, int8_t - B, int32_t - C
-	F16F16F16OF16 = 2, // float16 - A, float16 - B, float16 - C
-	BF16BF16F32OF32 = 3 // bf16 - A, bf16 - B, float - C
+	F32F32F32OF32 = 2, // float - A, float - B, float - C
+	BF16BF16F32OF32 = 3, // bf16 - A, bf16 - B, float - C
+	S8S8S32OS32 = 4, // int8_t - A, int8_t - B, int32_t - C
+	S8S8S16OS16 = 5  // int8_t - A, int8_t - B, int16_t - C
 } AOCL_OPERATION_TYPE;
+#define AOCL_OPERATION_TYPE_LEN 6
+
+typedef enum // Operation selectors with contiguous values starting at 0.
+{
+	F32_GELU_TANH = 0, // NOTE(review): presumably GELU (tanh form) on float data - confirm.
+	F32_GELU_ERF = 1, // NOTE(review): presumably GELU (erf form) on float data - confirm.
+	F32_SOFTMAX = 2 // NOTE(review): presumably softmax on float data - confirm.
+} AOCL_UTIL_OPERATION_TYPE;
+#define AOCL_UTIL_OPERATION_TYPE_LEN 3 // Number of enumerators in AOCL_UTIL_OPERATION_TYPE.
 
 typedef enum
 {
@@ -100,11 +111,28 @@ typedef struct
 	dim_t MR;
 } lpgemm_block_size_t;
 
+typedef struct // NOTE(review): presumably the strides of the packed A and B buffers - confirm against the context setup.
+{
+	dim_t packa_rs; // Packed A, row stride (assumed from the name).
+	dim_t packa_cs; // Packed A, column stride (assumed from the name).
+	dim_t packb_rs; // Packed B, row stride (assumed from the name).
+	dim_t packb_cs; // Packed B, column stride (assumed from the name).
+} lpgemm_pack_strides_t;
+
 typedef struct
 {
 	lpgemm_block_size_t blksz;
+	void_fp kern_fun_ptr; // Type-erased micro-kernel; callers cast it to the kernel signature (e.g. lpgemm_rowvar_s16_s8).
+	void_fp packa_fun_ptr; // Type-erased A packing routine (cast to packa_s32_s8 in reordera_mr6_s8s8s32o32).
+	void_fp packb_fun_ptr; // Type-erased B packing routine (cast to packb_s16_s8 / packb_s32_s8 by callers).
+	lpgemm_pack_strides_t pack_s; // Pack strides stored alongside the pack routines.
 } lpgemm_cntx_t;
 
+typedef struct
+{
+	void_fp kern_fun_ptr; // NOTE(review): presumably the type-erased utility kernel, mirroring lpgemm_cntx_t - confirm.
+} lpgemm_util_cntx_t;
+
 typedef struct
 {
 	dim_t n_threads;
diff --git a/addon/aocl_gemm/frame/s8s8s16/lpgemm_reorder_s8s16.c b/addon/aocl_gemm/frame/s8s8s16/lpgemm_reorder_s8s16.c
new file mode 100644
index 0000000000..474014d5df
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s16/lpgemm_reorder_s8s16.c
@@ -0,0 +1,187 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+#include "lpgemm_utils_s8.h"
+#include "lpgemm_reorder_s8s16.h"
+#include "lpgemm_packb_s8s16.h"
+#include "lpgemm_config.h"
+
+// Reorders the int8 B matrix into the packed layout consumed by the
+// s8s8s16o16 kernels, with an int16 column-sum area placed right after
+// the reordered data.
+//
+// b         - source object; rs, width (n), length (k) and
+//             storage.aligned_buffer are read.
+// b_reorder - destination object; storage.aligned_buffer is written and
+//             rs, cs and mtag (set to REORDERED) are updated on exit.
+// rntm      - supplies the thread count used to split the jc loop.
+// lcntx     - supplies the NC/KC/NR block sizes and packb_fun_ptr.
+void aocl_reorderb_nr32_s8s8s16o16
+     (
+       lpgemm_obj_t*  b,
+       lpgemm_obj_t*  b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
+     )
+{
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
+
+	// Extracting the matrix properties from the lpgemm object
+	dim_t rs_b = b->rs;
+	dim_t n = b->width;
+	dim_t k = b->length;
+
+	lpgemm_mod_block_size_s16(0, n, k, NULL, &NC, &KC);
+
+	dim_t rs_b_reorder = 0; // Stays 0 if the pack function never runs (n == 0 or k == 0).
+	dim_t cs_b_reorder = 0; // Same: avoids storing an indeterminate stride in b_reorder.
+
+	dim_t k_updated = k;
+
+	// Making multiple of 2 to suit k in vpmaddubsw
+	k_updated += (k_updated & 0x1);
+
+	dim_t n_updated = make_multiple_of_n( n, 16 );
+
+	dim_t n_threads = bli_rntm_num_threads( rntm );
+	n_threads = ( n_threads > 0 ) ? n_threads : 1;
+
+	// Column-sum area of B: n_updated int16 values stored after the n_updated x k_updated reordered bytes.
+	int16_t* pack_b_column_sum = ( int16_t* )( ( ( int8_t* )b_reorder->storage.aligned_buffer ) + ( sizeof( int8_t ) * n_updated * k_updated ) );
+	for ( dim_t idx = 0; idx < n_updated; idx++ )
+	{
+		*( pack_b_column_sum + idx ) =  0;
+	}
+
+#ifdef BLIS_ENABLE_OPENMP
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( n_threads, &thread_jc );
+		bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
+#else
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( 1, &thread_jc );
+		bli_thrinfo_set_work_id( 0, &thread_jc );
+#endif
+		// Compute the JC loop thread range for the current thread.
+		dim_t jc_start, jc_end;
+		bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+
+		for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+		{
+			dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+
+			dim_t jc_cur_loop = jc;
+			dim_t jc_cur_loop_rem = 0;
+			dim_t n_sub_updated;
+
+			get_B_panel_reordered_start_offset_width
+			(
+			  jc, n, NC, 16,
+			  &jc_cur_loop, &jc_cur_loop_rem,
+			  &nc0, &n_sub_updated
+			);
+
+			for ( dim_t pc = 0; pc < k; pc += KC )
+			{
+				dim_t kc0 = bli_min( ( k - pc ), KC );
+
+				// kc0 needs to be a multiple of 2 so that it can be used with
+				// vpmaddubsw instruction. Padding is added in cases this
+				// condition is not satisfied, and therefore the kc0 offsets
+				// used for packed/reordered buffers needs to be updated.
+				dim_t kc0_updated = make_multiple_of_n( kc0, 2 );
+
+				// The offsets are calculated in such a way that it resembles
+				// the reorder buffer traversal in single threaded reordering.
+				// The panel boundaries (KCxNC) remain as it is accessed in
+				// single thread, and as a consequence a thread with jc_start
+				// inside the panel cannot consider NC range for reorder. It
+				// has to work with NC' < NC, and the offset is calculated using
+				// prev NC panels spanning k dim + cur NC panel spanning pc loop
+				// cur iteration + (NC - NC') spanning current kc0 (<= KC).
+				//
+				//Eg: Consider the following reordered buffer diagram:
+				//          t1              t2
+				//          |               |
+				//          |           |..NC..|
+				//          |           |      |
+				//          |.NC. |.NC. |NC'|NC"
+				//     pc=0-+-----+-----+---+--+
+				//        KC|     |     |   |  |
+				//          |  1  |  3  |   5  |
+				//    pc=KC-+-----+-----+---st-+
+				//        KC|     |     |   |  |
+				//          |  2  |  4  | 6 | 7|
+				// pc=k=2KC-+-----+-----+---+--+
+				//          |jc=0 |jc=NC|jc=2NC|
+				//
+				// The numbers 1,2..6,7 denotes the order in which reordered
+				// KCxNC blocks are stored in memory, ie: block 1 followed by 2
+				// followed by 3, etc. Given two threads t1 and t2, and t2 needs
+				// to access point st in the reorder buffer to write the data:
+				// The offset calculation logic will be:
+				// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
+				// n_sub_updated = NC, k = 2KC, kc0_updated = KC
+				//
+				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
+				//    + ( n_sub_updated * pc ) <traverse block 5>
+				//    + ( NC' * kc0_updated)   <traverse block 6>
+				( ( packb_s16_s8 )lcntx->packb_fun_ptr )
+				(
+				  ( ( ( int8_t* )b_reorder->storage.aligned_buffer ) +
+					( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
+					( jc_cur_loop_rem * kc0_updated ) ),
+				  pack_b_column_sum + jc,
+				  ( ( ( int8_t* )b->storage.aligned_buffer ) +
+					( rs_b * pc ) + jc ),
+				  rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
+				);
+			}
+
+			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+		}
+	}
+
+	// Changing the packed matrix properties in the packed matrix object
+	b_reorder->rs = rs_b_reorder;
+	b_reorder->cs = cs_b_reorder;
+	b_reorder->mtag = REORDERED;
+}
diff --git a/addon/aocl_gemm/frame/s8s8s16/lpgemm_reorder_s8s16.h b/addon/aocl_gemm/frame/s8s8s16/lpgemm_reorder_s8s16.h
new file mode 100644
index 0000000000..8a87474ad4
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s16/lpgemm_reorder_s8s16.h
@@ -0,0 +1,47 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef LPGEMM_REORDER_S8S16_H
+#define LPGEMM_REORDER_S8S16_H
+
+#include "lpgemm_types.h"
+
+void aocl_reorderb_nr32_s8s8s16o16
+     (
+       lpgemm_obj_t*  b,         // Source B object; its rs, width, length and aligned buffer are read.
+       lpgemm_obj_t*  b_reorder, // Destination; aligned buffer is written, rs/cs are set and mtag becomes REORDERED.
+       rntm_t*        rntm,      // Supplies the number of threads used to split the work.
+       lpgemm_cntx_t* lcntx      // Supplies NC/KC/NR block sizes and the packb function pointer.
+     );
+
+#endif // LPGEMM_REORDER_S8S16_H
diff --git a/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c
new file mode 100644
index 0000000000..86ee194eb5
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s16/lpgemm_s8s8s16.c
@@ -0,0 +1,402 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_packb_s8s16.h"
+#include "lpgemm_kernels.h"
+#include "lpgemm_utils_s8.h"
+#include "lpgemm_config.h"
+#include "lpgemm_thrinfo_utils.h"
+
+// Kernel function prototypes (argument roles below follow the call site in the 5-loop).
+typedef void (*lpgemm_rowvar_s16_s8)
+     (
+       const dim_t, // mc0: rows handled by this call
+       const dim_t, // nr0: columns handled by this call
+       const dim_t, // kc0: depth of the current KC block
+       const int8_t*, // a_use: start of the A block
+       const dim_t, // rs_a_use
+       const dim_t, // cs_a_use
+       const dim_t, // a_block_stride (rs_a at the call site)
+       const int8_t*, // b_use + jr * kc0_updated: packed/reordered B panel
+       const dim_t, // rs_b_use
+       const dim_t, // cs_b_use
+       int16_t*, // c_use_ic + jr: output/accumulation block
+       const dim_t, // rs_c_use
+       const dim_t, // column stride of C (1 at the call site)
+       const int16_t, // alpha
+       const int16_t, // beta0: beta on the first KC block, 1 afterwards
+       lpgemm_post_op*, // post_op_list
+       lpgemm_post_op_attr // post_ops_attr, passed by value
+     );
+
+// B should always be packed (mtag_b == PACK or REORDERED); UNPACKED returns early without computing.
+LPGEMM_5LOOP(int8_t,int8_t,int16_t,s8s8s16o16)
+{
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t MC = lcntx->blksz.MC;
+	const dim_t NR = lcntx->blksz.NR;
+	const dim_t MR = lcntx->blksz.MR;
+
+	lpgemm_mod_block_size_s16(m, n, k, &MC, &NC, &KC);
+
+	if (mtag_b == UNPACKED)
+	{
+		// Error: can only work with packed B now.
+		return;
+	}
+
+	const int8_t *b_use;
+	const int8_t *a_use;
+	dim_t rs_a_use = rs_a;
+	dim_t cs_a_use = cs_a;
+
+	dim_t rs_b_use = rs_b;
+	dim_t cs_b_use = cs_b;
+
+	int16_t *c_use_jc = NULL;
+	int16_t *c_use_ic = NULL;
+	dim_t rs_c_use = rs_c;
+	dim_t rs_c_downscale = rs_c; // Keeps the original rs_c; rs_c_use is replaced by nc0 when downscaling.
+
+	// Pack buffer for B.
+	int8_t *pack_b_buffer_s8s8s16o16;
+	mem_t mem_b = BLIS_MEM_INITIALIZER;
+	dim_t packb_min_NR = 16; // Packed B widths are rounded up to a multiple of this value.
+	siz_t mem_b_size_req = 0;
+
+	// Temporary buffer for C accumulation when downscaling is required.
+	int16_t* temp_scal_c_buffer_s8s8s16o16;
+	mem_t mem_scale_c = BLIS_MEM_INITIALIZER;
+	siz_t mem_scale_c_size_req = 0;
+
+	// Making multiple of 2 to suit k in vpmaddubsw
+	dim_t k_updated = make_multiple_of_n( k, 2 );
+
+    // Making multiple of 16
+    dim_t n_updated = make_multiple_of_n( n, 16 );
+
+	// To decide whether to apply post ops or not.
+	bool is_last_k = FALSE;
+
+	// To decide whether to use original s8 C or temp buffer for beta scale.
+	bool is_first_k = FALSE;
+
+	lpgemm_post_op_attr post_ops_attr;
+	if ( c_downscale == TRUE )
+	{
+		post_ops_attr.buf_downscale = c;
+	}
+	else
+	{
+		post_ops_attr.buf_downscale = NULL;
+	}
+
+	// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
+	thrinfo_t thread_jc;
+	thrinfo_t thread_ic;
+
+	lpgemm_gen_thrinfo(thread, &thread_jc, &thread_ic);
+
+	// Compute the JC, IC loop thread range for the current thread.
+	dim_t jc_start, jc_end;
+	bli_thread_range_sub(&thread_jc, n, NR, FALSE, &jc_start, &jc_end);
+
+	dim_t ic_start, ic_end;
+	bli_thread_range_sub(&thread_ic, m, MR, FALSE, &ic_start, &ic_end);
+
+	for (dim_t jc = jc_start; jc < jc_end; jc += NC)
+	{
+		dim_t nc0 = bli_min((jc_end - jc), NC);
+
+		dim_t jc_cur_loop = jc;
+		dim_t jc_cur_loop_rem = 0;
+		dim_t n_sub_updated = 0;
+
+		if (mtag_b == REORDERED)
+		{
+			get_B_panel_reordered_start_offset_width
+			(
+				jc, n, NC, packb_min_NR,
+				&jc_cur_loop, &jc_cur_loop_rem,
+				&nc0, &n_sub_updated
+			);
+		}
+
+		if ( c_downscale == FALSE )
+		{
+			c_use_jc = c + jc;
+		}
+		// Temp accumulation buffer for C allocation.
+		else if ( c_downscale == TRUE )
+		{
+			// Buffer memory is only required if output needs to be
+			// persisted across iterations of the pc/KC loop.
+			// It was observed that the locks used while checking out
+			// a buffer from memory pool had an impact on performance
+			// and is better to not checkout if k <= KC.
+			if ( k > KC )
+			{
+				mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start );
+
+				lpgemm_alloc_mem_panel
+				(
+				  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
+				  &mem_scale_c, rntm
+				);
+
+				temp_scal_c_buffer_s8s8s16o16 = bli_mem_buffer( &mem_scale_c );
+
+				c_use_jc = ( int16_t* )temp_scal_c_buffer_s8s8s16o16;
+			}
+
+			// The temp c buffer stride is modified as opposed to original C matrix.
+			rs_c_use = nc0;
+		}
+
+		int16_t* pack_b_column_sum = NULL;
+
+		for (dim_t pc = 0; pc < k; pc += KC)
+		{
+			int16_t beta0 = (pc == 0) ? beta : 1; // Caller's beta only on the first KC block; 1 afterwards.
+			dim_t kc0 = bli_min((k - pc), KC);
+
+			// No parallelization in k dim, k always starts at 0.
+			is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_first_k = is_first_k;
+
+			is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_last_k = is_last_k;
+
+			// kc0 needs to be a multiple of 2 so that it can be
+			// used with vpmaddubsw instruction. Padding is added in
+			// cases this condition is not satisfied, and therefore
+			// the kc0 offsets used for packed/reordered buffers
+			// needs to be updated.
+			dim_t kc0_updated = make_multiple_of_n(kc0, 2);
+
+			if (mtag_b == PACK)
+			{
+				// Pack B chunks are based on jc work id.
+				dim_t jc_work_id = bli_thread_work_id(&thread_jc);
+
+				// Using child thrinfo (thread_ic) tid to decide chief thread
+				// per B matrix chunk (jc work id group)
+
+				// nc0 needs to be a multiple of 16 since this gives maximum
+				// vectorization. Packing B always results in buffers with width
+				// which is a multiple of 16. Subsequently the nc0 offsets used
+				// for packed/reordered buffers needs to be updated.
+				dim_t nc0_updated = make_multiple_of_n(nc0, packb_min_NR);
+
+				if (bli_thread_am_ochief(&thread_ic))
+				{
+					mem_b_size_req = sizeof(int8_t) * nc0_updated * kc0_updated + ( nc0_updated * sizeof( int16_t ) );
+
+					lpgemm_alloc_mem_panel(
+						mem_b_size_req, BLIS_BUFFER_FOR_B_PANEL,
+						&mem_b, rntm);
+
+					thread->comm[jc_work_id].sent_object =
+						bli_mem_buffer(&mem_b);
+				}
+
+				// All threads in work group should wait till chief thread has
+				// finished allocating the packing buffers.
+				bli_thrcomm_barrier
+				(
+				  bli_thread_ocomm_id(&thread_ic),
+				  &thread->comm[jc_work_id]
+				);
+
+				pack_b_buffer_s8s8s16o16 =
+					(int8_t *)thread->comm[jc_work_id].sent_object;
+
+				// Compute the B panel per thread loop range for parallel
+				// packing using ic_ways number of threads. Since at most only
+				// ic_ways threads can be used, the thread_ic attributes are
+				// used to split the loop range.
+				dim_t jc_packb_start, jc_packb_end;
+				bli_thread_range_sub
+				(
+					&thread_ic, nc0, NR, FALSE,
+					&jc_packb_start, &jc_packb_end
+				);
+
+				if ( pc == 0) // Column-sum area follows the nc0_updated x kc0_updated packed bytes; address fixed on the first KC block.
+				{
+					pack_b_column_sum = ( int16_t* )( pack_b_buffer_s8s8s16o16 + ( sizeof( int8_t ) * nc0_updated * kc0_updated ) );
+				}
+
+				// Ensure thread ranges are valid, especially cases where no:
+				// of threads available for parallelization are greater than
+				// no: of B panel NR chunks.
+				if ((jc_packb_end > jc_packb_start) &&
+					(jc_packb_start < (jc + nc0)))
+				{
+					if ( pc == 0 )
+					{
+						for (int idx = jc_packb_start; idx < jc_packb_end; idx++ ) // NOTE(review): int index against dim_t bounds.
+						{
+							*( pack_b_column_sum + idx ) =  0;
+						}
+					}
+
+					( ( packb_s16_s8 )lcntx->packb_fun_ptr )
+					(
+					  pack_b_buffer_s8s8s16o16 +
+					  (jc_packb_start * kc0_updated),
+					  pack_b_column_sum + ( cs_b * jc_packb_start ), // NOTE(review): zeroing above indexes without cs_b; the two agree only when cs_b == 1.
+					  (b + (rs_b * pc) + (cs_b * jc) +
+					  (cs_b * jc_packb_start)),
+					  rs_b,
+					  (jc_packb_end - jc_packb_start), kc0,
+					  &rs_b_use, &cs_b_use
+					);
+				}
+				else
+				{
+					lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
+				}
+
+				// All threads in work group should wait till B matrix packing
+				// is completed by the participating threads.
+				bli_thrcomm_barrier
+				(
+					bli_thread_ocomm_id(&thread_ic),
+					&thread->comm[jc_work_id]
+				);
+
+				b_use = pack_b_buffer_s8s8s16o16;
+				post_ops_attr.b_col_sum_vec_s16 = pack_b_column_sum;
+			}
+			else if (mtag_b == REORDERED)
+			{
+				// In multi-threaded scenarios, an extra offset into a given
+				// packed B panel is required, since the jc loop split can
+				// result in per thread start offset inside the panel, instead
+				// of panel boundaries.
+				b_use = b + (jc_cur_loop * k_updated) +
+						(n_sub_updated * pc) +
+						(jc_cur_loop_rem * kc0_updated);
+
+				lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
+
+				post_ops_attr.b_col_sum_vec_s16 = ( ( int16_t* )( b + ( k_updated * n_updated ) ) ) + jc; // Column sums sit after the k_updated x n_updated reordered bytes.
+			}
+			else
+			{
+				// Unpacked B not supported.
+				return; // NOTE(review): this early return skips the buffer release code at the end of the function.
+			}
+
+			for (dim_t ic = ic_start; ic < ic_end; ic += MC)
+			{
+				dim_t mc0 = bli_min((ic_end - ic), MC);
+
+				// Only per thread C matrix is stored in temp buffer, so both
+				// per thread jc and ic start should be normalized to zero.
+				if ( c_downscale == TRUE )
+				{
+					c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
+				}
+				else
+				{
+					c_use_ic = c_use_jc + ( rs_c_use * ic );
+				}
+
+				a_use = a + (rs_a * ic) + (cs_a * pc);
+				cs_a_use = 1;
+
+				dim_t a_block_stride = rs_a;
+
+				post_ops_attr.b_sum_offset = 0;
+
+				for (dim_t jr = 0; jr < nc0; jr += NR)
+				{
+					dim_t nr0 = bli_min((nc0 - jr), NR);
+
+					// Post ops meta attributes.
+					post_ops_attr.post_op_c_i = ic;
+					post_ops_attr.post_op_c_j = ( jc + jr );
+					post_ops_attr.rs_c_downscale = rs_c_downscale;
+
+					// Micro-kernel call; b_use points at the packed or reordered B panel.
+					( ( lpgemm_rowvar_s16_s8 )lcntx->kern_fun_ptr )
+					(
+						mc0, nr0, kc0,
+						a_use, rs_a_use, cs_a_use, a_block_stride,
+						(b_use + (jr * kc0_updated)), rs_b_use, cs_b_use,
+						(c_use_ic + jr), rs_c_use, 1,
+						alpha, beta0,
+					  	post_op_list, post_ops_attr
+					);
+					post_ops_attr.b_sum_offset += NR;
+				}
+			}
+		}
+
+		if (mtag_b == REORDERED)
+		{
+			adjust_B_panel_reordered_jc(&jc, jc_cur_loop);
+		}
+	}
+
+	// Release pack buffers.
+	if (mtag_b == PACK)
+	{
+		// All threads in work group should wait till B matrix usage is
+		// completed by the participating threads.
+		bli_thrcomm_barrier(
+			bli_thread_ocomm_id(&thread_jc),
+			&thread->comm[bli_thread_work_id(&thread_jc)]);
+
+		if (bli_thread_am_ochief(&thread_ic))
+		{
+			if (bli_mem_is_alloc(&mem_b))
+			{
+				bli_membrk_release(rntm, &mem_b);
+			}
+		}
+	}
+	if ( c_downscale == TRUE )
+	{
+		if ( bli_mem_is_alloc( &mem_scale_c ) )
+		{
+			bli_membrk_release( rntm, &mem_scale_c );
+		}
+	}
+}
diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_reorder_s8.c b/addon/aocl_gemm/frame/s8s8s32/lpgemm_reorder_s8.c
new file mode 100644
index 0000000000..ece6c48762
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_reorder_s8.c
@@ -0,0 +1,220 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "lpgemm_utils_s8.h"
+#include "lpgemm_reorder_s8.h"
+#include "lpgemm_packa_s8.h"
+#include "lpgemm_packb_s8.h"
+#include "lpgemm_config.h"
+
+void reorderb_nr64_s8s8s32o32
+     (
+       lpgemm_obj_t*  b,         // Source B object; rs, width (n), length (k) and aligned buffer are read.
+       lpgemm_obj_t*  b_reorder, // Destination; aligned buffer is written, rs/cs/mtag are updated on exit.
+       rntm_t*        rntm,      // Supplies the thread count used to split the jc loop.
+       lpgemm_cntx_t* lcntx      // Supplies NC/KC/NR block sizes and packb_fun_ptr.
+     )
+{
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
+
+	dim_t rs_b = b->rs;
+	dim_t rs_b_reorder = 0; // Stays 0 if the pack function never runs (n == 0 or k == 0).
+	dim_t cs_b_reorder = 0; // Same: avoids storing an indeterminate stride in b_reorder.
+
+	dim_t n = b->width;
+	dim_t k = b->length;
+
+	// k needs to be a multiple of 4 so that it can be used with vpdpbusd
+	// instruction. Padding is added in cases this condition is not
+	// satisfied, and therefore the k offset used for packed/reordered
+	// buffer needs to be updated.
+	dim_t k_updated = make_multiple_of_n( k, 4 );
+	dim_t n_updated = make_multiple_of_n( n, 16 );
+
+	dim_t n_threads = bli_rntm_num_threads( rntm );
+	n_threads = ( n_threads > 0 ) ? n_threads : 1;
+
+	int32_t* pack_b_column_sum = ( int32_t* )( ( ( int8_t* )b_reorder->storage.aligned_buffer ) + ( sizeof( int8_t ) * n_updated * k_updated ) ); // Byte-typed base keeps the offset arithmetic standard C.
+	for ( dim_t idx = 0; idx < n_updated; idx++ )
+	{
+		*( pack_b_column_sum + idx ) =  0;
+	}
+
+#ifdef BLIS_ENABLE_OPENMP
+	_Pragma( "omp parallel num_threads(n_threads)" )
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( n_threads, &thread_jc );
+		bli_thrinfo_set_work_id( omp_get_thread_num(), &thread_jc );
+#else
+	{
+		// Initialise a local thrinfo obj for work split across threads.
+		thrinfo_t thread_jc;
+		bli_thrinfo_set_n_way( 1, &thread_jc );
+		bli_thrinfo_set_work_id( 0, &thread_jc );
+#endif
+		// Compute the JC loop thread range for the current thread.
+		dim_t jc_start, jc_end;
+		bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+
+		for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+		{
+			dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+
+			dim_t jc_cur_loop = jc;
+			dim_t jc_cur_loop_rem = 0;
+			dim_t n_sub_updated;
+
+			get_B_panel_reordered_start_offset_width
+			(
+			  jc, n, NC, get_packb_s8s8s32o32_min_NR(),
+			  &jc_cur_loop, &jc_cur_loop_rem,
+			  &nc0, &n_sub_updated
+			);
+
+			for ( dim_t pc = 0; pc < k; pc += KC )
+			{
+				dim_t kc0 = bli_min( ( k - pc ), KC );
+
+				// kc0 needs to be a multiple of 4 so that it can be used with
+				// vpdpbusd instruction. Padding is added in cases this
+				// condition is not satisfied, and therefore the kc0 offsets
+				// used for packed/reordered buffers needs to be updated.
+				dim_t kc0_updated = make_multiple_of_n( kc0, 4 );
+
+				// The offsets are calculated in such a way that it resembles
+				// the reorder buffer traversal in single threaded reordering.
+				// The panel boundaries (KCxNC) remain as it is accessed in
+				// single thread, and as a consequence a thread with jc_start
+				// inside the panel cannot consider NC range for reorder. It
+				// has to work with NC' < NC, and the offset is calculated using
+				// prev NC panels spanning k dim + cur NC panel spanning pc loop
+				// cur iteration + (NC - NC') spanning current kc0 (<= KC).
+				//
+				//Eg: Consider the following reordered buffer diagram:
+				//          t1              t2
+				//          |               |
+				//          |           |..NC..|
+				//          |           |      |
+				//          |.NC. |.NC. |NC'|NC"
+				//     pc=0-+-----+-----+---+--+
+				//        KC|     |     |   |  |
+				//          |  1  |  3  |   5  |
+				//    pc=KC-+-----+-----+---st-+
+				//        KC|     |     |   |  |
+				//          |  2  |  4  | 6 | 7|
+				// pc=k=2KC-+-----+-----+---+--+
+				//          |jc=0 |jc=NC|jc=2NC|
+				//
+				// The numbers 1,2..6,7 denotes the order in which reordered
+				// KCxNC blocks are stored in memory, ie: block 1 followed by 2
+				// followed by 3, etc. Given two threads t1 and t2, and t2 needs
+				// to access point st in the reorder buffer to write the data:
+				// The offset calculation logic will be:
+				// jc_cur_loop = 2NC, jc_cur_loop_rem = NC', pc = KC,
+				// n_sub_updated = NC, k = 2KC, kc0_updated = KC
+				//
+				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
+				//    + ( n_sub_updated * pc ) <traverse block 5>
+				//    + ( NC' * kc0_updated)   <traverse block 6>
+				( ( packb_s32_s8 )lcntx->packb_fun_ptr )
+				(
+				  ( ( ( int8_t* )b_reorder->storage.aligned_buffer ) +
+					( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
+					( jc_cur_loop_rem * kc0_updated ) ),
+				  pack_b_column_sum + jc,
+				  ( ( ( int8_t* )b->storage.aligned_buffer ) +
+					( rs_b * pc ) + jc ),
+				  rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
+				);
+			}
+			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+		}
+	}
+
+	b_reorder->rs = rs_b_reorder;
+	b_reorder->cs = cs_b_reorder;
+	b_reorder->mtag = REORDERED;
+}
+
+void reordera_mr6_s8s8s32o32
+     (
+       lpgemm_obj_t*  a,         // Source A object; rs, width (k), length (m) and aligned buffer are read.
+       lpgemm_obj_t*  a_reorder, // Destination; aligned buffer is written, rs/cs/mtag are updated on exit.
+       rntm_t*        rntm,      // Not referenced in this routine; the loops run on the calling thread.
+       lpgemm_cntx_t* lcntx      // Supplies MC/KC block sizes and packa_fun_ptr.
+     )
+{
+	dim_t MC = lcntx->blksz.MC;
+	dim_t KC = lcntx->blksz.KC;
+
+	dim_t rs_a = a->rs;
+	dim_t rs_a_reorder = 0; // Stays 0 if the pack function never runs (m == 0 or k == 0).
+	dim_t cs_a_reorder = 0; // Same: avoids storing an indeterminate stride in a_reorder.
+
+	dim_t k = a->width;
+	dim_t m = a->length;
+
+	for ( dim_t pc = 0; pc < k; pc += KC )
+	{
+		dim_t kc0 = bli_min( ( k - pc ), KC );
+
+		// kc0 needs to be a multiple of 4 so that it can be used with
+		// vpdpbusd instruction. Padding is added in cases this
+		// condition is not satisfied, and therefore the kc0 offsets
+		// used for packed/reordered buffers needs to be updated.
+		dim_t kc0_updated = make_multiple_of_n( kc0, 4 );
+
+		for ( dim_t ic = 0; ic < m; ic += MC )
+		{
+			dim_t mc0 = bli_min( ( m - ic ), MC );
+
+			( ( packa_s32_s8 )lcntx->packa_fun_ptr )
+			(
+			  ( ( ( int8_t* )a_reorder->storage.aligned_buffer ) + ( pc * m ) +
+				( ic * kc0_updated ) ),
+			  ( ( ( int8_t* )a->storage.aligned_buffer ) + ( rs_a * ic ) + pc ),
+			  rs_a, mc0, kc0, &rs_a_reorder, &cs_a_reorder
+			);
+		}
+	}
+
+	a_reorder->rs = rs_a_reorder;
+	a_reorder->cs = cs_a_reorder;
+	a_reorder->mtag = REORDERED;
+}
diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_reorder_s8.h b/addon/aocl_gemm/frame/s8s8s32/lpgemm_reorder_s8.h
new file mode 100644
index 0000000000..62bbfdeb64
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_reorder_s8.h
@@ -0,0 +1,57 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_REORDER_H_S8
+#define LPGEMM_REORDER_H_S8
+
+#include "lpgemm_types.h"
+
+void reorderb_nr64_s8s8s32o32
+     (
+       lpgemm_obj_t*  b,         // Source B matrix.
+       lpgemm_obj_t*  b_reorder, // Receives packed B and strides; tagged REORDERED.
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx      // Provides the packB routine (packb_fun_ptr).
+     );
+// Packs A block by block (MC x KC) into a_reorder and tags it REORDERED.
+void reordera_mr6_s8s8s32o32
+     (
+       lpgemm_obj_t*  a,         // Source A matrix.
+       lpgemm_obj_t*  a_reorder, // Receives packed A and strides.
+       rntm_t*        rntm,      // Unused by the definition.
+       lpgemm_cntx_t* lcntx      // Provides MC, KC and packa_fun_ptr.
+     );
+
+#endif //LPGEMM_REORDER_H_S8
+
diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c
new file mode 100644
index 0000000000..98b8081b51
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_s8s8s32.c
@@ -0,0 +1,447 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "lpgemm_5loop_interface_apis.h"
+#include "lpgemm_packa_s8.h"
+#include "lpgemm_packb_s8.h"
+#include "lpgemm_kernels.h"
+#include "lpgemm_utils_s8.h"
+#include "lpgemm_thrinfo_utils.h"
+#include "lpgemm_config.h"
+
+// Signature of the kernel called through lcntx->kern_fun_ptr in this file.
+typedef void (*lpgemm_rowvar_s32_s8)
+     (
+       const dim_t,         // mc0 at the call site.
+       const dim_t,         // nr0
+       const dim_t,         // kc0
+       const int8_t*,       // a_use
+       const dim_t,         // rs_a_use
+       const dim_t,         // cs_a_use
+       const dim_t,         // a_block_stride
+       const int8_t*,       // b_use + ( jr * kc0_updated )
+       const dim_t,         // rs_b_use
+       const dim_t,         // cs_b_use
+       int32_t*,            // c_use_ic + jr
+       const dim_t,         // rs_c_use
+       const dim_t,         // Column stride of C; the call site passes 1.
+       const int32_t,       // alpha
+       const int32_t,       // beta0 (beta on the first KC block, 1 afterwards)
+       lpgemm_post_op*,     // post_op_list
+       lpgemm_post_op_attr  // post_ops_attr, passed by value.
+     );
+
+// 5-loop s8s8s32o32 GEMM driver. B must arrive as PACK or REORDERED.
+LPGEMM_5LOOP(int8_t,int8_t,int32_t,s8s8s32o32)
+{
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t MC = lcntx->blksz.MC;
+	dim_t NR = lcntx->blksz.NR;
+	dim_t MR = lcntx->blksz.MR;
+
+	if ( mtag_b == UNPACKED )
+	{
+		//Error: can only work with packed B now.
+		return;
+	}
+
+	// Strides are updated based on matrix packing/reordering.
+	const int8_t* a_use = NULL;
+	dim_t rs_a_use = rs_a;
+	dim_t cs_a_use = cs_a;
+	dim_t a_block_stride = 0;
+
+	const int8_t* b_use = NULL;
+	dim_t rs_b_use = rs_b;
+	dim_t cs_b_use = cs_b;
+
+	int32_t* c_use_jc = NULL;
+	int32_t* c_use_ic = NULL;
+	dim_t rs_c_use = rs_c;
+	dim_t rs_c_downscale = rs_c;
+
+	// Pack buffer for A.
+	int8_t* pack_a_buffer_s8s8s32o32;
+	mem_t mem_a = BLIS_MEM_INITIALIZER;
+	siz_t mem_a_size_req = 0;
+
+	// Pack buffer for B.
+	int8_t* pack_b_buffer_s8s8s32o32;
+	mem_t mem_b = BLIS_MEM_INITIALIZER;
+	siz_t mem_b_size_req = 0;
+	dim_t packb_min_NR = get_packb_s8s8s32o32_min_NR();
+
+	// Temporary buffer for C accumulation when downscaling is required.
+	int32_t* temp_scal_c_buffer_s8s8s32o32;
+	mem_t mem_scale_c = BLIS_MEM_INITIALIZER;
+	siz_t mem_scale_c_size_req = 0;
+
+	// kc needs to be a multiple of 4 so that it can be used with vpdpbusd
+	// instruction. Padding is added in cases this condition is not
+	// satisfied, and therefore the k offset used for packed/reordered
+	// buffer needs to be updated.
+	dim_t k_updated = make_multiple_of_n( k, 4 );
+	dim_t n_updated = make_multiple_of_n( n, 16 );
+
+	// To decide whether to apply post ops or not.
+	bool is_last_k = FALSE;
+
+	// To decide whether to use original s8 C or temp buffer for beta scale.
+	bool is_first_k = FALSE;
+
+	lpgemm_post_op_attr post_ops_attr;
+	if ( c_downscale == TRUE )
+	{
+		post_ops_attr.buf_downscale = c;
+	}
+	else
+	{
+		post_ops_attr.buf_downscale = NULL;
+	}
+
+	// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
+	thrinfo_t thread_jc;
+	thrinfo_t thread_ic;
+
+	lpgemm_gen_thrinfo( thread, &thread_jc, &thread_ic );
+
+	// Compute the JC, IC loop thread range for the current thread.
+	dim_t jc_start, jc_end;
+	bli_thread_range_sub( &thread_jc, n, NR, FALSE, &jc_start, &jc_end );
+
+	dim_t ic_start, ic_end;
+	bli_thread_range_sub( &thread_ic, m, MR, FALSE, &ic_start, &ic_end );
+
+	for ( dim_t jc = jc_start; jc < jc_end; jc += NC )
+	{
+		dim_t nc0 = bli_min( ( jc_end - jc ), NC );
+
+		dim_t jc_cur_loop = jc;
+		dim_t jc_cur_loop_rem = 0;
+		dim_t n_sub_updated = 0;
+
+		if ( mtag_b == REORDERED )
+		{
+			get_B_panel_reordered_start_offset_width
+			(
+			  jc, n, NC, packb_min_NR,
+			  &jc_cur_loop, &jc_cur_loop_rem,
+			  &nc0, &n_sub_updated
+			);
+		}
+
+		if ( c_downscale == FALSE )
+		{
+			c_use_jc = c + jc;
+		}
+		// Temp accumulation buffer for C allocation.
+		else if ( c_downscale == TRUE )
+		{
+			// Buffer memory is only required if output needs to be
+			// persisted across iterations of the pc/KC loop.
+			// It was observed that the locks used while checking out
+			// a buffer from memory pool had an impact on performance
+			// and is better to not checkout if k <= KC.
+			if ( k > KC )
+			{
+				mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start );
+
+				lpgemm_alloc_mem_panel
+				(
+				  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
+				  &mem_scale_c, rntm
+				);
+
+				temp_scal_c_buffer_s8s8s32o32 = bli_mem_buffer( &mem_scale_c );
+
+				c_use_jc = ( int32_t* )temp_scal_c_buffer_s8s8s32o32;
+			}
+			// NOTE(review): c_use_jc is not assigned above when k <= KC; confirm it is never dereferenced then.
+			// The temp c buffer stride is modified as opposed to original C matrix.
+			rs_c_use = nc0;
+		}
+
+		int32_t* pack_b_column_sum = NULL;
+
+		for ( dim_t pc = 0; pc < k; pc += KC )
+		{
+			int32_t beta0 = ( pc == 0 ) ? beta : 1;
+			dim_t kc0 = bli_min( ( k - pc ), KC );
+
+			// kc0 needs to be a multiple of 4 so that it can be
+			// used with vpdpbusd instruction. Padding is added in
+			// cases this condition is not satisfied, and therefore
+			// the kc0 offsets used for packed/reordered buffers
+			// needs to be updated.
+			dim_t kc0_updated = make_multiple_of_n( kc0, 4 );
+
+			// No parallelization in k dim, k always starts at 0.
+			is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_first_k = is_first_k;
+
+			is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_last_k = is_last_k;
+
+			if ( mtag_b == PACK )
+			{
+				// Pack B chunks are based on jc work id.
+				dim_t jc_work_id = bli_thread_work_id( &thread_jc );
+
+				// Using child thrinfo (thread_ic) tid to decide chief thread
+				// per B matrix chunk (jc work id group)
+				dim_t nc0_updated = make_multiple_of_n( nc0, packb_min_NR );
+
+				if ( bli_thread_am_ochief( &thread_ic ) )
+				{
+					// nc0 needs to be a multiple of 16 since this gives maximum
+					// vectorization. Packing B always results in buffers with width
+					// which is a multiple of 16. Subsequently the nc0 offsets used
+					// for packed/reordered buffers need to be updated.
+
+					mem_b_size_req = sizeof( int8_t ) * nc0_updated * kc0_updated + ( nc0_updated * sizeof( int32_t ) );
+
+					lpgemm_alloc_mem_panel
+					(
+					  mem_b_size_req, BLIS_BUFFER_FOR_B_PANEL,
+					  &mem_b, rntm
+					);
+
+					thread->comm[jc_work_id].sent_object = bli_mem_buffer( &mem_b );
+				}
+
+				// All threads in work group should wait till chief thread has
+				// finished allocating the packing buffers.
+				bli_thrcomm_barrier
+				(
+				  bli_thread_ocomm_id( &thread_ic ),
+				  &thread->comm[jc_work_id]
+				);
+
+				pack_b_buffer_s8s8s32o32 =
+						( int8_t* ) thread->comm[jc_work_id].sent_object;
+
+				// Compute the B panel per thread loop range for parallel
+				// packing using ic_ways number of threads. Since at most only
+				// ic_ways threads can be used, the thread_ic attributes are
+				// used to split the loop range.
+				dim_t jc_packb_start, jc_packb_end;
+				bli_thread_range_sub
+				(
+				  &thread_ic, nc0, NR, FALSE,
+				  &jc_packb_start, &jc_packb_end
+				);
+
+				if ( pc == 0) // Column sums are stored right after the packed int8 data.
+				{
+					pack_b_column_sum = ( int32_t* )( pack_b_buffer_s8s8s32o32 + ( sizeof( int8_t ) * nc0_updated * kc0_updated ) );
+				}
+
+				// Ensure thread ranges are valid, especially in cases where
+				// the number of threads available for parallelization is
+				// greater than the number of B panel NR chunks.
+				if ( ( jc_packb_end > jc_packb_start ) &&
+					 ( jc_packb_start < ( jc + nc0 ) ) )
+				{
+					if ( pc == 0 )
+					{
+						for (dim_t idx = jc_packb_start; idx < jc_packb_end; idx++ )
+						{
+							*( pack_b_column_sum + idx ) =  0;
+						}
+					}
+
+					( ( packb_s32_s8 )lcntx->packb_fun_ptr )
+					(
+					  pack_b_buffer_s8s8s32o32 + ( jc_packb_start * kc0_updated ),
+					  pack_b_column_sum + ( cs_b * jc_packb_start ),
+					  ( b + ( rs_b * pc ) + ( cs_b * jc ) +
+					    ( cs_b * jc_packb_start ) ), rs_b,
+					  ( jc_packb_end - jc_packb_start ), kc0,
+					  &rs_b_use, &cs_b_use
+					);
+				}
+				else
+				{
+					lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
+				}
+
+				// All threads in work group should wait till B matrix packing
+				// is completed by the participating threads.
+				bli_thrcomm_barrier
+				(
+				  bli_thread_ocomm_id( &thread_ic ),
+				  &thread->comm[jc_work_id]
+				);
+				b_use = pack_b_buffer_s8s8s32o32;
+
+				post_ops_attr.b_col_sum_vec = pack_b_column_sum;
+			}
+			else if ( mtag_b == REORDERED )
+			{
+				// In multi-threaded scenarios, an extra offset into a given
+				// packed B panel is required, since the jc loop split can
+				// result in per thread start offset inside the panel, instead
+				// of panel boundaries.
+				b_use = b + ( jc_cur_loop * k_updated ) +
+						( n_sub_updated * pc ) +
+						( jc_cur_loop_rem * kc0_updated );
+
+				lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
+
+				post_ops_attr.b_col_sum_vec = ( ( int32_t* )( b + ( k_updated * n_updated ) ) ) + jc;
+			}
+			else
+			{
+				//Unpacked B not supported (UNPACKED itself is rejected on entry).
+				return;
+			}
+
+			for ( dim_t ic = ic_start; ic < ic_end; ic += MC )
+			{
+				dim_t mc0 = bli_min( ( ic_end - ic ), MC );
+
+				// Only per thread C matrix is stored in temp buffer, so both
+				// per thread jc and ic start should be normalized to zero.
+				if ( c_downscale == TRUE )
+				{
+					c_use_ic = c_use_jc + ( rs_c_use * ( ic - ic_start ) );
+				}
+				else
+				{
+					c_use_ic = c_use_jc + ( rs_c_use * ic );
+				}
+
+				// Matrix A packed and reordered code path is not triggered
+				// currently since we do not support it yet.
+				if ( mtag_a == PACK )
+				{
+					mem_a_size_req = sizeof( int8_t ) * mc0 * kc0_updated;
+
+					lpgemm_alloc_mem_panel
+					(
+					  mem_a_size_req, BLIS_BUFFER_FOR_A_BLOCK,
+					  &mem_a, rntm
+					);
+					pack_a_buffer_s8s8s32o32 = ( int8_t* )bli_mem_buffer( &mem_a );
+
+					( ( packa_s32_s8 )lcntx->packa_fun_ptr )
+					(
+					  pack_a_buffer_s8s8s32o32,
+					  ( a + ( rs_a * ic ) + pc ), rs_a,
+					  mc0, kc0,
+					  &rs_a_use, &cs_a_use
+					);
+					a_use = pack_a_buffer_s8s8s32o32;
+					a_block_stride = kc0_updated;
+				}
+
+				else
+				{
+					a_use = a + ( rs_a * ic ) + ( cs_a * pc );
+
+					// Int8 kernel reads 4 elements, totalling 4 bytes in a
+					// single broadcast for use in vnni instruction.
+					// Non vnni based kernel requires update to this code.
+					cs_a_use = 4;
+					a_block_stride = rs_a;
+				}
+
+				post_ops_attr.b_sum_offset = 0;
+
+				for ( dim_t jr = 0; jr < nc0; jr += NR )
+				{
+					dim_t nr0 = bli_min( ( nc0 - jr ), NR );
+
+					// Post ops meta attributes.
+					post_ops_attr.post_op_c_i = ic;
+					post_ops_attr.post_op_c_j = ( jc + jr );
+					post_ops_attr.rs_c_downscale = rs_c_downscale;
+					//post_ops_attr.b_col_sum_vec = ( int32_t* )( b_use + ( rs_b * kc0_updated ) );
+
+					// Reorder/Packed B, Reorder/Packed/Unpacked A call.
+					( ( lpgemm_rowvar_s32_s8 )lcntx->kern_fun_ptr )
+					(
+					  mc0, nr0, kc0,
+					  a_use, rs_a_use, cs_a_use, a_block_stride,
+					  ( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use,
+					  ( c_use_ic + jr ), rs_c_use, 1,
+					  alpha, beta0,
+					  post_op_list, post_ops_attr
+					);
+					post_ops_attr.b_sum_offset += NR;
+				}
+			}
+		}
+		if ( mtag_b == REORDERED )
+		{
+			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
+		}
+	}
+
+	// Release pack buffers.
+	if ( mtag_b == PACK )
+	{
+		// All threads in work group should wait till B matrix usage is
+		// completed by the participating threads.
+		bli_thrcomm_barrier
+		(
+		  bli_thread_ocomm_id( &thread_jc ),
+		  &thread->comm[bli_thread_work_id( &thread_jc)]
+		);
+
+		if ( bli_thread_am_ochief( &thread_ic ) )
+		{
+			if ( bli_mem_is_alloc( &mem_b ) )
+			{
+				bli_membrk_release( rntm, &mem_b );
+			}
+		}
+	}
+	if ( mtag_a == PACK )
+	{
+		if ( bli_mem_is_alloc( &mem_a ) )
+		{
+			bli_membrk_release( rntm, &mem_a );
+		}
+	}
+	if ( c_downscale == TRUE )
+	{
+		if ( bli_mem_is_alloc( &mem_scale_c ) )
+		{
+			bli_membrk_release( rntm, &mem_scale_c );
+		}
+	}
+}
diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_utils_s8.c b/addon/aocl_gemm/frame/s8s8s32/lpgemm_utils_s8.c
new file mode 100644
index 0000000000..dc3413d89d
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_utils_s8.c
@@ -0,0 +1,156 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "lpgemm_utils_s8.h"
+
+dim_t get_64byte_aligned_memory_s8
+     (
+       void**  original_memory, // Out: block to hand to free(); NULL on failure.
+       void**  aligned_memory,  // Out: 64-byte aligned address in that block.
+       int64_t allocate_size    // Bytes that must be usable from aligned_memory.
+     )
+{
+	// Returns 0 on success, -1 on failure. A negative size is a failure too.
+	int8_t* t1_original = ( allocate_size < 0 ) ? NULL :
+			( int8_t* ) malloc( ( size_t )allocate_size + 64 );
+	if ( t1_original == NULL )
+	{
+		*original_memory = NULL;
+		*aligned_memory = NULL;
+		return -1;
+	}
+	// Round ( t1 + 64 ) down to a 64-byte boundary. The address is reduced as
+	// an unsigned value so the remainder can never be negative.
+	int8_t* ta_original = t1_original + 64;
+	ta_original -= ( ( uintptr_t )ta_original ) % 64;
+	*original_memory = t1_original;
+	*aligned_memory = ta_original;
+	return 0;
+}
+
+static lpgemm_obj_t* alloc_lpgemm_obj_t_s8s8s32
+     (
+       dim_t           length,      // Stored in obj->length.
+       dim_t           width,       // Stored in obj->width.
+       dim_t           stride,      // Becomes rs (ROW_MAJOR) or cs (COLUMN_MAJOR).
+       dim_t           elem_size,   // Buffer holds elem_size * length * width bytes.
+       AOCL_STOR_TAG   stor_scheme, // Selects which stride receives `stride`.
+       AOCL_MEMORY_TAG mtag         // Stored in obj->mtag.
+     )
+{
+	lpgemm_obj_t* obj = ( lpgemm_obj_t* ) calloc( 1, sizeof( lpgemm_obj_t ) );
+
+	if ( obj == NULL )
+	{
+		return NULL; //failure
+	}
+
+	// Allocate aligned buffers; on failure both storage pointers are NULL.
+	get_64byte_aligned_memory_s8( &obj->storage.origin_buffer,
+			&obj->storage.aligned_buffer,
+			( elem_size * length * width ) );
+
+	if ( obj->storage.origin_buffer == NULL )
+	{
+		// Buffer allocation failed.
+		free( obj );
+		return NULL;
+	}
+
+	obj->length = length;
+	obj->width = width;
+	obj->elem_size = elem_size;
+	// Unrecognized schemes keep the zero strides left by calloc above.
+	if ( stor_scheme == ROW_MAJOR )
+	{
+		obj->rs = stride;
+		obj->cs = 4; // 4 elements read at a time.
+	}
+	else if ( stor_scheme == COLUMN_MAJOR )
+	{
+		obj->cs = stride;
+		obj->rs = 1;
+	}
+	obj->mtag = mtag;
+
+	return obj;
+}
+
+// Allocates an lpgemm object and its buffer, tagged UNPACKED; NULL on failure.
+lpgemm_obj_t* alloc_unpack_tag_lpgemm_obj_t_s8s8s32
+     (
+       dim_t length, dim_t width, dim_t stride,
+       dim_t elem_size, AOCL_STOR_TAG stor_scheme
+     )
+{
+	const AOCL_MEMORY_TAG tag = UNPACKED;
+	return alloc_lpgemm_obj_t_s8s8s32
+	       ( length, width, stride, elem_size, stor_scheme, tag );
+}
+
+// Allocates an lpgemm object and its buffer, tagged PACK; NULL on failure.
+lpgemm_obj_t* alloc_pack_tag_lpgemm_obj_t_s8s8s32
+     (
+       dim_t length, dim_t width, dim_t stride,
+       dim_t elem_size, AOCL_STOR_TAG stor_scheme
+     )
+{
+	const AOCL_MEMORY_TAG tag = PACK;
+	return alloc_lpgemm_obj_t_s8s8s32
+	       ( length, width, stride, elem_size, stor_scheme, tag );
+}
+
+// Allocates a REORDERED lpgemm object with both dimensions padded for packing.
+lpgemm_obj_t* alloc_reorder_tag_lpgemm_obj_t_s8s8s32
+     (
+       dim_t length, dim_t width, dim_t stride,
+       dim_t elem_size, AOCL_STOR_TAG stor_scheme
+     )
+{
+	// Packing handles length in multiples of 4 and width in multiples of
+	// 16, so each dimension is rounded up before the buffer is sized.
+	// The stride is forwarded unchanged.
+	const dim_t padded_length = make_multiple_of_n( length, 4 );
+	const dim_t padded_width = make_multiple_of_n( width, 16 );
+
+	return alloc_lpgemm_obj_t_s8s8s32( padded_length, padded_width, stride,
+			elem_size, stor_scheme, REORDERED );
+}
+
+void dealloc_lpgemm_obj_t_s8s8s32( lpgemm_obj_t* obj )
+{
+	if ( obj != NULL ) { free( obj->storage.origin_buffer ); } // NULL obj is a no-op, like free().
+	free( obj ); // free( NULL ) does nothing, so no second check is needed.
+}
diff --git a/addon/aocl_gemm/frame/s8s8s32/lpgemm_utils_s8.h b/addon/aocl_gemm/frame/s8s8s32/lpgemm_utils_s8.h
new file mode 100644
index 0000000000..e91d0f8816
--- /dev/null
+++ b/addon/aocl_gemm/frame/s8s8s32/lpgemm_utils_s8.h
@@ -0,0 +1,226 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_UTILS_H_S8
+#define LPGEMM_UTILS_H_S8
+
+#include "lpgemm_types.h"
+
+// Callers own the result and must release it with free( *original_memory ).
+dim_t get_64byte_aligned_memory_s8
+     (
+       void**  original_memory,
+       void**  aligned_memory,
+       int64_t allocate_size
+     );
+// Allocates an lpgemm object tagged UNPACKED; returns NULL on failure.
+lpgemm_obj_t* alloc_unpack_tag_lpgemm_obj_t_s8s8s32
+     (
+       dim_t         length,
+       dim_t         width,
+       dim_t         stride,
+       dim_t         elem_size,
+       AOCL_STOR_TAG stor_scheme
+     );
+// Allocates an lpgemm object tagged PACK; returns NULL on failure.
+lpgemm_obj_t* alloc_pack_tag_lpgemm_obj_t_s8s8s32
+     (
+       dim_t         length,
+       dim_t         width,
+       dim_t         stride,
+       dim_t         elem_size,
+       AOCL_STOR_TAG stor_scheme
+     );
+// Allocates an lpgemm object tagged REORDERED with length/width rounded up.
+lpgemm_obj_t* alloc_reorder_tag_lpgemm_obj_t_s8s8s32
+     (
+       dim_t         length,
+       dim_t         width,
+       dim_t         stride,
+       dim_t         elem_size,
+       AOCL_STOR_TAG stor_scheme
+     );
+// Frees the object's origin buffer and then the object itself.
+void dealloc_lpgemm_obj_t_s8s8s32( lpgemm_obj_t* obj );
+
+BLIS_INLINE void bli_param_map_char_to_lpmtag
+     (
+       char mtag,
+       AOCL_MEMORY_TAG* lp_mtag
+     )
+{
+        // Case-insensitive: p -> PACK, r -> REORDERED, anything else -> UNPACKED.
+        switch ( mtag )
+        {
+                case 'p': case 'P': *lp_mtag = PACK;      break;
+                case 'r': case 'R': *lp_mtag = REORDERED; break;
+                default:            *lp_mtag = UNPACKED;  break;
+        }
+}
+
+BLIS_INLINE void bli_param_map_char_to_lpmat_type
+     (
+       const char mtag,
+       AOCL_MATRIX_TYPE* lp_mat_type
+     )
+{
+        // Maps a matrix selector character to its enum value. Only 'a'/'A'
+        // selects A_MATRIX; every other character, including 'b'/'B',
+        // selects B_MATRIX.
+        const int selects_a = ( mtag == 'a' ) || ( mtag == 'A' );
+
+        *lp_mat_type = selects_a ? A_MATRIX : B_MATRIX;
+}
+
+BLIS_INLINE dim_t make_multiple_of_n( dim_t k, dim_t n )
+{
+	// For k >= 0: k rounded up to a multiple of n. Non-positive n gives 0.
+	if ( n > 0 )
+	{
+		return ( ( k + n - 1 ) / n ) * n;
+	}
+	return 0;
+}
+
+BLIS_INLINE void lpgemm_alloc_mem_panel
+     (
+       dim_t     size_req,
+       packbuf_t buf_type,
+       mem_t*    mem,
+       rntm_t*   rntm_l
+     )
+{
+	// Makes sure *mem holds a buffer whose reported size is not smaller
+	// than size_req. A buffer that is already large enough is kept as
+	// is; an undersized one is released first, and a buffer of size_req
+	// is then acquired for buf_type.
+	if ( !bli_mem_is_unalloc( mem ) )
+	{
+		siz_t cur_size = bli_mem_size( mem );
+
+		if ( !( cur_size < size_req ) )
+		{
+			return;
+		}
+
+		// Too small: give the old buffer back before re-acquiring.
+		bli_membrk_release( rntm_l, mem );
+	}
+
+	// Reached when no buffer was held, or the held one was just released.
+	bli_membrk_acquire_m
+	(
+	  rntm_l,
+	  size_req,
+	  buf_type,
+	  mem
+	);
+}
+
+BLIS_INLINE dim_t get_Bpanel_width_for_kdim_traversal
+     (
+       dim_t jc,
+       dim_t n,
+       dim_t NC,
+       dim_t NR
+     )
+{
+	const dim_t n_mod_NR = n % NR;
+	const dim_t n_mod_NC = n % NC;
+
+	// Full panels are NC wide. That covers every panel when NC divides n,
+	// and every panel but the last one otherwise.
+	const dim_t last_panel_start = ( n / NC ) * NC;
+	if ( ( n_mod_NC == 0 ) || ( jc < last_panel_start ) )
+	{
+		return NC;
+	}
+
+	// jc lies in the trailing partial panel (this includes n < NC): its
+	// width is the leftover column count, extended by ( NR - n % NR )
+	// when n is not a multiple of NR.
+	dim_t tail_width = n - last_panel_start;
+	if ( n_mod_NR != 0 )
+	{
+		tail_width += ( NR - n_mod_NR );
+	}
+	return tail_width;
+}
+
+BLIS_INLINE void get_B_panel_reordered_start_offset_width
+     (
+       dim_t  jc,
+       dim_t  n,
+       dim_t  NC,
+       dim_t  NR,
+       dim_t* panel_start,
+       dim_t* panel_offset,
+       dim_t* panel_width,
+       dim_t* panel_width_kdim_trav
+     )
+{
+	// The n dimension is divided among threads in NR-sized units, so the
+	// slice of B owned by one thread can straddle two NCxKC panels. The
+	// width is clipped here so that the jr loop stays inside the panel
+	// containing jc; the following jc iteration handles the next panel.
+	const dim_t start = ( jc / NC ) * NC;
+	const dim_t offset = jc - start;
+	const dim_t panel_end = start + NC;
+
+	( *panel_start ) = start;
+	( *panel_offset ) = offset;
+
+	// Clip the requested width (nc0) at the end of the current panel.
+	if ( ( jc + ( *panel_width ) ) > panel_end )
+	{
+		( *panel_width ) = NC - offset;
+	}
+
+	( *panel_width_kdim_trav ) =
+		get_Bpanel_width_for_kdim_traversal( jc, n, NC, NR );
+}
+
+BLIS_INLINE void adjust_B_panel_reordered_jc( dim_t* jc, dim_t panel_start )
+{
+	// Since the n dimension is split across threads in units of NR blocks,
+	// the B matrix chunk for a thread may be part of two separate NCxKC
+	// panels. In this case jc is reset to the start of the panel it
+	// currently lies in (panel_start), so that the next iteration of the
+	// caller's jc loop accesses the following panel belonging to the B
+	// chunk. This ensures that the jr loop does not cross panel boundaries.
+	( *jc ) = panel_start;
+}
+
+#endif //LPGEMM_UTILS_H_S8
+
diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c
index 0c1df5e7c3..32615afc9e 100644
--- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c
+++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -168,11 +168,18 @@ BLIS_INLINE void lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
 
 BLIS_INLINE void lpgemm_adjust_ic_jc_ways
      (
-       dim_t   m,
-       dim_t   n,
+       const dim_t  m,
+       const dim_t  n,
+       const dim_t  k,
+       const dim_t  MC,
+       const dim_t  NC,
+       const dim_t  KC,
+       const dim_t  MR,
+       const dim_t  NR,
        dim_t* n_threads,
        dim_t* ic_ways,
-       dim_t* jc_ways
+       dim_t* jc_ways,
+       dim_t  m_boost
      )
 {
 	const dim_t m_ic = m / ( *ic_ways );
@@ -192,16 +199,56 @@ BLIS_INLINE void lpgemm_adjust_ic_jc_ways
 	const int64_t next_jc_work_per_thread = n_next_jc + m_prev_ic;
 	const int64_t next_ic_work_per_thread = m_next_ic + n_prev_jc;
 
+	const dim_t MCx2 = MC * 2;
+	const dim_t k_factor = k / KC;
+	const dim_t n_jc_modulo_NR = n_jc % NR;
+	const dim_t n_prev_jc_modulo_NR = n_prev_jc % NR;
+
 	bool can_increase_ic = FALSE;
 	bool can_increase_jc = FALSE;
 
-	if ( next_ic_work_per_thread <= cur_work_per_thread )
+	if ( ( ( *ic_ways ) > 1 ) && ( ( *jc_ways ) < ( *n_threads ) ) )
 	{
-		can_increase_ic = TRUE;
+		if ( next_jc_work_per_thread < cur_work_per_thread )
+		{
+			can_increase_jc = TRUE;
+		}
+		// Check whether m_prev_ic remains in good l2 load zone.
+		else if ( ( ( ( m_ic <= MC ) && ( m_prev_ic <= MC ) ) ||
+					( m_ic > MC ) ) &&
+				  ( ( n_jc > NR ) && ( n_next_jc == NR ) ) )
+		{
+			can_increase_jc = TRUE;
+		}
 	}
-	else if ( next_jc_work_per_thread < cur_work_per_thread )
+	if ( ( ( *ic_ways ) < ( *n_threads ) ) && ( ( *jc_ways ) > 1) )
 	{
-		can_increase_jc = TRUE;
+		if ( next_ic_work_per_thread <= cur_work_per_thread )
+		{
+			can_increase_ic = TRUE;
+		}
+		// ic adjustment towards next highest factor if it results in
+		// m_next_ic <= MC. This helps in reducing number of A matrix
+		// loads per thread to l2 from main memory.
+		else if ( ( m_ic > MC ) && ( m_next_ic <= MC ) &&
+				  ( m_next_ic >= MR ) && ( k_factor > 4 ) )
+		{
+			can_increase_ic = TRUE;
+		}
+		// ic adjustment towards next highest factor resulted in better
+		// performance when m is sufficiently larger than n.
+		else if ( ( m > ( m_boost * n ) ) && ( m_ic >= MCx2 ) &&
+				  ( k_factor > 4 ) )
+		{
+			can_increase_ic = TRUE;
+		}
+		// Performance improvement also observed when n_jc is a multiple
+		// of NR.
+		else if ( ( n_jc_modulo_NR != 0 ) && ( n_prev_jc_modulo_NR == 0 ) &&
+				  ( k_factor > 4 ) )
+		{
+			can_increase_ic = TRUE;
+		}
 	}
 
 	if ( can_increase_ic )
@@ -315,8 +362,6 @@ BLIS_INLINE void lpgemm_u8s8s32o32_get_threading
 			// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
 			bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
 
-			lpgemm_adjust_ic_jc_ways( m, n, n_threads, ic_ways, jc_ways );
-
 			lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
 			(
 			  MR, NR, m, n,
@@ -375,7 +420,7 @@ BLIS_INLINE void lpgemm_bf16bf16f32of32_get_threading
 		{
 			// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
 			bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
-			lpgemm_adjust_ic_jc_ways( m, n, n_threads, ic_ways, jc_ways );
+
 			lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
 			(
 			  MR, NR, m, n,
@@ -416,6 +461,13 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
 	const dim_t NT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx );
 	const dim_t KT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );
 
+	// Query the context for various blocksizes.
+	const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
+	const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
+	const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
+	const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
+
 	const dim_t MT_2 = MT / 2;
 
 	*n_threads = bli_rntm_num_threads( rntm_g );
@@ -436,7 +488,12 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
 		// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
 		bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
 
-		lpgemm_adjust_ic_jc_ways( m, n, n_threads, ic_ways, jc_ways );
+		lpgemm_adjust_ic_jc_ways
+		(
+		  m, n, k,
+		  MC, NC, KC, MR, NR,
+		  n_threads, ic_ways, jc_ways, 5
+		);
 	}
 	else
 	{
@@ -458,13 +515,126 @@ BLIS_INLINE void lpgemm_f32f32f32of32_get_threading
 	{
 		if ( ( k > page_size_b_floatx2 ) ||
 			 ( ( k <= page_size_b_floatx2 ) &&
-				  ( m_ic > MT_2 ) && ( n_jc >= NT ) ) )
+			   ( m_ic > MT_2 ) && ( n_jc >= NT ) ) )
 		{
+			bli_rntm_set_pack_b( 1, rntm_g );
 			bli_rntm_set_pack_a( 1, rntm_g );
 		}
 	}
 }
 
+BLIS_INLINE void lpgemm_s8s8s32o32_get_threading
+     (
+       dim_t*  n_threads,
+       dim_t*  ic_ways,
+       dim_t*  jc_ways,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       rntm_t* rntm_g
+     )
+{
+	*n_threads = bli_rntm_num_threads( rntm_g );
+	*jc_ways = bli_rntm_jc_ways( rntm_g );
+	*ic_ways = bli_rntm_ic_ways( rntm_g );
+
+	if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
+	{
+		// If BLIS_IC_NT or JC_NT are set.
+		// Default cases.
+ 		*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
+		*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
+
+		*n_threads = ( *jc_ways ) * ( *ic_ways );
+	}
+	else if ( ( *n_threads ) > 1 )
+	{
+
+		dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S32OS32 );
+		dim_t MR = lpgemm_get_block_size_MR_global_cntx( S8S8S32OS32 );
+
+		if ( n <= NR )
+		{
+			// If n is less than micro panel dimension, allocating all threads
+			// to ic resulted in gains.
+			( *ic_ways ) = ( *n_threads );
+			( *jc_ways ) = 1;
+		}
+		else
+		{
+			// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
+			bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
+
+			lpgemm_pnl_wrk_heur_adjust_ic_jc_ways
+			(
+			  MR, NR, m, n,
+			  n_threads, ic_ways, jc_ways
+			);
+		}
+	}
+	else
+	{
+		// Setting all the values to 1 in case n_threads <= 1. This ensures
+		// the threading parameters are valid.
+		*n_threads = 1;
+		*jc_ways = 1;
+		*ic_ways = 1;
+	}
+}
+
+BLIS_INLINE void lpgemm_s8s8s16o16_get_threading
+     (
+       dim_t*  n_threads,
+       dim_t*  ic_ways,
+       dim_t*  jc_ways,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       rntm_t* rntm_g
+     )
+{
+	*n_threads = bli_rntm_num_threads( rntm_g );
+	*jc_ways = bli_rntm_jc_ways( rntm_g );
+	*ic_ways = bli_rntm_ic_ways( rntm_g );
+
+	if ( ( ( *ic_ways ) > 0 ) || ( ( *jc_ways ) > 0 ) )
+	{
+		// If BLIS_IC_NT or JC_NT are set.
+		// Default cases.
+ 		*ic_ways = ( ( *ic_ways ) > 0 ) ? ( *ic_ways ) : 1;
+		*jc_ways = ( ( *jc_ways ) > 0 ) ? ( *jc_ways ) : 1;
+
+		*n_threads = ( *jc_ways ) * ( *ic_ways );
+	}
+	else if ( ( *n_threads ) > 1 )
+	{
+
+		dim_t NR = lpgemm_get_block_size_NR_global_cntx( S8S8S16OS16 );
+
+		if ( n <= NR )
+		{
+			// If n is less than micro panel dimension, allocating all threads
+			// to ic resulted in gains.
+			( *ic_ways ) = ( *n_threads );
+			( *jc_ways ) = 1;
+		}
+		else
+		{
+			// If BLIS_NUM_THREADS are set, generate jc,ic from the same.
+			bli_thread_partition_2x2( ( *n_threads ), m, n, ic_ways, jc_ways );
+		}
+	}
+	else
+	{
+		// Setting all the values to 1 in case n_threads <= 1. This ensures
+		// the threading parameters are valid.
+		*n_threads = 1;
+		*jc_ways = 1;
+		*ic_ways = 1;
+	}
+}
+
+
 #define GEN_LPGEMM_OPENMP_DECORATOR(A_type,B_type,C_type,LPGEMM_SFX) \
 void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
      ( \
@@ -482,9 +652,10 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
        C_type*               c, \
        const dim_t           rs_c, \
        const dim_t           cs_c, \
-       C_type                alpha, \
-       C_type                beta, \
+       const C_type          alpha, \
+       const C_type          beta, \
        rntm_t*               rntm_g, \
+       lpgemm_cntx_t*        lcntx, \
        lpgemm_post_op*       post_op_list, \
        bool                  c_downscale \
      ) \
@@ -546,6 +717,7 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
 		  beta, \
 		  &rntm_l, \
 		  &thread, \
+		  lcntx, \
 		  post_op_list, c_downscale \
 		); \
 	} \
@@ -559,6 +731,8 @@ GEN_LPGEMM_OPENMP_DECORATOR(uint8_t,int8_t,int16_t,u8s8s16o16)
 GEN_LPGEMM_OPENMP_DECORATOR(uint8_t,int8_t,int32_t,u8s8s32o32)
 GEN_LPGEMM_OPENMP_DECORATOR(bfloat16,bfloat16,float,bf16bf16f32of32)
 GEN_LPGEMM_OPENMP_DECORATOR(float,float,float,f32f32f32of32)
+GEN_LPGEMM_OPENMP_DECORATOR(int8_t,int8_t,int32_t,s8s8s32o32)
+GEN_LPGEMM_OPENMP_DECORATOR(int8_t,int8_t,int16_t,s8s8s16o16)
 
 #else
 
@@ -579,9 +753,10 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
        C_type*               c, \
        const dim_t           rs_c, \
        const dim_t           cs_c, \
-       C_type                alpha, \
-       C_type                beta, \
+       const C_type          alpha, \
+       const C_type          beta, \
        rntm_t*               rntm_g, \
+       lpgemm_cntx_t*        lcntx, \
        lpgemm_post_op*       post_op_list, \
        bool                  c_downscale \
      ) \
@@ -622,6 +797,7 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
 	  beta, \
 	  rntm_g, \
 	  &thread, \
+	  lcntx, \
 	  post_op_list, c_downscale \
 	); \
 } \
@@ -630,5 +806,7 @@ GEN_LPGEMM_DECORATOR(uint8_t,int8_t,int16_t,u8s8s16o16)
 GEN_LPGEMM_DECORATOR(uint8_t,int8_t,int32_t,u8s8s32o32)
 GEN_LPGEMM_DECORATOR(bfloat16,bfloat16,float,bf16bf16f32of32)
 GEN_LPGEMM_DECORATOR(float,float,float,f32f32f32of32)
+GEN_LPGEMM_DECORATOR(int8_t,int8_t,int32_t,s8s8s32o32)
+GEN_LPGEMM_DECORATOR(int8_t,int8_t,int16_t,s8s8s16o16)
 
 #endif
diff --git a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h
index 8055d623e6..80c657b230 100644
--- a/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h
+++ b/addon/aocl_gemm/frame/threading/lpgemm_thread_decor_openmp.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -58,9 +58,10 @@ void lpgemm_ ## LPGEMM_SFX ## _openmp_thread_decorator \
        C_type*               c, \
        const dim_t           rs_c, \
        const dim_t           cs_c, \
-       C_type                alpha, \
-       C_type                beta, \
+       const C_type          alpha, \
+       const C_type          beta, \
        rntm_t*               rntm_g, \
+       lpgemm_cntx_t*        lcntx, \
        lpgemm_post_op*       post_op_list, \
        bool                  c_downscale \
      ); \
@@ -69,6 +70,8 @@ GEN_LPGEMM_OPENMP_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16)
 GEN_LPGEMM_OPENMP_DECORATOR_FN(uint8_t,int8_t,int32_t,u8s8s32o32)
 GEN_LPGEMM_OPENMP_DECORATOR_FN(bfloat16,bfloat16,float,bf16bf16f32of32)
 GEN_LPGEMM_OPENMP_DECORATOR_FN(float,float,float,f32f32f32of32)
+GEN_LPGEMM_OPENMP_DECORATOR_FN(int8_t,int8_t,int32_t,s8s8s32o32)
+GEN_LPGEMM_OPENMP_DECORATOR_FN(int8_t,int8_t,int16_t,s8s8s16o16)
 
 #else
 
@@ -89,9 +92,10 @@ void lpgemm_ ## LPGEMM_SFX ## _thread_decorator \
        C_type*               c, \
        const dim_t           rs_c, \
        const dim_t           cs_c, \
-       C_type                alpha, \
-       C_type                beta, \
+       const C_type          alpha, \
+       const C_type          beta, \
        rntm_t*               rntm_g, \
+       lpgemm_cntx_t*        lcntx, \
        lpgemm_post_op*       post_op_list, \
        bool                  c_downscale \
      ); \
@@ -100,6 +104,8 @@ GEN_LPGEMM_DECORATOR_FN(uint8_t,int8_t,int16_t,u8s8s16o16)
 GEN_LPGEMM_DECORATOR_FN(uint8_t,int8_t,int32_t,u8s8s32o32)
 GEN_LPGEMM_DECORATOR_FN(bfloat16,bfloat16,float,bf16bf16f32of32)
 GEN_LPGEMM_DECORATOR_FN(float,float,float,f32f32f32of32)
+GEN_LPGEMM_DECORATOR_FN(int8_t,int8_t,int32_t,s8s8s32o32)
+GEN_LPGEMM_DECORATOR_FN(int8_t,int8_t,int16_t,s8s8s16o16)
 
 #endif
 
diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c
index 0b55f31215..2786117131 100644
--- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c
+++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,19 +39,23 @@
 
 void aocl_reorderb_nr32_u8s8s16o16
      (
-       lpgemm_obj_t *b,
-       lpgemm_obj_t *b_reorder
+       lpgemm_obj_t*  b,
+       lpgemm_obj_t*  b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
      )
 {
-	const dim_t NC = lpgemm_get_block_size_NC_global_cntx(U8S8S16OS16);
-	const dim_t KC = lpgemm_get_block_size_KC_global_cntx(U8S8S16OS16);
-	const dim_t NR = lpgemm_get_block_size_NR_global_cntx(U8S8S16OS16);
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
 
 	// Extracting the matrix properties from the lpgemm object
 	dim_t rs_b = b->rs;
 	dim_t n = b->width;
 	dim_t k = b->length;
 
+	lpgemm_mod_block_size_s16(0, n, k, NULL, &NC, &KC);
+
 	dim_t rs_b_reorder;
 	dim_t cs_b_reorder;
 
@@ -60,12 +64,7 @@ void aocl_reorderb_nr32_u8s8s16o16
 	// Making multiple of 2 to suit k in vpmaddubsw
 	k_updated += (k_updated & 0x1);
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_g;
-	bli_rntm_init_from_global( &rntm_g );
-
-	dim_t n_threads = bli_rntm_num_threads( &rntm_g );
+	dim_t n_threads = bli_rntm_num_threads( rntm );
 	n_threads = ( n_threads > 0 ) ? n_threads : 1;
 
 #ifdef BLIS_ENABLE_OPENMP
@@ -146,7 +145,7 @@ void aocl_reorderb_nr32_u8s8s16o16
 				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
 				//    + ( n_sub_updated * pc ) <traverse block 5>
 				//    + ( NC' * kc0_updated)   <traverse block 6>
-				packb_nr32_u8s8s16o16
+				( ( packb_s16 )lcntx->packb_fun_ptr )
 				(
 				  ( ( ( int8_t* )b_reorder->storage.aligned_buffer ) +
 					( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h
index 6018978bc7..65647d9903 100644
--- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h
+++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_reorder_s16.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -38,8 +38,10 @@
 
 void aocl_reorderb_nr32_u8s8s16o16
      (
-       lpgemm_obj_t *b,
-       lpgemm_obj_t *b_reorder
+       lpgemm_obj_t*  b,
+       lpgemm_obj_t*  b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
      );
 
 #endif // LPGEMM_REORDER_H
diff --git a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c
index b8f5115429..5a03493a44 100644
--- a/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c
+++ b/addon/aocl_gemm/frame/u8s8s16/lpgemm_u8s8s16.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,15 +40,39 @@
 #include "lpgemm_config.h"
 #include "lpgemm_thrinfo_utils.h"
 
+// Kernel function prototypes
+typedef void (*lpgemm_rowvar_s16)
+     (
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const uint8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       int16_t*,
+       const dim_t,
+       const dim_t,
+       const int16_t,
+       const int16_t,
+       lpgemm_post_op*,
+       lpgemm_post_op_attr
+     );
+
 // B should always be packed.
 LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 {
-	const dim_t NC = lpgemm_get_block_size_NC_global_cntx( U8S8S16OS16 );
-	const dim_t KC = lpgemm_get_block_size_KC_global_cntx( U8S8S16OS16 );
-	const dim_t MC = lpgemm_get_block_size_MC_global_cntx( U8S8S16OS16 );
-	const dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S16OS16 );
-	const dim_t MR = lpgemm_get_block_size_MR_global_cntx( U8S8S16OS16 );
-	
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t MC = lcntx->blksz.MC;
+	const dim_t NR = lcntx->blksz.NR;
+	const dim_t MR = lcntx->blksz.MR;
+
+	lpgemm_mod_block_size_s16(m, n, k, &MC, &NC, &KC);
+
 	if (mtag_b == UNPACKED)
 	{
 		// Error: can only work with packed B now.
@@ -82,9 +106,22 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 	// Making multiple of 2 to suit k in vpmaddubsw
 	dim_t k_updated = make_multiple_of_n( k, 2 );
 
-	// Is required to decide whether to apply post ops or not.
+	// To decide whether to apply post ops or not.
 	bool is_last_k = FALSE;
 
+	// To decide whether to use original s8 C or temp buffer for beta scale.
+	bool is_first_k = FALSE;
+
+	lpgemm_post_op_attr post_ops_attr;
+	if ( c_downscale == TRUE )
+	{
+		post_ops_attr.buf_downscale = c;
+	}
+	else
+	{
+		post_ops_attr.buf_downscale = NULL;
+	}
+
 	// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
 	thrinfo_t thread_jc;
 	thrinfo_t thread_ic;
@@ -123,37 +160,24 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 		// Temp accumulaton buffer for C allocation.
 		else if ( c_downscale == TRUE )
 		{
-			mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start );
-
-			lpgemm_alloc_mem_panel
-			(
-			  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
-			  &mem_scale_c, rntm
-			);
+			// Buffer memory is only required if output needs to be
+			// persisted across iterations of the pc/KC loop.
+			// It was observed that the locks used while checking out
+			// a buffer from memory pool had an impact on performance
+			// and is better to not checkout if k <= KC.
+			if ( k > KC )
+			{
+				mem_scale_c_size_req = sizeof( int16_t ) * nc0 * ( ic_end - ic_start );
 
-			temp_scal_c_buffer_u8s8s16o16 = bli_mem_buffer( &mem_scale_c );
+				lpgemm_alloc_mem_panel
+				(
+				  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
+				  &mem_scale_c, rntm
+				);
 
-			c_use_jc = ( int16_t* )temp_scal_c_buffer_u8s8s16o16;
+				temp_scal_c_buffer_u8s8s16o16 = bli_mem_buffer( &mem_scale_c );
 
-			if ( beta != 0 )
-			{
-				dim_t i_temp = 0;
-				dim_t j_temp = 0;
-				// Upscale out C to temporary C matrix.
-				for ( dim_t i_dscale = ic_start; i_dscale < ic_end; ++i_dscale )
-				{
-					j_temp = 0;
-					for ( dim_t j_dscale = jc; j_dscale < ( jc + nc0 ); ++j_dscale )
-					{
-						*( temp_scal_c_buffer_u8s8s16o16 +
-								( nc0 * i_temp ) + j_temp ) =
-								( int16_t )( *( ( ( int8_t* )c ) +
-								( rs_c * i_dscale ) + j_dscale ) );
-
-						j_temp++;
-					}
-					i_temp++;
-				}
+				c_use_jc = ( int16_t* )temp_scal_c_buffer_u8s8s16o16;
 			}
 
 			// The temp c buffer stride is modified as opposed to original C matrix.
@@ -165,7 +189,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 			int16_t beta0 = (pc == 0) ? beta : 1;
 			dim_t kc0 = bli_min((k - pc), KC);
 
+			// No parallelization in k dim, k always starts at 0.
+			is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_first_k = is_first_k;
+
 			is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_last_k = is_last_k;
 
 			// kc0 needs to be a multiple of 2 so that it can be
 			// used with vpmaddubsw instruction. Padding is added in
@@ -200,9 +229,11 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 
 				// All threads in work group should wait till chief thread has
 				// finished allocating the packing buffers.
-				bli_thrcomm_barrier(
-					bli_thread_ocomm_id(&thread_ic),
-					&thread->comm[jc_work_id]);
+				bli_thrcomm_barrier
+				(
+				  bli_thread_ocomm_id(&thread_ic),
+				  &thread->comm[jc_work_id]
+				);
 
 				pack_b_buffer_u8s8s16o16 =
 					(int8_t *)thread->comm[jc_work_id].sent_object;
@@ -224,9 +255,9 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 				if ((jc_packb_end > jc_packb_start) &&
 					(jc_packb_start < (jc + nc0)))
 				{
-					packb_nr32_u8s8s16o16
+					( ( packb_s16 )lcntx->packb_fun_ptr )
 					(
-						pack_b_buffer_u8s8s16o16 + 
+						pack_b_buffer_u8s8s16o16 +
 						 (jc_packb_start * kc0_updated),
 						(b + (rs_b * pc) + (cs_b * jc) +
 						 (cs_b * jc_packb_start)),
@@ -237,7 +268,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 				}
 				else
 				{
-					get_packb_nr32_u8s8s16o16_strides(&rs_b_use, &cs_b_use);
+					lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
 				}
 
 				// All threads in work group should wait till B matrix packing
@@ -260,7 +291,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 						(n_sub_updated * pc) +
 						(jc_cur_loop_rem * kc0_updated);
 
-				get_packb_nr32_u8s8s16o16_strides(&rs_b_use, &cs_b_use);
+				lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
 			}
 			else
 			{
@@ -292,15 +323,20 @@ LPGEMM_5LOOP(uint8_t,int8_t,int16_t,u8s8s16o16)
 				{
 					dim_t nr0 = bli_min((nc0 - jr), NR);
 
+					// Post ops meta attributes.
+					post_ops_attr.post_op_c_i = ic;
+					post_ops_attr.post_op_c_j = ( jc + jr );
+					post_ops_attr.rs_c_downscale = rs_c_downscale;
+
 					// Calls for reorder B
-					lpgemm_rowvar_u8s8s16o16_6x32
+					( ( lpgemm_rowvar_s16 )lcntx->kern_fun_ptr )
 					(
 						mc0, nr0, kc0,
 						a_use, rs_a_use, cs_a_use, a_block_stride,
 						(b_use + (jr * kc0_updated)), rs_b_use, cs_b_use,
 						(c_use_ic + jr), rs_c_use, 1,
 						alpha, beta0,
-					  	is_last_k, ic, ( jc + jr ), post_op_list, rs_c_downscale
+					  	post_op_list, post_ops_attr
 					);
 				}
 			}
diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c
index 746a134100..224e0791ff 100644
--- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c
+++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,13 +41,15 @@
 
 void reorderb_nr64_u8s8s32o32
      (
-       lpgemm_obj_t* b,
-       lpgemm_obj_t* b_reorder
+       lpgemm_obj_t*  b,
+       lpgemm_obj_t*  b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
      )
 {
-	dim_t NC = lpgemm_get_block_size_NC_global_cntx( U8S8S32OS32 );
-	dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S32OS32 );
-	dim_t KC = lpgemm_get_block_size_KC_global_cntx( U8S8S32OS32 );
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t NR = lcntx->blksz.NR;
 
 	dim_t rs_b = b->rs;
 	dim_t rs_b_reorder;
@@ -62,12 +64,7 @@ void reorderb_nr64_u8s8s32o32
 	// buffer needs to be updated.
 	dim_t k_updated = make_multiple_of_n( k, 4 );
 
-	// Initialize a local runtime with global settings if necessary. Note
-	// that in the case that a runtime is passed in, we make a local copy.
-	rntm_t rntm_g;
-	bli_rntm_init_from_global( &rntm_g );
-
-	dim_t n_threads = bli_rntm_num_threads( &rntm_g );
+	dim_t n_threads = bli_rntm_num_threads( rntm );
 	n_threads = ( n_threads > 0 ) ? n_threads : 1;
 
 #ifdef BLIS_ENABLE_OPENMP
@@ -148,8 +145,7 @@ void reorderb_nr64_u8s8s32o32
 				// st = ( jc_cur_loop * k )    <traverse blocks 1,2,3,4>
 				//    + ( n_sub_updated * pc ) <traverse block 5>
 				//    + ( NC' * kc0_updated)   <traverse block 6>
-#ifdef BLIS_KERNELS_ZEN4
-				packb_nr64_u8s8s32o32
+				( ( packb_s32 )lcntx->packb_fun_ptr )
 				(
 				  ( ( ( int8_t* )b_reorder->storage.aligned_buffer ) +
 					( jc_cur_loop * k_updated ) + ( n_sub_updated * pc ) +
@@ -158,14 +154,6 @@ void reorderb_nr64_u8s8s32o32
 					( rs_b * pc ) + jc ),
 				  rs_b, nc0, kc0, &rs_b_reorder, &cs_b_reorder
 				);
-#else
-				// Silence compiler warnings.
-				rs_b_reorder = 0;
-				cs_b_reorder = 0;
-				( void )kc0_updated;
-				( void )k_updated;
-				( void )rs_b;
-#endif
 			}
 
 			adjust_B_panel_reordered_jc( &jc, jc_cur_loop );
@@ -179,12 +167,14 @@ void reorderb_nr64_u8s8s32o32
 
 void reordera_mr6_u8s8s32o32
      (
-       lpgemm_obj_t* a,
-       lpgemm_obj_t* a_reorder
+       lpgemm_obj_t*  a,
+       lpgemm_obj_t*  a_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
      )
 {
-	dim_t MC = lpgemm_get_block_size_MC_global_cntx( U8S8S32OS32 );
-	dim_t KC = lpgemm_get_block_size_KC_global_cntx( U8S8S32OS32 );
+	dim_t MC = lcntx->blksz.MC;
+	dim_t KC = lcntx->blksz.KC;
 
 	dim_t rs_a = a->rs;
 	dim_t rs_a_reorder;
@@ -207,21 +197,13 @@ void reordera_mr6_u8s8s32o32
 		{
 			dim_t mc0 = bli_min( ( m - ic ), MC );
 
-#ifdef BLIS_KERNELS_ZEN4
-			packa_k64_u8s8s32o32
+			( ( packa_s32 )lcntx->packa_fun_ptr )
 			(
 			  ( ( ( uint8_t* )a_reorder->storage.aligned_buffer ) + ( pc * m ) +
 				( ic * kc0_updated ) ),
 			  ( ( ( uint8_t* )a->storage.aligned_buffer ) + ( rs_a * ic ) + pc ),
 			  rs_a, mc0, kc0, &rs_a_reorder, &cs_a_reorder
 			);
-#else
-			rs_a_reorder = 0;
-			cs_a_reorder = 0;
-			( void )kc0_updated;
-			( void )rs_a;
-			( void )mc0;
-#endif
 		}
 	}
 
diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h
index eb8dad9cfc..232b02238d 100644
--- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h
+++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_reorder.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,14 +39,18 @@
 
 void reorderb_nr64_u8s8s32o32
      (
-       lpgemm_obj_t* b,
-       lpgemm_obj_t* b_reorder
+       lpgemm_obj_t*  b,
+       lpgemm_obj_t*  b_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
      );
 
 void reordera_mr6_u8s8s32o32
      (
-       lpgemm_obj_t* a,
-       lpgemm_obj_t* a_reorder
+       lpgemm_obj_t*  a,
+       lpgemm_obj_t*  a_reorder,
+       rntm_t*        rntm,
+       lpgemm_cntx_t* lcntx
      );
 
 #endif //LPGEMM_REORDER_H
diff --git a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c
index 82a745fcf5..feedda0212 100644
--- a/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c
+++ b/addon/aocl_gemm/frame/u8s8s32/lpgemm_u8s8s32.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,14 +41,36 @@
 #include "lpgemm_thrinfo_utils.h"
 #include "lpgemm_config.h"
 
+// Kernel function prototypes
+typedef void (*lpgemm_rowvar_s32)
+     (
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const uint8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       int32_t*,
+       const dim_t,
+       const dim_t,
+       const int32_t,
+       const int32_t,
+       lpgemm_post_op*,
+       lpgemm_post_op_attr
+     );
+
 // B should always be packed.
 LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 {
-	dim_t NC = lpgemm_get_block_size_NC_global_cntx( U8S8S32OS32 );
-	dim_t KC = lpgemm_get_block_size_KC_global_cntx( U8S8S32OS32 );
-	dim_t MC = lpgemm_get_block_size_MC_global_cntx( U8S8S32OS32 );
-	dim_t NR = lpgemm_get_block_size_NR_global_cntx( U8S8S32OS32 );
-	dim_t MR = lpgemm_get_block_size_MR_global_cntx( U8S8S32OS32 );
+	dim_t NC = lcntx->blksz.NC;
+	dim_t KC = lcntx->blksz.KC;
+	dim_t MC = lcntx->blksz.MC;
+	dim_t NR = lcntx->blksz.NR;
+	dim_t MR = lcntx->blksz.MR;
 
 	if ( mtag_b == UNPACKED )
 	{
@@ -93,9 +115,22 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 	// buffer needs to be updated.
 	dim_t k_updated = make_multiple_of_n( k, 4 );
 
-	// Is required to decide whether to apply post ops or not.
+	// To decide whether to apply post ops or not.
 	bool is_last_k = FALSE;
 
+	// To decide whether to use original s8 C or temp buffer for beta scale.
+	bool is_first_k = FALSE;
+
+	lpgemm_post_op_attr post_ops_attr;
+	if ( c_downscale == TRUE )
+	{
+		post_ops_attr.buf_downscale = c;
+	}
+	else
+	{
+		post_ops_attr.buf_downscale = NULL;
+	}
+
 	// Generate thrinfo objects for jc and ic loops from lpgemm_thrinfo_t.
 	thrinfo_t thread_jc;
 	thrinfo_t thread_ic;
@@ -115,7 +150,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 
 		dim_t jc_cur_loop = jc;
 		dim_t jc_cur_loop_rem = 0;
-		dim_t n_sub_updated;
+		dim_t n_sub_updated = 0;
 
 		if ( mtag_b == REORDERED )
 		{
@@ -134,37 +169,24 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 		// Temp accumulaton buffer for C allocation.
 		else if ( c_downscale == TRUE )
 		{
-			mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start );
-
-			lpgemm_alloc_mem_panel
-			(
-			  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
-			  &mem_scale_c, rntm
-			);
+			// Buffer memory is only required if output needs to be
+			// persisted across iterations of the pc/KC loop.
+			// It was observed that the locks used while checking out
+			// a buffer from memory pool had an impact on performance
+			// and is better to not checkout if k <= KC.
+			if ( k > KC )
+			{
+				mem_scale_c_size_req = sizeof( int32_t ) * nc0 * ( ic_end - ic_start );
 
-			temp_scal_c_buffer_u8s8s32o32 = bli_mem_buffer( &mem_scale_c );
+				lpgemm_alloc_mem_panel
+				(
+				  mem_scale_c_size_req, BLIS_BUFFER_FOR_C_PANEL,
+				  &mem_scale_c, rntm
+				);
 
-			c_use_jc = ( int32_t* )temp_scal_c_buffer_u8s8s32o32;
+				temp_scal_c_buffer_u8s8s32o32 = bli_mem_buffer( &mem_scale_c );
 
-			if ( beta != 0 )
-			{
-				dim_t i_temp = 0;
-				dim_t j_temp = 0;
-				// Upscale out C to temporary C matrix.
-				for ( dim_t i_dscale = ic_start; i_dscale < ic_end; ++i_dscale )
-				{
-					j_temp = 0;
-					for ( dim_t j_dscale = jc; j_dscale < ( jc + nc0 ); ++j_dscale )
-					{
-						*( temp_scal_c_buffer_u8s8s32o32 +
-								( nc0 * i_temp ) + j_temp ) =
-								( int32_t )( *( ( ( int8_t* )c ) +
-								( rs_c * i_dscale ) + j_dscale ) );
-
-						j_temp++;
-					}
-					i_temp++;
-				}
+				c_use_jc = ( int32_t* )temp_scal_c_buffer_u8s8s32o32;
 			}
 
 			// The temp c buffer stride is modified as opposed to original C matrix.
@@ -183,7 +205,12 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 			// needs to be updated.
 			dim_t kc0_updated = make_multiple_of_n( kc0, 4 );
 
+			// No parallelization in k dim, k always starts at 0.
+			is_first_k = ( pc == 0 ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_first_k = is_first_k;
+
 			is_last_k = ( ( pc + KC ) >= k ) ? ( TRUE ) : ( FALSE );
+			post_ops_attr.is_last_k = is_last_k;
 
 			if ( mtag_b == PACK )
 			{
@@ -239,8 +266,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 				if ( ( jc_packb_end > jc_packb_start ) &&
 					 ( jc_packb_start < ( jc + nc0 ) ) )
 				{
-#ifdef BLIS_KERNELS_ZEN4
-					packb_nr64_u8s8s32o32
+					( ( packb_s32 )lcntx->packb_fun_ptr )
 					(
 					  pack_b_buffer_u8s8s32o32 + ( jc_packb_start * kc0_updated ),
 					  ( b + ( rs_b * pc ) + ( cs_b * jc ) +
@@ -248,11 +274,10 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 					  ( jc_packb_end - jc_packb_start ), kc0,
 					  &rs_b_use, &cs_b_use
 					);
-#endif
 				}
 				else
 				{
-					get_packb_nr64_u8s8s32o32_strides( &rs_b_use, &cs_b_use );
+					lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
 				}
 
 				// All threads in work group should wait till B matrix packing
@@ -274,7 +299,7 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 						( n_sub_updated * pc ) +
 						( jc_cur_loop_rem * kc0_updated );
 
-				get_packb_nr64_u8s8s32o32_strides( &rs_b_use, &cs_b_use );
+				lpgemm_get_packb_strides( lcntx, &rs_b_use, &cs_b_use );
 			}
 			else
 			{
@@ -310,21 +335,19 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 					);
 					pack_a_buffer_u8s8s32o32 = ( uint8_t* )bli_mem_buffer( &mem_a );
 
-#ifdef BLIS_KERNELS_ZEN4
-					packa_k64_u8s8s32o32
+					( ( packa_s32 )lcntx->packa_fun_ptr )
 					(
 					  pack_a_buffer_u8s8s32o32,
 					  ( a + ( rs_a * ic ) + pc ), rs_a,
 					  mc0, kc0,
 					  &rs_a_use, &cs_a_use
 					);
-#endif
 					a_use = pack_a_buffer_u8s8s32o32;
 					a_block_stride = kc0_updated;
 				}
 				else if ( mtag_a == REORDERED )
 				{
-					get_packa_k64_u8s8s32o32_strides( &rs_a_use, &cs_a_use );
+					lpgemm_get_packa_strides( lcntx, &rs_a_use, &cs_a_use );
 					a_use = a + ( pc * m ) + ( kc0_updated * ic );
 					a_block_stride = kc0_updated;
 				}
@@ -343,28 +366,21 @@ LPGEMM_5LOOP(uint8_t,int8_t,int32_t,u8s8s32o32)
 				{
 					dim_t nr0 = bli_min( ( nc0 - jr ), NR );
 
-#ifdef BLIS_KERNELS_ZEN4
+					// Post ops meta attributes.
+					post_ops_attr.post_op_c_i = ic;
+					post_ops_attr.post_op_c_j = ( jc + jr );
+					post_ops_attr.rs_c_downscale = rs_c_downscale;
+
 					// Reorder/Packed B, Reorder/Packed/Unpacked A call.
-					lpgemm_rowvar_u8s8s32o32_6x64
+					( ( lpgemm_rowvar_s32 )lcntx->kern_fun_ptr )
 					(
 					  mc0, nr0, kc0,
 					  a_use, rs_a_use, cs_a_use, a_block_stride,
 					  ( b_use + ( jr * kc0_updated ) ), rs_b_use, cs_b_use,
 					  ( c_use_ic + jr ), rs_c_use, 1,
 					  alpha, beta0,
-					  is_last_k, ic, ( jc + jr ), post_op_list, rs_c_downscale
+					  post_op_list, post_ops_attr
 					);
-#else
-					// Silence compiler warnings.
-					( void )b_use;
-					( void )a_block_stride;
-					( void )rs_c_downscale;
-					( void )is_last_k;
-					( void )c_use_ic;
-					( void )a_use;
-					( void )beta0;
-					( void )nr0;
-#endif
 				}
 			}
 		}
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c
deleted file mode 100644
index 65a4963dcb..0000000000
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c
+++ /dev/null
@@ -1,1146 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS dim_tERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <immintrin.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_f32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 6x64 bf16 kernel
-LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x64_DISABLE,
-						  &&POST_OPS_BIAS_6x64,
-						  &&POST_OPS_RELU_6x64,
-						  &&POST_OPS_RELU_SCALE_6x64,
-						  &&POST_OPS_DOWNSCALE_6x64
-						};
-	dim_t MR = 6;
-	dim_t NR = 64;  
-
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	if ( n0 < NR )
-	{
-		dim_t n0_rem = n0 % 16;
-
-		// Split dim_to multiple smaller fringe kernels, so as to maximize
-		// vectorization. Any n0 < NR(64) can be expressed as n0 = 48 + n`
-		// or n0 = 32 + n` or n0 = 16 + n`, where n` < 16.
-		dim_t n0_48 = n0 / 48;
-		dim_t n0_32 = n0 / 32;
-		dim_t n0_16 = n0 / 16;
-
-		// KC when not multiple of 2 will have padding to make it multiple of
-		// 2 in packed buffer. Also the k0 cannot be passed as the updated
-		// value since A matrix is not packed and requires original k0.
-		dim_t k0_updated = k0;
-		k0_updated += (k0_updated & 0x1);
-
-		if ( n0_48 == 1 )
-		{
-			lpgemm_rowvar_bf16bf16f32of32_6x48
-				(
-				 m0, k0,
-				 a, rs_a, cs_a, ps_a,
-				 b, ( ( rs_b / 4 ) * 3 ), cs_b,
-				 c, rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-
-			b = b + ( 48 * k0_updated ); // k0x48 packed contiguosly.
-			c = c + 48;
-			post_op_c_j += 48;
-		}
-
-		else if ( n0_32 == 1 )
-		{
-			lpgemm_rowvar_bf16bf16f32of32_6x32
-				(
-				 m0, k0,
-				 a, rs_a, cs_a, ps_a,
-				 b, ( ( rs_b / 4 ) * 2 ), cs_b,
-				 c, rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-
-			b = b + ( 32 * k0_updated ); // k0x32 packed contiguosly.
-			c = c + 32;
-			post_op_c_j += 32;
-		}
-
-		else if ( n0_16 == 1 )
-		{
-			lpgemm_rowvar_bf16bf16f32of32_6x16
-				(
-				 m0, k0,
-				 a, rs_a, cs_a, ps_a,
-				 b, ( ( rs_b / 4 ) * 1 ), cs_b,
-				 c, rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-
-			b = b + ( 16 * k0_updated ); // k0x16 packed contiguosly.
-			c = c + 16;
-			post_op_c_j += 16;
-		}
-
-		if ( n0_rem > 0 )
-		{
-			lpgemm_rowvar_bf16bf16f32of32_6xlt16
-				(
-				 m0, k0,
-				 a, rs_a, cs_a, ps_a,
-				 b, ( ( rs_b / 4 ) * 1 ), cs_b,
-				 c, rs_c,
-				 alpha, beta, n0_rem,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-
-			// No leftover fringe after this podint.
-		}
-		return;
-	}
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-	__m512bh b3;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-	__m512bh a_bf16_1;
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512 c_float_0p0 = _mm512_setzero_ps();
-		__m512 c_float_0p1 = _mm512_setzero_ps();
-		__m512 c_float_0p2 = _mm512_setzero_ps();
-		__m512 c_float_0p3 = _mm512_setzero_ps();
-
-		__m512 c_float_1p0 = _mm512_setzero_ps();
-		__m512 c_float_1p1 = _mm512_setzero_ps();
-		__m512 c_float_1p2 = _mm512_setzero_ps();
-		__m512 c_float_1p3 = _mm512_setzero_ps();
-
-		__m512 c_float_2p0 = _mm512_setzero_ps();
-		__m512 c_float_2p1 = _mm512_setzero_ps();
-		__m512 c_float_2p2 = _mm512_setzero_ps();
-		__m512 c_float_2p3 = _mm512_setzero_ps();
-
-		__m512 c_float_3p0 = _mm512_setzero_ps();
-		__m512 c_float_3p1 = _mm512_setzero_ps();
-		__m512 c_float_3p2 = _mm512_setzero_ps();
-		__m512 c_float_3p3 = _mm512_setzero_ps();
-
-		__m512 c_float_4p0 = _mm512_setzero_ps();
-		__m512 c_float_4p1 = _mm512_setzero_ps();
-		__m512 c_float_4p2 = _mm512_setzero_ps();
-		__m512 c_float_4p3 = _mm512_setzero_ps();
-
-		__m512 c_float_5p0 = _mm512_setzero_ps();
-		__m512 c_float_5p1 = _mm512_setzero_ps();
-		__m512 c_float_5p2 = _mm512_setzero_ps();
-		__m512 c_float_5p3 = _mm512_setzero_ps();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// The instructions are arranged in a mixed way to reduce data
-			// chain dependencies.
-
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+2]
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )(a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-			b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );		
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-
-			// Broadcast a[1,kr:kr+2].
-			a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-			c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-
-			// Broadcast a[2,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-			c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-
-			// Broadcast a[3,kr:kr+2].
-			a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-			c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
-
-			// Broadcast a[4,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
-			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
-			c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-
-			// Broadcast a[5,kr:kr+2].
-			a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-			c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-63] = a[5,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_1, b0 );
-			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_1, b1 );
-			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_1, b2 );
-			c_float_5p3 = _mm512_dpbf16_ps( c_float_5p3, a_bf16_1, b3 );
-		}           
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			memcpy
-				(
-				 &a_kfringe_buf,
-				 ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-				 ( k_partial_pieces * sizeof( bfloat16 ) )
-				);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-			b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-
-			// Broadcast a[1,kr:kr+2].
-			memcpy
-				(
-				 &a_kfringe_buf,
-				 ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-				 ( k_partial_pieces * sizeof( bfloat16 ) )
-				);
-			a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-			c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-
-			// Broadcast a[2,kr:kr+2].
-			memcpy
-				(
-				 &a_kfringe_buf,
-				 ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-				 ( k_partial_pieces * sizeof( bfloat16 ) )
-				);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-			c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-
-			// Broadcast a[3,kr:kr+2].
-			memcpy
-				(
-				 &a_kfringe_buf,
-				 ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-				 ( k_partial_pieces * sizeof( bfloat16 ) )
-				);
-			a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-			c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
-
-			// Broadcast a[4,kr:kr+2].
-			memcpy
-				(
-				 &a_kfringe_buf,
-				 ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-				 ( k_partial_pieces * sizeof( bfloat16 ) )
-				);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
-			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
-			c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-
-			// Broadcast a[5,kr:kr+2].
-			memcpy
-				(
-				 &a_kfringe_buf,
-				 ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-				 ( k_partial_pieces * sizeof( bfloat16 ) )
-				);
-			a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-			c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-63] = a[5,kr:kr+2]*b[kr:kr+2,0-63]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_1, b0 );
-			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_1, b1 );
-			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_1, b2 );
-			c_float_5p3 = _mm512_dpbf16_ps( c_float_5p3, a_bf16_1, b3 );
-		}
-
-		// Load alpha and beta
-		__m512 selector1 = _mm512_set1_ps ( alpha );
-		__m512 selector2 = _mm512_set1_ps ( beta );
-
-		// Scale by alpha
-		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-		c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
-
-		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-		c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
-
-		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-		c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
-
-		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-		c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
-		c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 );
-
-		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-		c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
-		c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
-		c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 );
-
-		c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
-		c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 );
-		c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 );
-		c_float_5p3 = _mm512_mul_ps( selector1, c_float_5p3 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-			// c[1,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-			// c[1,48-63]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-			// c[2,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-			// c[2,48-63]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-			// c[3,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 );
-
-			// c[3,48-63]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p3 = _mm512_add_ps( selector1, c_float_3p3 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[4,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 );
-
-			// c[4,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 );
-
-			// c[4,48-63]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p3 = _mm512_add_ps( selector1, c_float_4p3 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-
-			// c[5,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p1 = _mm512_add_ps( selector1, c_float_5p1 );
-
-			// c[5,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p2 = _mm512_add_ps( selector1, c_float_5p2 );
-
-			// c[5,48-63]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p3 = _mm512_add_ps( selector1, c_float_5p3 );			
-		}
-		// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x64:
-		{
-			__m512 selector3;
-			__m512 selector4;
-
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				selector1 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 0 * 16 ) );
-				selector2 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 1 * 16 ) );
-				selector3 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 2 * 16 ) );
-				selector4 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 3 * 16 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[0, 16-31]
-				c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-				// c[0,32-47]
-				c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-				// c[0,48-63]
-				c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[1, 16-31]
-				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-				// c[1,32-47]
-				c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-				// c[1,48-63]
-				c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[2, 16-31]
-				c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-				// c[2,32-47]
-				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-				// c[2,48-63]
-				c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-				// c[3, 16-31]
-				c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-				// c[3,32-47]
-				c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
-
-				// c[3,48-63]
-				c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-				// c[4, 16-31]
-				c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
-
-				// c[4,32-47]
-				c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
-
-				// c[4,48-63]
-				c_float_4p3 = _mm512_add_ps( selector4, c_float_4p3 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-
-				// c[5, 16-31]
-				c_float_5p1 = _mm512_add_ps( selector2, c_float_5p1 );
-
-				// c[5,32-47]
-				c_float_5p2 = _mm512_add_ps( selector3, c_float_5p2 );
-
-				// c[5,48-63]
-				c_float_5p3 = _mm512_add_ps( selector4, c_float_5p3 );
-			}
-			else
-			{
-				// If original output was columns major, then by the time
-				// kernel sees it, the matrix would be accessed as if it were
-				// transposed. Due to this the bias array will be accessed by
-				// the ic index, and each bias element corresponds to an
-				// entire row of the transposed output array, instead of an
-				// entire column.
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_i + 1 ) );
-				selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_i + 2 ) );
-				selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_i + 3 ) );
-				__m512 selector5 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_i + 4 ) );
-				__m512 selector6 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-									post_op_c_i + 5 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[0, 16-31]
-				c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-				// c[0,32-47]
-				c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-				// c[0,48-63]
-				c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[1, 16-31]
-				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-				// c[1,32-47]
-				c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-				// c[1,48-63]
-				c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[2, 16-31]
-				c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-				// c[2,32-47]
-				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-				// c[2,48-63]
-				c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-				// c[3, 16-31]
-				c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-				// c[3,32-47]
-				c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
-
-				// c[3,48-63]
-				c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-				// c[4, 16-31]
-				c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
-
-				// c[4,32-47]
-				c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
-
-				// c[4,48-63]
-				c_float_4p3 = _mm512_add_ps( selector5, c_float_4p3 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
-
-				// c[5, 16-31]
-				c_float_5p1 = _mm512_add_ps( selector6, c_float_5p1 );
-
-				// c[5,32-47]
-				c_float_5p2 = _mm512_add_ps( selector6, c_float_5p2 );
-
-				// c[5,48-63]
-				c_float_5p3 = _mm512_add_ps( selector6, c_float_5p3 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x64:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[1,16-31]
-			c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[2,16-31]
-			c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			// c[3,16-31]
-			c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
-
-			// c[3,48-63]
-			c_float_3p3 = _mm512_max_ps( selector1, c_float_3p3 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-			// c[4,16-31]
-			c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
-
-			// c[4,32-47]
-			c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
-
-			// c[4,48-63]
-			c_float_4p3 = _mm512_max_ps( selector1, c_float_4p3 );
-
-			// c[5,0-15]
-			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
-
-			// c[5,16-31]
-			c_float_5p1 = _mm512_max_ps( selector1, c_float_5p1 );
-
-			// c[5,32-47]
-			c_float_5p2 = _mm512_max_ps( selector1, c_float_5p2 );
-
-			// c[5,48-63]
-			c_float_5p3 = _mm512_max_ps( selector1, c_float_5p3 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x64:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[0, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-			// c[0, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-			// c[0, 48-63]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p3)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[1, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-			// c[1, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-			// c[1, 48-63]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p3)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[2, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-			// c[2, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-			// c[2, 48-63]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p3)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			// c[3, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-			// c[3, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p2)
-
-			// c[3, 48-63]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p3)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-			// c[4, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p1)
-
-			// c[4, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p2)
-
-			// c[4, 48-63]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p3)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
-
-			// c[5, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p1)
-
-			// c[5, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p2)
-
-			// c[5, 48-63]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p3)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}   
-POST_OPS_DOWNSCALE_6x64:
-{
-	        // c[0, 0-15]
-			CVT_F32_BF16(c_float_0p0,0,0);
-
-			// c[0, 16-31]
-			CVT_F32_BF16(c_float_0p1,0,1);
-
-			// c[0, 32-47]
-			CVT_F32_BF16(c_float_0p2,0,2);
-
-			// c[0, 48-63]
-			CVT_F32_BF16(c_float_0p3,0,3);
-
-			// c[1, 0-15]
-			CVT_F32_BF16(c_float_1p0,1,0);
-
-			// c[1, 16-31]
-			CVT_F32_BF16(c_float_1p1,1,1);
-
-			// c[1, 32-47]
-			CVT_F32_BF16(c_float_1p2,1,2);
-
-			// c[1, 48-63]
-			CVT_F32_BF16(c_float_1p3,1,3);
-
-			// c[2, 0-15]
-			CVT_F32_BF16(c_float_2p0,2,0);
-
-			// c[2, 16-31]
-			CVT_F32_BF16(c_float_2p1,2,1);
-
-			// c[2, 32-47]
-			CVT_F32_BF16(c_float_2p2,2,2);
-
-			// c[2, 48-63]
-			CVT_F32_BF16(c_float_2p3,2,3);
-
-			// c[3, 0-15]
-			CVT_F32_BF16(c_float_3p0,3,0);
-
-			// c[3, 16-31]
-			CVT_F32_BF16(c_float_3p1,3,1);
-
-			// c[3, 32-47]
-			CVT_F32_BF16(c_float_3p2,3,2);
-
-			// c[3, 48-63]
-			CVT_F32_BF16(c_float_3p3,3,3);
-
-			// c[4, 0-15]
-			CVT_F32_BF16(c_float_4p0,4,0);
-
-			// c[4, 16-31]
-			CVT_F32_BF16(c_float_4p1,4,1);
-
-			// c[4, 32-47]
-			CVT_F32_BF16(c_float_4p2,4,2);
-
-			// c[4, 48-63]
-			CVT_F32_BF16(c_float_4p3,4,3);
-
-			// c[5, 0-15]
-			CVT_F32_BF16(c_float_5p0,5,0);
-
-			// c[5, 16-31]
-			CVT_F32_BF16(c_float_5p1,5,1);
-
-			// c[5, 32-47]
-			CVT_F32_BF16(c_float_5p2,5,2);
-
-			// c[5, 48-63]
-			CVT_F32_BF16(c_float_5p3,5,3);
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-}
-
-POST_OPS_6x64_DISABLE:
-		;
-
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
-
-		// c[0, 16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 );
-
-		// c[0,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 );
-
-		// c[0,48-63]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_float_0p3 );
-
-		// c[1,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
-
-		// c[1,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 );
-
-		// c[1,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 );
-
-		// c[1,48-63]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_float_1p3 );
-
-		// c[2,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
-
-		// c[2,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 );
-
-		// c[2,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 );
-
-		// c[2,48-63]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_float_2p3 );
-
-		// c[3,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
-
-		// c[3,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 );
-
-		// c[3,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 );
-
-		// c[3,48-63]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_float_3p3 );
-
-		// c[4,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
-
-		// c[4,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 );
-
-		// c[4,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 );
-
-		// c[4,48-63]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_float_4p3 );
-
-		// c[5,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
-
-		// c[5,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 );
-
-		// c[5,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 );
-
-		// c[5,48-63]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_float_5p3 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			// In cases where A matrix is packed cs_a is set to 12, since the
-			// next column in a given row is accessed after 2*6 elements, where
-			// 6 is MR and 2 elements are broadcasted each time from A (bf16).
-			// In fringe case, where m < MR, the next column will be after m'*2
-			// elements, and subsequently following adjustment of cs_a is
-			// required before calling m fringe kernels.
-			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_bf16bf16f32of32_5x64
-				(
-				 k0,
-				 a, rs_a, cs_a_use,
-				 b, rs_b, cs_b,
-				 ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-		}		
-		else if ( m_partial_pieces == 4 )
-		{
-			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_bf16bf16f32of32_4x64
-				(
-				 k0,
-				 a, rs_a, cs_a_use,
-				 b, rs_b, cs_b,
-				 ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-		}		
-		else if ( m_partial_pieces == 3 )
-		{
-			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_bf16bf16f32of32_3x64
-				(
-				 k0,
-				 a, rs_a, cs_a_use,
-				 b, rs_b, cs_b,
-				 ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-		}		
-		else if ( m_partial_pieces == 2 )
-		{
-			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_bf16bf16f32of32_2x64
-				(
-				 k0,
-				 a, rs_a, cs_a_use,
-				 b, rs_b, cs_b,
-				 ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-		}		
-		else if ( m_partial_pieces == 1 )
-		{
-			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_bf16bf16f32of32_1x64
-				(
-				 k0,
-				 a, rs_a, cs_a_use,
-				 b, rs_b, cs_b,
-				 ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-				 alpha, beta,
-				 is_last_k,
-			     post_op_c_i, post_op_c_j,
-			     post_ops_list, rs_c_downscale
-				);
-		}		
-	}
-}
-#endif
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c
deleted file mode 100644
index 1a37ab071a..0000000000
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c
+++ /dev/null
@@ -1,2502 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include <immintrin.h>
-#include <string.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_f32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 6xlt16 bf16 fringe kernel
-LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6xLT16_DISABLE,
-						  &&POST_OPS_BIAS_6xLT16,
-						  &&POST_OPS_RELU_6xLT16,
-						  &&POST_OPS_RELU_SCALE_6xLT16,
-						  &&POST_OPS_DOWNSCALE_6xLT16
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// For corner cases.
-	float buf0[16];
-	float buf1[16];
-	float buf2[16];
-	float buf3[16];
-	float buf4[16];
-	float buf5[16];
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512 c_float_0p0 = _mm512_setzero_ps();
-
-		__m512 c_float_1p0 = _mm512_setzero_ps();
-
-		__m512 c_float_2p0 = _mm512_setzero_ps();
-		
-		__m512 c_float_3p0 = _mm512_setzero_ps();
-
-		__m512 c_float_4p0 = _mm512_setzero_ps();
-
-		__m512 c_float_5p0 = _mm512_setzero_ps();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 2 rows with 16 extended elements each from B to 1 ZMM
-			// registers. It is to be noted that the B matrix is packed for use
-			// in bf16 instructions and each load to ZMM register will have 2
-			// elements along k direction and 16 elements across n directions,
-			// so 2x16 elements to a ZMM register.
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			
-			// Broadcast a[1,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			
-			// Broadcast a[2,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			
-			// Broadcast a[3,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			
-			// Broadcast a[4,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			
-			// Broadcast a[5,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-		}
-        
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			
-			// Broadcast a[1,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			
-			// Broadcast a[2,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			
-			// Broadcast a[3,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			
-			// Broadcast a[4,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			
-			// Broadcast a[5,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-		}
-        
-		// Load alpha and beta
-		__m512 selector1 = _mm512_set1_ps( alpha );
-		__m512 selector2 = _mm512_set1_ps( beta );
-
-		// Scale by alpha
-		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-		
-		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-		
-		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-		
-		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-		
-		c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * ( ir + 0 ) ) ), ( n0_rem * sizeof( float ) ) );
-			memcpy( buf1, ( c + ( rs_c * ( ir + 1 ) ) ), ( n0_rem * sizeof( float ) ) );
-			memcpy( buf2, ( c + ( rs_c * ( ir + 2 ) ) ), ( n0_rem * sizeof( float ) ) );
-			memcpy( buf3, ( c + ( rs_c * ( ir + 3 ) ) ), ( n0_rem * sizeof( float) ) );
-			memcpy( buf4, ( c + ( rs_c * ( ir + 4 ) ) ), ( n0_rem * sizeof( float ) ) );
-			memcpy( buf5, ( c + ( rs_c * ( ir + 5 ) ) ), ( n0_rem * sizeof( float ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_ps( buf0 );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_ps( buf1 );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_ps( buf2 );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_ps( buf3 );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_ps( buf4 );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_ps( buf5  );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-		}
-		// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6xLT16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-				selector1 = _mm512_loadu_ps( buf0 );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				__m512 selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-				__m512 selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 3 ) );
-				__m512 selector5 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 4 ) );
-				__m512 selector6 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 5 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-			// c[5,0-15]
-			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6xLT16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16_LT16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16_LT16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16_LT16(c_float_2p0,2,0);
-
-		// c[3, 0-15]
-		CVT_F32_BF16_LT16(c_float_3p0,3,0);
-
-		// c[4, 0-15]
-		CVT_F32_BF16_LT16(c_float_4p0,4,0);
-
-		// c[5, 0-15]
-		CVT_F32_BF16_LT16(c_float_5p0,5,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_6xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_ps( buf0, c_float_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_ps( buf1, c_float_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_ps( buf2, c_float_2p0 );
-
-		// c[3,0-15]
-		_mm512_storeu_ps( buf3, c_float_3p0 );
-
-		// c[4,0-15]
-		_mm512_storeu_ps( buf4, c_float_4p0 );
-
-		// c[5,0-15]
-		_mm512_storeu_ps( buf5, c_float_5p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) );
-
-		// c[1,0-15]
-		memcpy( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) );
-
-		// c[2,0-15]
-		memcpy( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) );
-
-		// c[3,0-15]
-		memcpy( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), buf3, ( n0_rem * sizeof( float ) ) );
-
-		// c[4,0-15]
-		memcpy( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), buf4, ( n0_rem * sizeof( float ) ) );
-
-		// c[5,0-15]
-		memcpy( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), buf5, ( n0_rem * sizeof( float ) ) );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-    
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_bf16bf16f32of32_5xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 4 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_bf16bf16f32of32_4xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 3 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_bf16bf16f32of32_3xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 2 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_bf16bf16f32of32_2xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 1 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_bf16bf16f32of32_1xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-	}    
-}
-
-// 6x16 bf16 fringe kernel
-LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x16_DISABLE,
-						  &&POST_OPS_BIAS_6x16,
-						  &&POST_OPS_RELU_6x16,
-						  &&POST_OPS_RELU_SCALE_6x16,
-						  &&POST_OPS_DOWNSCALE_6x16
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-	
-	// B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512 c_float_0p0 = _mm512_setzero_ps();
-
-		__m512 c_float_1p0 = _mm512_setzero_ps();
-
-		__m512 c_float_2p0 = _mm512_setzero_ps();
-		
-		__m512 c_float_3p0 = _mm512_setzero_ps();
-
-		__m512 c_float_4p0 = _mm512_setzero_ps();
-
-		__m512 c_float_5p0 = _mm512_setzero_ps();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 2 rows with 16 elements each from B to 1 ZMM registers. It
-			// is to be noted that the B matrix is packed for use in bf16
-			// instructions and each load to ZMM register will have 2 elements
-			// along k direction and 16 elements across n directions, so 2x16
-			// elements to a ZMM register.
-		    b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			
-			// Broadcast a[1,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			
-			// Broadcast a[2,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			
-			// Broadcast a[3,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			
-			// Broadcast a[4,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			
-			// Broadcast a[5,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-		}
-		// Handle k remainder.
-		
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			
-			// Broadcast a[1,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			
-			// Broadcast a[2,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			
-			// Broadcast a[3,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			
-			// Broadcast a[4,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			
-			// Broadcast a[5,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-		}
-		
-		// Load alpha and beta
-		__m512 selector1 = _mm512_set1_ps( alpha );
-		__m512 selector2 = _mm512_set1_ps( beta );
-
-		// Scale by alpha
-		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-		
-		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-		
-		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-		
-		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-		
-		c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-		}
-		// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				selector1 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				__m512 selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-				__m512 selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 3 ) );
-				__m512 selector5 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 4 ) );
-				__m512 selector6 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 5 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-			// c[5,0-15]
-			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6x16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		// c[5, 0-15]
-		CVT_F32_BF16(c_float_5p0,5,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_6x16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
-
-		// c[3,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
-
-		// c[4,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
-
-		// c[5,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-    
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_bf16bf16f32of32_5x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 4 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_bf16bf16f32of32_4x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 3 )
-		{
-			int cs_a_use = ( cs_a == 2) ? 2 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_bf16bf16f32of32_3x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 2 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_bf16bf16f32of32_2x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 1 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_bf16bf16f32of32_1x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-	}	
-}
-
-// 6x32 bf16 fringe kernel
-LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x32_DISABLE,
-						  &&POST_OPS_BIAS_6x32,
-						  &&POST_OPS_RELU_6x32,
-						  &&POST_OPS_RELU_SCALE_6x32,
-						  &&POST_OPS_DOWNSCALE_6x32
-						};
-	dim_t MR = 6; 
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512 c_float_0p0 = _mm512_setzero_ps();
-		__m512 c_float_0p1 = _mm512_setzero_ps();
-
-		__m512 c_float_1p0 = _mm512_setzero_ps();
-		__m512 c_float_1p1 = _mm512_setzero_ps();
-
-		__m512 c_float_2p0 = _mm512_setzero_ps();
-		__m512 c_float_2p1 = _mm512_setzero_ps();
-		
-		__m512 c_float_3p0 = _mm512_setzero_ps();
-		__m512 c_float_3p1 = _mm512_setzero_ps();
-
-		__m512 c_float_4p0 = _mm512_setzero_ps();
-		__m512 c_float_4p1 = _mm512_setzero_ps();
-
-		__m512 c_float_5p0 = _mm512_setzero_ps();
-		__m512 c_float_5p1 = _mm512_setzero_ps();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 2 rows with 32 elements each from B to 2 ZMM registers. It
-			// is to be noted that the B matrix is packed for use in bf16
-			// instructions and each load to ZMM register will have 2 elements
-			// along k direction and 32 elements across n directions, so 2x16
-			// elements to a ZMM register.
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-			
-			// Broadcast a[1,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-			
-			// Broadcast a[2,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-			
-			// Broadcast a[3,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-			
-			// Broadcast a[4,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-			
-			// Broadcast a[5,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-31] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
-		}		
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-			
-			// Broadcast a[1,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-			
-			// Broadcast a[2,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-			
-			// Broadcast a[3,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-			
-			// Broadcast a[4,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-			
-			// Broadcast a[5,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-31] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
-		}      
-		// Load alpha and beta
-		__m512 selector1 = _mm512_set1_ps( alpha );
-		__m512 selector2 = _mm512_set1_ps( beta );
-
-		// Scale by alpha
-		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-
-		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-		
-		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-		
-		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-		
-		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-		c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
-		
-		c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
-		c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[4,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-
-			// c[5,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p1 = _mm512_add_ps( selector1, c_float_5p1 );
-		}
-		// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x32:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				selector1 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-				selector2 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[0, 16-31]
-				c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[1, 16-31]
-				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[2, 16-31]
-				c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-				// c[3, 16-31]
-				c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-				// c[4, 16-31]
-				c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-
-				// c[5, 16-31]
-				c_float_5p1 = _mm512_add_ps( selector2, c_float_5p1 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				__m512 selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-				__m512 selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 3 ) );
-				__m512 selector5 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 4 ) );
-				__m512 selector6 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 5 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[0, 16-31]
-				c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[1, 16-31]
-				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[2, 16-31]
-				c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-				// c[3, 16-31]
-				c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-				// c[4, 16-31]
-				c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
-
-				// c[5, 16-31]
-				c_float_5p1 = _mm512_add_ps( selector6, c_float_5p1 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x32:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[1,16-31]
-			c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[2,16-31]
-			c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			// c[3,16-31]
-			c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-			// c[4,16-31]
-			c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
-
-			// c[5,0-15]
-			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
-
-			// c[5,16-31]
-			c_float_5p1 = _mm512_max_ps( selector1, c_float_5p1 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x32:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[0, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[1, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[2, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			// c[3, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-			// c[4, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p1)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
-
-			// c[5, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p1)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6x32:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		// c[4, 16-31]
-		CVT_F32_BF16(c_float_4p1,4,1);
-
-		// c[5, 0-15]
-		CVT_F32_BF16(c_float_5p0,5,0);
-
-		// c[5, 16-31]
-		CVT_F32_BF16(c_float_5p1,5,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_6x32_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
-
-		// c[0, 16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 );
-
-		// c[1,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
-
-		// c[1,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 );
-
-		// c[2,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
-
-		// c[2,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 );
-
-		// c[3,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
-
-		// c[3,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 );
-
-		// c[4,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
-
-		// c[4,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 );
-
-		// c[5,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
-
-		// c[5,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-    
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_bf16bf16f32of32_5x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 4 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_bf16bf16f32of32_4x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 3 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_bf16bf16f32of32_3x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 2 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_bf16bf16f32of32_2x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 1 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_bf16bf16f32of32_1x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-	}	
-}
-
-// 6x48 bf16 fringe kernel
-LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x48_DISABLE,
-						  &&POST_OPS_BIAS_6x48,
-						  &&POST_OPS_RELU_6x48,
-						  &&POST_OPS_RELU_SCALE_6x48,
-						  &&POST_OPS_DOWNSCALE_6x48
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-    
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512 c_float_0p0 = _mm512_setzero_ps();
-		__m512 c_float_0p1 = _mm512_setzero_ps();
-		__m512 c_float_0p2 = _mm512_setzero_ps();
-
-		__m512 c_float_1p0 = _mm512_setzero_ps();
-		__m512 c_float_1p1 = _mm512_setzero_ps();
-		__m512 c_float_1p2 = _mm512_setzero_ps();
-
-		__m512 c_float_2p0 = _mm512_setzero_ps();
-		__m512 c_float_2p1 = _mm512_setzero_ps();
-		__m512 c_float_2p2 = _mm512_setzero_ps();
-		
-		__m512 c_float_3p0 = _mm512_setzero_ps();
-		__m512 c_float_3p1 = _mm512_setzero_ps();
-		__m512 c_float_3p2 = _mm512_setzero_ps();
-
-		__m512 c_float_4p0 = _mm512_setzero_ps();
-		__m512 c_float_4p1 = _mm512_setzero_ps();
-		__m512 c_float_4p2 = _mm512_setzero_ps();
-
-		__m512 c_float_5p0 = _mm512_setzero_ps();
-		__m512 c_float_5p1 = _mm512_setzero_ps();
-		__m512 c_float_5p2 = _mm512_setzero_ps();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			
-			// Load 2 rows with 48 elements each from B to 3 ZMM registers. It
-			// is to be noted that the B matrix is packed for use in bf16
-			// instructions and each load to ZMM register will have 2 elements
-			// along k direction and 16 elements across n directions, so 2x16
-			// elements to a ZMM register.
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-			// Broadcast a[0,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-			
-			// Broadcast a[1,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-			
-			// Broadcast a[2,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-			
-			// Broadcast a[3,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
-			
-			// Broadcast a[4,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-			
-			// Broadcast a[5,kr:kr+2].
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-47] = a[5,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
-			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_0, b2 );
-
-		}
-		// Handle k remainder.		
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-			
-			// Broadcast a[1,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-			
-			// Broadcast a[2,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-			
-			// Broadcast a[3,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
-			
-			// Broadcast a[4,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-			
-			// Broadcast a[5,kr:kr+2].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( bfloat16 ) )
-			);
-			a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 2.
-			// c[5,0-47] = a[5,kr:kr+2]*b[kr:kr+2,0-47]
-			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
-			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
-			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_0, b2 );
-		}
-        
-		// Load alpha and beta
-		__m512 selector1 = _mm512_set1_ps( alpha );
-		__m512 selector2 = _mm512_set1_ps( beta );
-
-		// Scale by alpha
-		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-
-		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-		
-		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-		
-		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-		c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
-		
-		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-		c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
-		c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
-		
-		c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
-		c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 );
-		c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-			// c[1,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-			// c[2,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-			// c[3,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[4,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 );
-
-			// c[4,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-
-			// c[5,16-31]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p1 = _mm512_add_ps( selector1, c_float_5p1 );
-
-			// c[5,32-47]
-			selector1 = _mm512_loadu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mul_ps( selector2, selector1 );
-			c_float_5p2 = _mm512_add_ps( selector1, c_float_5p2 );
-		}
-		// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x48:
-		{
-			__m512 selector3;
-
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				selector1 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-				selector2 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-				selector3 =
-					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[0, 16-31]
-				c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-				// c[0,32-47]
-				c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[1, 16-31]
-				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-				// c[1,32-47]
-				c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[2, 16-31]
-				c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-				// c[2,32-47]
-				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-				// c[3, 16-31]
-				c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-				// c[3,32-47]
-				c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-				// c[4, 16-31]
-				c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
-
-				// c[4,32-47]
-				c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
-
-				// c[5, 16-31]
-				c_float_5p1 = _mm512_add_ps( selector2, c_float_5p1 );
-
-				// c[5,32-47]
-				c_float_5p2 = _mm512_add_ps( selector3, c_float_5p2 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-				__m512 selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 3 ) );
-				__m512 selector5 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 4 ) );
-				__m512 selector6 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 5 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[0, 16-31]
-				c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-				// c[0,32-47]
-				c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[1, 16-31]
-				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-				// c[1,32-47]
-				c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[2, 16-31]
-				c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-				// c[2,32-47]
-				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-				// c[3, 16-31]
-				c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-				// c[3,32-47]
-				c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-				// c[4, 16-31]
-				c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
-
-				// c[4,32-47]
-				c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
-
-				// c[5,0-15]
-				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
-
-				// c[5, 16-31]
-				c_float_5p1 = _mm512_add_ps( selector6, c_float_5p1 );
-
-				// c[5,32-47]
-				c_float_5p2 = _mm512_add_ps( selector6, c_float_5p2 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x48:
-		{
-			//printf("relu\n");
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[1,16-31]
-			c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[2,16-31]
-			c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			// c[3,16-31]
-			c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-			// c[4,16-31]
-			c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
-
-			// c[4,32-47]
-			c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
-
-			// c[5,0-15]
-			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
-
-			// c[5,16-31]
-			c_float_5p1 = _mm512_max_ps( selector1, c_float_5p1 );
-
-			// c[5,32-47]
-			c_float_5p2 = _mm512_max_ps( selector1, c_float_5p2 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x48:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[0, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-			// c[0, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[1, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-			// c[1, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[2, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-			// c[2, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			// c[3, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-			// c[3, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p2)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-			// c[4, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p1)
-
-			// c[4, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p2)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
-
-			// c[5, 16-31]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p1)
-
-			// c[5, 32-47]
-			RELU_SCALE_OP_F32_AVX512(c_float_5p2)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6x48:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[3, 32-47]
-		CVT_F32_BF16(c_float_3p2,3,2);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		// c[4, 16-31]
-		CVT_F32_BF16(c_float_4p1,4,1);
-
-		// c[4, 32-47]
-		CVT_F32_BF16(c_float_4p2,4,2);
-
-		// c[5, 0-15]
-		CVT_F32_BF16(c_float_5p0,5,0);
-
-		// c[5, 16-31]
-		CVT_F32_BF16(c_float_5p1,5,1);
-
-		// c[5, 32-47]
-		CVT_F32_BF16(c_float_5p2,5,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_6x48_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]	
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
-
-		// c[0, 16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 );
-       
-		// c[0,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 );
-        
-		// c[1,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
-        
-		// c[1,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 );
-
-		// c[1,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 );
-
-		// c[2,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
-
-		// c[2,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 );
-
-		// c[2,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 );
-
-		// c[3,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
-
-		// c[3,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 );
-
-		// c[3,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 );
-
-		// c[4,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
-
-		// c[4,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 );
-
-		// c[4,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 );
-
-		// c[5,0-15]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
-
-		// c[5,16-31]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 );
-
-		// c[5,32-47]
-		_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-		
-	}
-    
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_bf16bf16f32of32_5x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 4 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_bf16bf16f32of32_4x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 3 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_bf16bf16f32of32_3x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 2 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_bf16bf16f32of32_2x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-		else if ( m_partial_pieces == 1 )
-		{
-			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_bf16bf16f32of32_1x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}		
-	}	
-}
-#endif
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h
index 07b22a5b25..db5d31e513 100644
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h
+++ b/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16.h
@@ -1,67 +1,72 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef BLIS_GEMM_BF16_PACKB
-#define BLIS_GEMM_BF16_PACKB
-
-#include "lpgemm_kernels.h"
-
-BLIS_INLINE dim_t get_packb_bf16bf16f32of32_min_NR()
-{
-	// This is the minimum NR' required for use in bf16bf16f32 kernels. The idea
-	// here is that since k needs to be a multiple of 2 (BF16 instr), NR'=16
-	// results in total of 2 * NR' = 64 bytes to be loaded, which fits in 1 ZMM
-	// register. Thus the smallest n fringe kernel dimension has n=16, and thus
-	// any rounding for buffer sizes should be to 16.
-	return 16;
-}
-
-void get_packb_nr64_bf16bf16f32of32_strides
-     (
-       dim_t* rs_b,
-       dim_t* cs_b
-     );
-
-void packb_nr64_bf16bf16f32of32
-     ( 
-       bfloat16*       pack_b_buffer_bf16bf16f32of32,
-       const bfloat16* b,
-       const dim_t     ldb,
-       const dim_t     NC,
-       const dim_t     KC,
-       dim_t*          rs_b,
-       dim_t*          cs_b
-     );
-
-#endif //BLIS_GEMM_BF16_PACKB
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_GEMM_BF16_PACKB
+#define BLIS_GEMM_BF16_PACKB
+
+#include "aocl_bf16_type.h"
+
+BLIS_INLINE dim_t get_packb_bf16bf16f32of32_min_NR( void )
+{
+	// Minimum NR' usable by the bf16bf16f32 kernels. k needs to be a multiple
+	// of 2 (BF16 instr), so NR'=16 gives 2 * NR' = 32 bf16 elements, i.e.
+	// 64 bytes to be loaded, which fits in 1 ZMM register. Thus the smallest
+	// n fringe kernel dimension has n=16, and any rounding for buffer sizes
+	// should be to 16.
+	return 16;
+}
+
+typedef void (*packb_bf16) // Signature matches packb_nr64_bf16bf16f32of32; parameter names borrowed from it.
+     (
+       bfloat16*       pack_b_buffer_bf16bf16f32of32,
+       const bfloat16* b,
+       const dim_t     ldb,
+       const dim_t     NC,
+       const dim_t     KC,
+       dim_t*          rs_b,
+       dim_t*          cs_b
+     );
+
+void packb_nr64_bf16bf16f32of32 // Declaration only; the definition is not part of this header.
+     (
+       bfloat16*       pack_b_buffer_bf16bf16f32of32, // non-const: presumably the packed output buffer - confirm
+       const bfloat16* b, // read-only bf16 input
+       const dim_t     ldb, // presumably the leading dimension of b - confirm
+       const dim_t     NC,
+       const dim_t     KC,
+       dim_t*          rs_b, // non-const pointer: presumably receives a stride of the packed buffer - confirm
+       dim_t*          cs_b // same caveat as rs_b
+     );
+
+#endif //BLIS_GEMM_BF16_PACKB
diff --git a/addon/aocl_gemm/kernels/lpgemm_kernels.h b/addon/aocl_gemm/kernels/lpgemm_kernels.h
index 7b73ba27e9..add69df94f 100644
--- a/addon/aocl_gemm/kernels/lpgemm_kernels.h
+++ b/addon/aocl_gemm/kernels/lpgemm_kernels.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -38,54 +38,69 @@
 #include "lpgemm_post_ops.h"
 #include "aocl_bf16_type.h"
 
+typedef void (*lpgemm_m_fringe_f32_ker_ft) // Function-pointer type; same parameter list as LPGEMM_M_FRINGE_KERN(float,float,float,...) below.
+    (
+       const dim_t         k0,
+       const float*        a,
+       const dim_t         rs_a,
+       const dim_t         cs_a,
+       const float*        b,
+       const dim_t         rs_b,
+       const dim_t         cs_b,
+       float*              c,
+       const dim_t         rs_c,
+       const float         alpha,
+       const float         beta,
+       lpgemm_post_op*     post_ops_list,
+       lpgemm_post_op_attr post_ops_attr 
+    );
+
 #define LPGEMM_MAIN_KERN(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
      ( \
-       const dim_t     m0, \
-       const dim_t     n0, \
-       const dim_t     k0, \
-       const A_type*   a, \
-       const dim_t     rs_a, \
-       const dim_t     cs_a, \
-       const dim_t     ps_a, \
-       const B_type*   b, \
-       const dim_t     rs_b, \
-       const dim_t     cs_b, \
-       C_type*         c, \
-       const dim_t     rs_c, \
-       const dim_t     cs_c, \
-       const C_type    alpha, \
-       const C_type    beta, \
-       bool            is_last_k, \
-       dim_t           post_op_c_i, \
-       dim_t           post_op_c_j, \
-       lpgemm_post_op* post_ops_list, \
-       const dim_t     rs_c_downscale \
+       const dim_t         m0, \
+       const dim_t         n0, \
+       const dim_t         k0, \
+       const A_type*       a, \
+       const dim_t         rs_a, \
+       const dim_t         cs_a, \
+       const dim_t         ps_a, \
+       const B_type*       b, \
+       const dim_t         rs_b, \
+       const dim_t         cs_b, \
+       C_type*             c, \
+       const dim_t         rs_c, \
+       const dim_t         cs_c, \
+       const C_type        alpha, \
+       const C_type        beta, \
+       lpgemm_post_op*     post_ops_list, \
+       lpgemm_post_op_attr post_ops_attr \
      ) \
 
 LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64);
 LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32);
 LPGEMM_MAIN_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_6x64);
+LPGEMM_MAIN_KERN(float,float,float,f32f32f32of32_6x16m);
+LPGEMM_MAIN_KERN(float,float,float,f32f32f32of32_avx512_6x64m);
+LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64);
+LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32);
 
 #define LPGEMM_M_FRINGE_KERN(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
      ( \
-       const dim_t     k0, \
-       const A_type*   a, \
-       const dim_t     rs_a, \
-       const dim_t     cs_a, \
-       const B_type*   b, \
-       const dim_t     rs_b, \
-       const dim_t     cs_b, \
-       C_type*         c, \
-       const dim_t     rs_c, \
-       const C_type    alpha, \
-       const C_type    beta, \
-       bool            is_last_k, \
-       dim_t           post_op_c_i, \
-       dim_t           post_op_c_j, \
-       lpgemm_post_op* post_ops_list, \
-       const dim_t     rs_c_downscale \
+       const dim_t         k0, \
+       const A_type*       a, \
+       const dim_t         rs_a, \
+       const dim_t         cs_a, \
+       const B_type*       b, \
+       const dim_t         rs_b, \
+       const dim_t         cs_b, \
+       C_type*             c, \
+       const dim_t         rs_c, \
+       const C_type        alpha, \
+       const C_type        beta, \
+       lpgemm_post_op*     post_ops_list, \
+       lpgemm_post_op_attr post_ops_attr \
      ) \
 
 LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64);
@@ -104,31 +119,81 @@ LPGEMM_M_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_3x64);
 LPGEMM_M_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_2x64);
 LPGEMM_M_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_1x64);
 
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_5x64);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x64);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x64);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x64);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_1x64);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_5x48);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x48);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x48);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x48);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_1x48);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_5x32);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x32);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x32);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x32);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_1x32);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x16);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x16);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x16);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x16);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x16);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x8);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x8);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x8);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x8);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x8);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x4);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x4);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x4);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x4);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x4);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x2);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x2);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x2);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x2);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x2);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x1);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x1);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x1);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x1);
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x1);
+
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64);
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64);
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64);
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64);
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64);
+
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32);
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32);
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32);
+
 #define LPGEMM_N_FRINGE_KERN(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
      ( \
-       const dim_t     m0, \
-       const dim_t     k0, \
-       const A_type*   a, \
-       const dim_t     rs_a, \
-       const dim_t     cs_a, \
-       const dim_t     ps_a, \
-       const B_type*   b, \
-       const dim_t     rs_b, \
-       const dim_t     cs_b, \
-       C_type*         c, \
-       const dim_t     rs_c, \
-       const C_type    alpha, \
-       const C_type    beta, \
-       bool            is_last_k, \
-       dim_t           post_op_c_i, \
-       dim_t           post_op_c_j, \
-       lpgemm_post_op* post_ops_list, \
-       const dim_t     rs_c_downscale \
+       const dim_t         m0, \
+       const dim_t         k0, \
+       const A_type*       a, \
+       const dim_t         rs_a, \
+       const dim_t         cs_a, \
+       const dim_t         ps_a, \
+       const B_type*       b, \
+       const dim_t         rs_b, \
+       const dim_t         cs_b, \
+       C_type*             c, \
+       const dim_t         rs_c, \
+       const C_type        alpha, \
+       const C_type        beta, \
+       lpgemm_post_op*     post_ops_list, \
+       lpgemm_post_op_attr post_ops_attr \
      ) \
 
 LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16);
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16);
 LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32);
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32);
 LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48);
 
 LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16);
@@ -137,55 +202,67 @@ LPGEMM_N_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_6x16);
 LPGEMM_N_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_6x32);
 LPGEMM_N_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_6x48);
 
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_6x48m);
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_6x32m);
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x8m);
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x4m);
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x2m);
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x1m);
+
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16);
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32);
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48);
+
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16);
+
 #define LPGEMM_N_LT_NR0_FRINGE_KERN(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
      ( \
-       const dim_t     m0, \
-       const dim_t     k0, \
-       const A_type*   a, \
-       const dim_t     rs_a, \
-       const dim_t     cs_a, \
-       const dim_t     ps_a, \
-       const B_type*   b, \
-       const dim_t     rs_b, \
-       const dim_t     cs_b, \
-       C_type*         c, \
-       const dim_t     rs_c, \
-       const C_type    alpha, \
-       const C_type    beta, \
-       const dim_t     n0_rem, \
-       bool            is_last_k, \
-       dim_t           post_op_c_i, \
-       dim_t           post_op_c_j, \
-       lpgemm_post_op* post_ops_list, \
-       const dim_t     rs_c_downscale \
+       const dim_t         m0, \
+       const dim_t         k0, \
+       const A_type*       a, \
+       const dim_t         rs_a, \
+       const dim_t         cs_a, \
+       const dim_t         ps_a, \
+       const B_type*       b, \
+       const dim_t         rs_b, \
+       const dim_t         cs_b, \
+       C_type*             c, \
+       const dim_t         rs_c, \
+       const C_type        alpha, \
+       const C_type        beta, \
+       const dim_t         n0_rem, \
+       lpgemm_post_op*     post_ops_list, \
+       lpgemm_post_op_attr post_ops_attr \
      ) \
 
 LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6xlt16);
+LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12xlt16);
 
 LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16);
 
 LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_6xlt16);
 
+LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6xlt16);
+
+LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16);
+
 #define LPGEMM_MN_FRINGE_KERN(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
      ( \
-       const dim_t     k0, \
-       const A_type*   a, \
-       const dim_t     rs_a, \
-       const dim_t     cs_a, \
-       const B_type*   b, \
-       const dim_t     rs_b, \
-       const dim_t     cs_b, \
-       C_type*         c, \
-       const dim_t     rs_c, \
-       const C_type    alpha, \
-       const C_type    beta, \
-       bool            is_last_k, \
-       dim_t           post_op_c_i, \
-       dim_t           post_op_c_j, \
-       lpgemm_post_op* post_ops_list, \
-       const dim_t     rs_c_downscale \
+       const dim_t         k0, \
+       const A_type*       a, \
+       const dim_t         rs_a, \
+       const dim_t         cs_a, \
+       const B_type*       b, \
+       const dim_t         rs_b, \
+       const dim_t         cs_b, \
+       C_type*             c, \
+       const dim_t         rs_c, \
+       const C_type        alpha, \
+       const C_type        beta, \
+       lpgemm_post_op*     post_ops_list, \
+       lpgemm_post_op_attr post_ops_attr \
      ) \
 
 LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16);
@@ -224,26 +301,43 @@ LPGEMM_MN_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_3x48);
 LPGEMM_MN_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_2x48);
 LPGEMM_MN_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_1x48);
 
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48);
+
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16);
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16);
+
 #define LPGEMM_MN_LT_NR0_FRINGE_KERN(A_type,B_type,C_type,LP_SFX) \
 void lpgemm_rowvar_ ## LP_SFX \
      ( \
-       const dim_t     k0, \
-       const A_type*   a, \
-       const dim_t     rs_a, \
-       const dim_t     cs_a, \
-       const B_type*   b, \
-       const dim_t     rs_b, \
-       const dim_t     cs_b, \
-       C_type*         c, \
-       const dim_t     rs_c, \
-       const C_type    alpha, \
-       const C_type    beta, \
-       const dim_t     n0_rem, \
-       bool            is_last_k, \
-       dim_t           post_op_c_i, \
-       dim_t           post_op_c_j, \
-       lpgemm_post_op* post_ops_list, \
-       const dim_t     rs_c_downscale \
+       const dim_t         k0, \
+       const A_type*       a, \
+       const dim_t         rs_a, \
+       const dim_t         cs_a, \
+       const B_type*       b, \
+       const dim_t         rs_b, \
+       const dim_t         cs_b, \
+       C_type*             c, \
+       const dim_t         rs_c, \
+       const C_type        alpha, \
+       const C_type        beta, \
+       const dim_t         n0_rem, \
+       lpgemm_post_op*     post_ops_list, \
+       lpgemm_post_op_attr post_ops_attr \
      ) \
 
 LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5xlt16);
@@ -262,4 +356,14 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_3xlt16);
 LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_2xlt16);
 LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16,bfloat16,float,bf16bf16f32of32_1xlt16);
 
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5xlt16);
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4xlt16);
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3xlt16);
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2xlt16);
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1xlt16);
+
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16);
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16);
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16);
+
 #endif //BLIS_LPGEMM_KERN_H
diff --git a/addon/aocl_gemm/kernels/lpgemm_utils_kernels.h b/addon/aocl_gemm/kernels/lpgemm_utils_kernels.h
new file mode 100644
index 0000000000..7849e5a537
--- /dev/null
+++ b/addon/aocl_gemm/kernels/lpgemm_utils_kernels.h
@@ -0,0 +1,63 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_LPGEMM_UTILS_KERN_H
+#define BLIS_LPGEMM_UTILS_KERN_H
+
+typedef void (*lpgemm_util_l1_op_f32_kernel_t)
+     (
+       const dim_t n,
+       float*     x,
+       const inc_t incx
+     );
+
+#define LPGEMM_UTIL_L1_OP_KERNEL(V_type,OP_type) \
+void lpgemm_util_ ## OP_type ## _kernel \
+     ( \
+       const dim_t n, \
+       V_type*     x, \
+       const inc_t incx \
+     ) \
+
+// AVX512
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_tanh_avx512);
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_erf_avx512);
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_softmax_avx512);
+
+// AVX2
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_tanh_avx2);
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_erf_avx2);
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_softmax_avx2);
+
+#endif //BLIS_LPGEMM_UTILS_KERN_H
diff --git a/addon/aocl_gemm/kernels/s8s8s16/lpgemm_packb_s8s16.h b/addon/aocl_gemm/kernels/s8s8s16/lpgemm_packb_s8s16.h
new file mode 100644
index 0000000000..f3f49e9002
--- /dev/null
+++ b/addon/aocl_gemm/kernels/s8s8s16/lpgemm_packb_s8s16.h
@@ -0,0 +1,63 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_GEMM_S8_INT16_PACKB
+#define BLIS_GEMM_S8_INT16_PACKB
+
+typedef void (*packb_s16_s8)
+     (
+       int8_t*,
+       int16_t*,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       dim_t*,
+       dim_t*
+     );
+
+void packb_nr32_s8s8s16o16
+      (
+        int8_t        *pack_b_buffer_s8s8s16o16,
+        int16_t       *pack_b_column_sum,
+        const int8_t  *b,
+        const dim_t   ldb,
+        const dim_t   cols,
+        const dim_t   rows,
+        dim_t         *rs_b,
+        dim_t         *cs_b
+      );
+
+#endif // BLIS_GEMM_S8_INT16_PACKB
+
diff --git a/addon/aocl_gemm/kernels/s8s8s32/lpgemm_packa_s8.h b/addon/aocl_gemm/kernels/s8s8s32/lpgemm_packa_s8.h
new file mode 100644
index 0000000000..e31c30c563
--- /dev/null
+++ b/addon/aocl_gemm/kernels/s8s8s32/lpgemm_packa_s8.h
@@ -0,0 +1,61 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_GEMM_INT8_PACKA_S8
+#define BLIS_GEMM_INT8_PACKA_S8
+
+typedef void (*packa_s32_s8)
+     (
+       int8_t*,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       dim_t*,
+       dim_t*
+     );
+
+void packa_k64_s8s8s32os32
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    MC,
+       const dim_t    KC,
+       dim_t*         rs_a,
+       dim_t*         cs_a
+     );
+
+#endif //BLIS_GEMM_INT8_PACKA_S8
+
diff --git a/addon/aocl_gemm/kernels/s8s8s32/lpgemm_packb_s8.h b/addon/aocl_gemm/kernels/s8s8s32/lpgemm_packb_s8.h
new file mode 100644
index 0000000000..661c153436
--- /dev/null
+++ b/addon/aocl_gemm/kernels/s8s8s32/lpgemm_packb_s8.h
@@ -0,0 +1,73 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef BLIS_GEMM_INT8_PACKB_S8
+#define BLIS_GEMM_INT8_PACKB_S8
+
+BLIS_INLINE dim_t get_packb_s8s8s32o32_min_NR()
+{
+	// This is the minimum NR' required for use in s8s8s32 kernels. The idea
+	// here is that since k needs to be a multiple of 4 (VNNI instr), NR'=16
+	// results in total of 4 * NR' = 64 bytes to be loaded, which fits in 1 ZMM
+	// register. Thus the smallest n fringe kernel dimension has n=16, and thus
+	// any rounding for buffer sizes should be to 16.
+	return 16;
+}
+
+typedef void (*packb_s32_s8)
+     (
+       int8_t*,
+       int32_t*,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       dim_t*,
+       dim_t*
+     );
+
+void packb_nr64_s8s8s32os32
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32,
+       int32_t*      pack_b_column_sum,
+       const int8_t* b,
+       const dim_t   ldb,
+       const dim_t   NC,
+       const dim_t   KC,
+       dim_t*        rs_b,
+       dim_t*        cs_b
+     );
+
+#endif //BLIS_GEMM_INT8_PACKB_S8
+
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h b/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h
index b8d73c862c..a8f64c3fe0 100644
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h
+++ b/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_s16.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,10 +35,15 @@
 #ifndef BLIS_GEMM_INT16_PACKB
 #define BLIS_GEMM_INT16_PACKB
 
-void get_packb_nr32_u8s8s16o16_strides
+typedef void (*packb_s16)
      (
-       dim_t* rs_b,
-       dim_t* cs_b
+       int8_t*,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       dim_t*,
+       dim_t*
      );
 
 void packb_nr32_u8s8s16o16
@@ -52,4 +57,4 @@ void packb_nr32_u8s8s16o16
         dim_t         *cs_b
       );
 
-#endif // BLIS_GEMM_INT16_PACKB
\ No newline at end of file
+#endif // BLIS_GEMM_INT16_PACKB
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_s16_kern_macros.h b/addon/aocl_gemm/kernels/u8s8s16/lpgemm_s16_kern_macros.h
deleted file mode 100644
index 00583977f3..0000000000
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_s16_kern_macros.h
+++ /dev/null
@@ -1,404 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-	- Redistributions of source code must retain the above copyright
-	  notice, this list of conditions and the following disclaimer.
-	- Redistributions in binary form must reproduce the above copyright
-	  notice, this list of conditions and the following disclaimer in the
-	  documentation and/or other materials provided with the distribution.
-	- Neither the name(s) of the copyright holder(s) nor the names of its
-	  contributors may be used to endorse or promote products derived
-	  from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef LPGEMM_S16_KERN_MACROS_H
-#define LPGEMM_S16_KERN_MACROS_H
-#define S8_MIN  (-128)
-#define S8_MAX  (+127)
-
-#define RELU_SCALE_OP_S16_AVX2(reg) \
-	selector1 = _mm256_setzero_si256();\
-	selector1 = _mm256_cmpgt_epi16 ( selector1, reg ); \
- \
-	/* Only < 0 elements in b0. */ \
-	b0 = _mm256_and_si256 ( selector1, reg ); \
-\
-	/* Only >= 0 elements in c_int16_0p0. */ \
-	reg = _mm256_andnot_si256( selector1, reg ); \
- \
-	/* Only scaling for < 0 elements. */ \
-	b0 = _mm256_mullo_epi16( b0, selector2 ); \
- \
-	/* Combine the scaled < 0 and >= 0 elements. */ \
-	reg = _mm256_or_si256( b0, reg ); \
- \
-
-//--------------------------------------------------------------------------
-
-#define BLI_MM256_S16_DOWNSCALE(c_int16__p0, c_int16__p1, vec_loc)\
-\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(c_int16__p0, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(c_int16__p0, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-  /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps (res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p0 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p0 = _mm256_permute4x64_epi64(c_int16__p0, 0XD8);\
-\
-   /* Extract the first 128 bits of the register*/\
-	temp[0] = _mm256_extractf128_si256(c_int16__p1, 0);\
-\
-  /* Extract the second 128 bits of the register*/\
-	temp[1] = _mm256_extractf128_si256(c_int16__p1, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-   /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps (res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps (res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p1 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p1 = _mm256_permute4x64_epi64(c_int16__p1, 0XD8);\
-\
-   /* Convert the s16 to s8 */\
-	store_reg = _mm256_packs_epi16(c_int16__p0, c_int16__p1);\
-	store_reg = _mm256_permute4x64_epi64(store_reg, 0XD8);\
-\
-  /* Store the result in s8 form */\
-	_mm256_storeu_si256((__m256i *)(( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + vec_loc ) ) + post_op_c_j), store_reg);\
-\
-
-//--------------------------------------------------------------------------
-
-#define BLI_MM256_S16_DOWNSCALE2(c_int16__p0, c_int16__p1, vec_loc1, vec_loc2)\
-\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(c_int16__p0, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(c_int16__p0, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-  /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-         (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p0 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p0 = _mm256_permute4x64_epi64(c_int16__p0, 0XD8);\
-\
-   /* Extract the first 128 bits of the register*/\
-	temp[0] = _mm256_extractf128_si256(c_int16__p1, 0);\
-\
-  /* Extract the second 128 bits of the register*/\
-	temp[1] = _mm256_extractf128_si256(c_int16__p1, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-   /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps(( float )S8_MIN)), _mm256_set1_ps(( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p1 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p1 = _mm256_permute4x64_epi64(c_int16__p1, 0XD8);\
-\
-   /* Convert the s16 to s8 */\
-	store_reg = _mm256_packs_epi16(c_int16__p0, c_int16__p1);\
-  store_reg = _mm256_permute4x64_epi64(store_reg, 0XD8);\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(store_reg, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(store_reg, 1);\
-\
-  /* Store the result in s8 form */\
-	_mm_storeu_si128((__m128i *)(( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + vec_loc1 ) ) + post_op_c_j), temp[0]);\
-  _mm_storeu_si128((__m128i *)(( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + vec_loc2 ) ) + post_op_c_j), temp[1]);\
-\
-
-//--------------------------------------------------------------------------
-
-#define BLI_MM256_S16_DOWNSCALE2_LT16(c_int16__p0, c_int16__p1, vec_loc1, vec_loc2)\
-\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(c_int16__p0, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(c_int16__p0, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-  /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p0 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p0 = _mm256_permute4x64_epi64(c_int16__p0, 0XD8);\
-\
-   /* Extract the first 128 bits of the register*/\
-	temp[0] = _mm256_extractf128_si256(c_int16__p1, 0);\
-\
-  /* Extract the second 128 bits of the register*/\
-	temp[1] = _mm256_extractf128_si256(c_int16__p1, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-   /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p1 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p1 = _mm256_permute4x64_epi64(c_int16__p1, 0XD8);\
-\
-   /* Convert the s16 to s8 */\
-	store_reg = _mm256_packs_epi16(c_int16__p0, c_int16__p1);\
-  store_reg = _mm256_permute4x64_epi64(store_reg, 0XD8);\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(store_reg, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(store_reg, 1);\
-\
-  /* Store the result in s8 form */\
-  _mm_storeu_si128((__m128i *)store_buf, temp[0]);\
-  memcpy( ( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + vec_loc1 ) ) + post_op_c_j \
-	  , store_buf, ( n0_rem * sizeof( int8_t ) ) ); \
-\
-  _mm_storeu_si128((__m128i *)store_buf, temp[1]);\
-  memcpy( ( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + vec_loc2 ) ) + post_op_c_j \
-	  , store_buf, ( n0_rem * sizeof( int8_t ) ) ); \
-\
-
-//--------------------------------------------------------------------------
-
-#define BLI_MM256_S16_DOWNSCALE2_EDGE(c_int16__p0, vec_ind)\
-\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(c_int16__p0, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(c_int16__p0, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-  /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p0 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p0 = _mm256_permute4x64_epi64(c_int16__p0, 0XD8);\
-\
-   /* Convert the s16 to s8 */\
-	store_reg = _mm256_packs_epi16(c_int16__p0, zero_reg);\
-  store_reg = _mm256_permute4x64_epi64(store_reg, 0XD8);\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(store_reg, 0);\
-\
-  /* Store the result in s8 form */\
-	_mm_storeu_si128((__m128i *)(( int8_t* )post_ops_list_temp->op_args3 + \
-	( rs_c_downscale * ( post_op_c_i + vec_ind ) ) + post_op_c_j), temp[0]);\
-\
-
-//--------------------------------------------------------------------------
-
-#define BLI_MM256_S16_DOWNSCALE2_EDGE_LT16(c_int16__p0, vec_ind)\
-\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(c_int16__p0, 0);\
-  /* Extract the second 128 bits of the register*/\
-  temp[1] = _mm256_extractf128_si256(c_int16__p0, 1);\
-\
-  temp_32[0] = _mm256_cvtepi16_epi32(temp[0]);\
-  temp_32[1] = _mm256_cvtepi16_epi32(temp[1]);\
-  temp_float[0] = _mm256_cvtepi32_ps(temp_32[0]);\
-  temp_float[1] = _mm256_cvtepi32_ps(temp_32[1]);\
-\
-  /* Multiply the C matrix by the scale value*/\
-  res_1 = _mm256_mul_ps(temp_float[0], scale_1);\
-  res_2 = _mm256_mul_ps(temp_float[1], scale_2);\
-\
-  /* Round the resultant value to the nearest float value and clip the values between [-128, 127] */\
-  res_1 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_1, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-  res_2 = _mm256_min_ps(_mm256_max_ps \
-          (_mm256_round_ps(res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), \
-          _mm256_set1_ps (( float )S8_MIN)), _mm256_set1_ps (( float )S8_MAX));\
-\
-  /* Convert the clipped float32 scaled rounded value to int32 */\
-  temp_32[0] = _mm256_cvtps_epi32(res_1);\
-  temp_32[1] = _mm256_cvtps_epi32(res_2);\
-\
-  /* Convert the s32 to s16 */\
-	c_int16__p0 = _mm256_packs_epi32(temp_32[0], temp_32[1]);\
-\
-  /*Permute to make sure the order is correct*/\
-	c_int16__p0 = _mm256_permute4x64_epi64(c_int16__p0, 0XD8);\
-\
-   /* Convert the s16 to s8 */\
-	store_reg = _mm256_packs_epi16(c_int16__p0, zero_reg);\
-  store_reg = _mm256_permute4x64_epi64(store_reg, 0XD8);\
-  /* Extract the first 128 bits of the register*/\
-  temp[0] = _mm256_extractf128_si256(store_reg, 0);\
-\
-  /* Store the result in s8 form */\
-  _mm_storeu_si128((__m128i *)store_buf, temp[0]);\
-  memcpy( (( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + vec_ind ) ) + post_op_c_j) \
-	  ,store_buf, ( n0_rem * sizeof( int8_t ) ) ); \
-\
-
-#endif //LPGEMM_S16_KERN_MACROS_H
\ No newline at end of file
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_m_fringe_amd512vnni.c b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_m_fringe_amd512vnni.c
deleted file mode 100644
index 1674a22bd0..0000000000
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_m_fringe_amd512vnni.c
+++ /dev/null
@@ -1,2362 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <immintrin.h>
-#include <string.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_s32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 5x64 int8o32 kernel
-LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x64_DISABLE,
-						  &&POST_OPS_BIAS_5x64,
-						  &&POST_OPS_RELU_5x64,
-						  &&POST_OPS_RELU_SCALE_5x64,
-						  &&POST_OPS_DOWNSCALE_5x64
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-	__m512i b3;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-	__m512i a_int32_1;
-	
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-	__m512i c_int32_0p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-	__m512i c_int32_1p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	__m512i c_int32_2p2 = _mm512_setzero_epi32();
-	__m512i c_int32_2p3 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-	__m512i c_int32_3p1 = _mm512_setzero_epi32();
-	__m512i c_int32_3p2 = _mm512_setzero_epi32();
-	__m512i c_int32_3p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_4p0 = _mm512_setzero_epi32();
-	__m512i c_int32_4p1 = _mm512_setzero_epi32();
-	__m512i c_int32_4p2 = _mm512_setzero_epi32();
-	__m512i c_int32_4p3 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
-		
-		// Broadcast a[4,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
-		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
-		c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
-		
-		// Broadcast a[4,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
-		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
-		c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-	c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-	c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-	c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
-	c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
-	
-	c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-	c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
-	c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
-	c_int32_4p3 = _mm512_mullo_epi32( selector1, c_int32_4p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p3 = _mm512_add_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p3 = _mm512_add_epi32( selector1, c_int32_1p3 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-
-		// c[2,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p3 = _mm512_add_epi32( selector1, c_int32_2p3 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p2 = _mm512_add_epi32( selector1, c_int32_3p2 );
-
-		// c[3,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p3 = _mm512_add_epi32( selector1, c_int32_3p3 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		// c[4,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p1 = _mm512_add_epi32( selector1, c_int32_4p1 );
-
-		// c[4,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p2 = _mm512_add_epi32( selector1, c_int32_4p2 );
-
-		// c[4,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p3 = _mm512_add_epi32( selector1, c_int32_4p3 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x64:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 3 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-		// c[2,48-63]
-		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3, 16-31]
-		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
-
-		// c[3,48-63]
-		c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		// c[4, 16-31]
-		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
-
-		// c[4,32-47]
-		c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
-
-		// c[4,48-63]
-		c_int32_4p3 = _mm512_add_epi32( a_int32_1, c_int32_4p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-		// c[2,48-63]
-		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
-
-		// c[3,48-63]
-		c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-		// c[4,16-31]
-		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
-
-		// c[4,32-47]
-		c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
-
-		// c[4,48-63]
-		c_int32_4p3 = _mm512_max_epi32( selector1, c_int32_4p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-		// c[2, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
-
-		// c[3, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-		// c[4, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
-
-		// c[4, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
-
-		// c[4, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x64:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 3 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[0, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p3,a_int32_1,0,3);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[1, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p3,a_int32_1,1,3);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		// c[2, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p3,a_int32_1,2,3);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[3, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p2,a_int32_0,3,2);
-
-		// c[3, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p3,a_int32_1,3,3);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		// c[4, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p1,selector2,4,1);
-
-		// c[4, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p2,a_int32_0,4,2);
-
-		// c[4, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p3,a_int32_1,4,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_5x64_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
-
-	// c[2,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
-
-	// c[3,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 );
-
-	// c[4,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
-
-	// c[4,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
-
-	// c[4,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 );
-
-	// c[4,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 );
-}
-
-// 4x64 int8o32 kernel
-LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x64_DISABLE,
-						  &&POST_OPS_BIAS_4x64,
-						  &&POST_OPS_RELU_4x64,
-						  &&POST_OPS_RELU_SCALE_4x64,
-						  &&POST_OPS_DOWNSCALE_4x64
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-	__m512i b3;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-	__m512i a_int32_1;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-	__m512i c_int32_0p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-	__m512i c_int32_1p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	__m512i c_int32_2p2 = _mm512_setzero_epi32();
-	__m512i c_int32_2p3 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-	__m512i c_int32_3p1 = _mm512_setzero_epi32();
-	__m512i c_int32_3p2 = _mm512_setzero_epi32();
-	__m512i c_int32_3p3 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-		
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
-		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-		
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
-		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-	c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-	c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-	c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
-	c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p3 = _mm512_add_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p3 = _mm512_add_epi32( selector1, c_int32_1p3 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-
-		// c[2,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p3 = _mm512_add_epi32( selector1, c_int32_2p3 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p2 = _mm512_add_epi32( selector1, c_int32_3p2 );
-
-		// c[3,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p3 = _mm512_add_epi32( selector1, c_int32_3p3 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x64:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 3 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-		// c[2,48-63]
-		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3, 16-31]
-		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
-
-		// c[3,48-63]
-		c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-		// c[2,48-63]
-		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
-
-		// c[3,48-63]
-		c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-		// c[2, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
-
-		// c[3, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x64:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 3 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[0, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p3,a_int32_1,0,3);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[1, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p3,a_int32_1,1,3);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		// c[2, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p3,a_int32_1,2,3);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[3, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p2,a_int32_0,3,2);
-
-		// c[3, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p3,a_int32_1,3,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_4x64_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
-
-	// c[2,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
-
-	// c[3,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 );
-}
-
-// 3x64 int8o32 kernel
-LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x64_DISABLE,
-						  &&POST_OPS_BIAS_3x64,
-						  &&POST_OPS_RELU_3x64,
-						  &&POST_OPS_RELU_SCALE_3x64,
-						  &&POST_OPS_DOWNSCALE_3x64
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-	__m512i b3;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-	__m512i a_int32_1;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-	__m512i c_int32_0p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-	__m512i c_int32_1p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	__m512i c_int32_2p2 = _mm512_setzero_epi32();
-	__m512i c_int32_2p3 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a *  0 ) + ( cs_a * kr ) ) );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-	c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-	c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p3 = _mm512_add_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p3 = _mm512_add_epi32( selector1, c_int32_1p3 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-
-		// c[2,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p3 = _mm512_add_epi32( selector1, c_int32_2p3 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x64:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 3 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-		// c[2,48-63]
-		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-		// c[2,48-63]
-		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-		// c[2, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x64:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 3 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[0, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p3,a_int32_1,0,3);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[1, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p3,a_int32_1,1,3);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		// c[2, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p3,a_int32_1,2,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_3x64_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
-
-	// c[2,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
-}
-
-// 2x64 int8o32 kernel
-LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x64_DISABLE,
-						  &&POST_OPS_BIAS_2x64,
-						  &&POST_OPS_RELU_2x64,
-						  &&POST_OPS_RELU_SCALE_2x64,
-						  &&POST_OPS_DOWNSCALE_2x64
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-	__m512i b3;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-	__m512i a_int32_1;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-	__m512i c_int32_0p3 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-	__m512i c_int32_1p3 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-		
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
-		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-	c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p3 = _mm512_add_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p3 = _mm512_add_epi32( selector1, c_int32_1p3 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x64:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 3 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[1,48-63]
-		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x64:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 3 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[0, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p3,a_int32_1,0,3);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[1, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p3,a_int32_1,1,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_2x64_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
-}
-
-// 1x64 int8o32 kernel
-LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x64_DISABLE,
-						  &&POST_OPS_BIAS_1x64,
-						  &&POST_OPS_RELU_1x64,
-						  &&POST_OPS_RELU_SCALE_1x64,
-						  &&POST_OPS_DOWNSCALE_1x64
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-	__m512i b3;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-	__m512i a_int32_1;
-
-	//  Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-	__m512i c_int32_0p3 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr]
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-                // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( uint8_t ) )
-		);
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 4.
-                // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
-	}
-	
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-	c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
-	
-	// Scale C by beta.
-	if ( beta != 0)
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p3 = _mm512_add_epi32( selector1, c_int32_0p3 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x64:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 3 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[0,48-63]
-		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x64:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x64:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-		a_int32_1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 3 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[0, 48-63]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p3,a_int32_1,0,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_1x64_DISABLE:
-	;
-
-	// Store the accumulated results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
-}
-#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_mn_fringe_amd512vnni.c b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_mn_fringe_amd512vnni.c
deleted file mode 100644
index b202061e6a..0000000000
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_mn_fringe_amd512vnni.c
+++ /dev/null
@@ -1,5283 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <immintrin.h>
-#include <string.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_s32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 5xlt16 int8o32 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5xLT16_DISABLE,
-						  &&POST_OPS_BIAS_5xLT16,
-						  &&POST_OPS_RELU_5xLT16,
-						  &&POST_OPS_RELU_SCALE_5xLT16,
-						  &&POST_OPS_DOWNSCALE_5xLT16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// For corner cases.
-	int32_t buf0[16];
-	int32_t buf1[16];
-	int32_t buf2[16];
-	int32_t buf3[16];
-	int32_t buf4[16];
-
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-		
-		__m512i c_int32_3p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_4p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		
-		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-		
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf3, ( c + ( rs_c * 3 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf4, ( c + ( rs_c * 4 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( buf0 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( buf1 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( buf2 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( buf3 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_epi32( buf4 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5xLT16:
-		{
-			memcpy( buf0, ( ( int32_t* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( int32_t ) ) );
-			selector1 = _mm512_loadu_epi32( buf0 );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_5xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_5xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_5xLT16:
-	{
-		memcpy( buf0, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-		selector1 = _mm512_loadu_epi32( buf0 );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_2p0,selector1,2,0);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_3p0,selector1,3,0);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_4p0,selector1,4,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_5xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( buf0, c_int32_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( buf1, c_int32_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( buf2, c_int32_2p0 );
-
-		// c[3,0-15]
-		_mm512_storeu_epi32( buf3, c_int32_3p0 );
-
-		// c[4,0-15]
-		_mm512_storeu_epi32( buf4, c_int32_4p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[1,0-15]
-		memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[2,0-15]
-		memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[3,0-15]
-		memcpy( c + ( rs_c * 3 ) + ( 0*16 ), buf3, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[4,0-15]
-		memcpy( c + ( rs_c * 4 ) + ( 0*16 ), buf4, ( n0_rem * sizeof( int32_t ) ) );
-	}
-}
-
-// 4xlt16 int8o32 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4xLT16_DISABLE,
-						  &&POST_OPS_BIAS_4xLT16,
-						  &&POST_OPS_RELU_4xLT16,
-						  &&POST_OPS_RELU_SCALE_4xLT16,
-						  &&POST_OPS_DOWNSCALE_4xLT16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// For corner cases.
-	int32_t buf0[16];
-	int32_t buf1[16];
-	int32_t buf2[16];
-	int32_t buf3[16];
-
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-		
-		__m512i c_int32_3p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		
-		
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf3, ( c + ( rs_c * 3 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( buf0 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( buf1 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( buf2 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( buf3 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4xLT16:
-		{
-			memcpy( buf0, ( ( int32_t* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( int32_t ) ) );
-			selector1 = _mm512_loadu_epi32( buf0 );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_4xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_4xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_4xLT16:
-	{
-		memcpy( buf0, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-		selector1 = _mm512_loadu_epi32( buf0 );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_2p0,selector1,2,0);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_3p0,selector1,3,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_4xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( buf0, c_int32_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( buf1, c_int32_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( buf2, c_int32_2p0 );
-
-		// c[3,0-15]
-		_mm512_storeu_epi32( buf3, c_int32_3p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[1,0-15]
-		memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[2,0-15]
-		memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[3,0-15]
-		memcpy( c + ( rs_c * 3 ) + ( 0*16 ), buf3, ( n0_rem * sizeof( int32_t ) ) );
-	}
-}
-
-// 3xlt16 int8o32 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3xLT16_DISABLE,
-						  &&POST_OPS_BIAS_3xLT16,
-						  &&POST_OPS_RELU_3xLT16,
-						  &&POST_OPS_RELU_SCALE_3xLT16,
-						  &&POST_OPS_DOWNSCALE_3xLT16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// For corner cases.
-	int32_t buf0[16];
-	int32_t buf1[16];
-	int32_t buf2[16];
-
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( buf0 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( buf1 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( buf2 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3xLT16:
-		{
-			memcpy( buf0, ( ( int32_t* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( int32_t ) ) );
-			selector1 = _mm512_loadu_epi32( buf0 );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_3xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_3xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_3xLT16:
-	{
-		memcpy( buf0, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-		selector1 = _mm512_loadu_epi32( buf0 );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_2p0,selector1,2,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_3xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( buf0, c_int32_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( buf1, c_int32_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( buf2, c_int32_2p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[1,0-15]
-		memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[2,0-15]
-		memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( int32_t ) ) );
-	}
-}
-
-// 2xlt16 int8o32 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2xLT16_DISABLE,
-						  &&POST_OPS_BIAS_2xLT16,
-						  &&POST_OPS_RELU_2xLT16,
-						  &&POST_OPS_RELU_SCALE_2xLT16,
-						  &&POST_OPS_DOWNSCALE_2xLT16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// For corner cases.
-	int32_t buf0[16];
-	int32_t buf1[16];
-
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( buf0 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( buf1 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2xLT16:
-		{
-			memcpy( buf0, ( ( int32_t* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( int32_t ) ) );
-			selector1 = _mm512_loadu_epi32( buf0 );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_2xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_2xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_2xLT16:
-	{
-		memcpy( buf0, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-		selector1 = _mm512_loadu_epi32( buf0 );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_1p0,selector1,1,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_2xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( buf0, c_int32_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( buf1, c_int32_1p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[1,0-15]
-		memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( int32_t ) ) );
-	}
-}
-
-// 1xlt16 int8o32 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1xLT16_DISABLE,
-						  &&POST_OPS_BIAS_1xLT16,
-						  &&POST_OPS_RELU_1xLT16,
-						  &&POST_OPS_RELU_SCALE_1xLT16,
-						  &&POST_OPS_DOWNSCALE_1xLT16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// For corner cases.
-	int32_t buf0[16];
-
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-			__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-		
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( int32_t ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( buf0 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1xLT16:
-		{
-			memcpy( buf0, ( ( int32_t* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( int32_t ) ) );
-			selector1 = _mm512_loadu_epi32( buf0 );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_1xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_1xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_1xLT16:
-	{
-		memcpy( buf0, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-		selector1 = _mm512_loadu_epi32( buf0 );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_0p0,selector1,0,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_1xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( buf0, c_int32_0p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( int32_t ) ) );
-	}
-}
-
-// 5x16 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x16_DISABLE,
-						  &&POST_OPS_BIAS_5x16,
-						  &&POST_OPS_RELU_5x16,
-						  &&POST_OPS_RELU_SCALE_5x16,
-						  &&POST_OPS_DOWNSCALE_5x16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_4p0 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		
-		// Broadcast a[4,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		
-		// Broadcast a[4,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	
-	c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x16:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x16:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_5x16_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[4,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
-}
-
-// 4x16 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x16_DISABLE,
-						  &&POST_OPS_BIAS_4x16,
-						  &&POST_OPS_RELU_4x16,
-						  &&POST_OPS_RELU_SCALE_4x16,
-						  &&POST_OPS_DOWNSCALE_4x16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x16:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x16:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_4x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-}
-
-// 3x16 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x16_DISABLE,
-						  &&POST_OPS_BIAS_3x16,
-						  &&POST_OPS_RELU_3x16,
-						  &&POST_OPS_RELU_SCALE_3x16,
-						  &&POST_OPS_DOWNSCALE_3x16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x16:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x16:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_3x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-}
-
-// 2x16 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x16_DISABLE,
-						  &&POST_OPS_BIAS_2x16,
-						  &&POST_OPS_RELU_2x16,
-						  &&POST_OPS_RELU_SCALE_2x16,
-						  &&POST_OPS_DOWNSCALE_2x16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x16:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x16:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_2x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-}
-
-// 1x16 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x16_DISABLE,
-						  &&POST_OPS_BIAS_1x16,
-						  &&POST_OPS_RELU_1x16,
-						  &&POST_OPS_RELU_SCALE_1x16,
-						  &&POST_OPS_DOWNSCALE_1x16
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		__m512i b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		__m512i a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x16:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x16:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x16:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_1x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-}
-
-// 5x32 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x32_DISABLE,
-						  &&POST_OPS_BIAS_5x32,
-						  &&POST_OPS_RELU_5x32,
-						  &&POST_OPS_RELU_SCALE_5x32,
-						  &&POST_OPS_DOWNSCALE_5x32
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-	__m512i c_int32_3p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_4p0 = _mm512_setzero_epi32();
-	__m512i c_int32_4p1 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-		
-		// Broadcast a[4,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-		
-		// Broadcast a[4,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-	
-	c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-	c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		// c[4,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p1 = _mm512_add_epi32( selector1, c_int32_4p1 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x32:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3, 16-31]
-		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		// c[4, 16-31]
-		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-		// c[4,16-31]
-		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-		// c[4, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x32:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		// c[4, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p1,selector2,4,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_5x32_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
-
-	// c[4,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
-
-	// c[4,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
-}
-
-// 4x32 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x32_DISABLE,
-						  &&POST_OPS_BIAS_4x32,
-						  &&POST_OPS_RELU_4x32,
-						  &&POST_OPS_RELU_SCALE_4x32,
-						  &&POST_OPS_DOWNSCALE_4x32
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-	__m512i c_int32_3p1 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x32:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3, 16-31]
-		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x32:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_4x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
-}
-
-// 3x32 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x32_DISABLE,
-						  &&POST_OPS_BIAS_3x32,
-						  &&POST_OPS_RELU_3x32,
-						  &&POST_OPS_RELU_SCALE_3x32,
-						  &&POST_OPS_DOWNSCALE_3x32
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x32:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x32:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_3x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-}
-
-// 2x32 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x32_DISABLE,
-						  &&POST_OPS_BIAS_2x32,
-						  &&POST_OPS_RELU_2x32,
-						  &&POST_OPS_RELU_SCALE_2x32,
-						  &&POST_OPS_DOWNSCALE_2x32
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x32:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x32:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_2x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-}
-
-// 1x32 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x32_DISABLE,
-						  &&POST_OPS_BIAS_1x32,
-						  &&POST_OPS_RELU_1x32,
-						  &&POST_OPS_RELU_SCALE_1x32,
-						  &&POST_OPS_DOWNSCALE_1x32
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x32:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x32:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x32:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_1x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-}
-
-// 5x48 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x48_DISABLE,
-						  &&POST_OPS_BIAS_5x48,
-						  &&POST_OPS_RELU_5x48,
-						  &&POST_OPS_RELU_SCALE_5x48,
-						  &&POST_OPS_DOWNSCALE_5x48
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	__m512i c_int32_2p2 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-	__m512i c_int32_3p1 = _mm512_setzero_epi32();
-	__m512i c_int32_3p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_4p0 = _mm512_setzero_epi32();
-	__m512i c_int32_4p1 = _mm512_setzero_epi32();
-	__m512i c_int32_4p2 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
-		
-		// Broadcast a[4,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
-		
-		// Broadcast a[4,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-	c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
-	
-	c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-	c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
-	c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p2 = _mm512_add_epi32( selector1, c_int32_3p2 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		// c[4,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p1 = _mm512_add_epi32( selector1, c_int32_4p1 );
-
-		// c[4,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 4 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_4p2 = _mm512_add_epi32( selector1, c_int32_4p2 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x48:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3, 16-31]
-		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-		// c[4, 16-31]
-		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
-
-		// c[4,32-47]
-		c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
-
-		// c[4,0-15]
-		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-		// c[4,16-31]
-		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
-
-		// c[4,32-47]
-		c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-		// c[4, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
-
-		// c[4, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x48:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[3, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p2,a_int32_0,3,2);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		// c[4, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p1,selector2,4,1);
-
-		// c[4, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p2,a_int32_0,4,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_5x48_DISABLE:
-	;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
-
-	// c[4,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
-
-	// c[4,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
-
-	// c[4,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 );
-}
-
-// 4x48 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x48_DISABLE,
-						  &&POST_OPS_BIAS_4x48,
-						  &&POST_OPS_RELU_4x48,
-						  &&POST_OPS_RELU_SCALE_4x48,
-						  &&POST_OPS_DOWNSCALE_4x48
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	__m512i c_int32_2p2 = _mm512_setzero_epi32();
-	
-	__m512i c_int32_3p0 = _mm512_setzero_epi32();
-	__m512i c_int32_3p1 = _mm512_setzero_epi32();
-	__m512i c_int32_3p2 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		
-		// Broadcast a[3,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-	
-	c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-	c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-	c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_3p2 = _mm512_add_epi32( selector1, c_int32_3p2 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x48:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-		// c[3, 16-31]
-		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-		// c[3,0-15]
-		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-		// c[3,16-31]
-		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-		// c[3,32-47]
-		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x48:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[3, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p2,a_int32_0,3,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_4x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
-
-	// c[3,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
-}
-
-// 3x48 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x48_DISABLE,
-						  &&POST_OPS_BIAS_3x48,
-						  &&POST_OPS_RELU_3x48,
-						  &&POST_OPS_RELU_SCALE_3x48,
-						  &&POST_OPS_DOWNSCALE_3x48
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_2p0 = _mm512_setzero_epi32();
-	__m512i c_int32_2p1 = _mm512_setzero_epi32();
-	__m512i c_int32_2p2 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-		
-		// Broadcast a[2,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-	
-	c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-	c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-	c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x48:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-		// c[2, 16-31]
-		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		// c[2,0-15]
-		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-		// c[2,16-31]
-		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-		// c[2,32-47]
-		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x48:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_3x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-
-	// c[2,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
-}
-
-// 2x48 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x48_DISABLE,
-						  &&POST_OPS_BIAS_2x48,
-						  &&POST_OPS_RELU_2x48,
-						  &&POST_OPS_RELU_SCALE_2x48,
-						  &&POST_OPS_DOWNSCALE_2x48
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-
-	__m512i c_int32_1p0 = _mm512_setzero_epi32();
-	__m512i c_int32_1p1 = _mm512_setzero_epi32();
-	__m512i c_int32_1p2 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-		
-		// Broadcast a[1,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-
-	c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-	c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-	c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x48:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-		// c[1, 16-31]
-		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		// c[1,0-15]
-		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-		// c[1,16-31]
-		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-		// c[1,32-47]
-		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x48:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_2x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
-}
-
-// 1x48 int8o32 kernel
-LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x48_DISABLE,
-						  &&POST_OPS_BIAS_1x48,
-						  &&POST_OPS_RELU_1x48,
-						  &&POST_OPS_RELU_SCALE_1x48,
-						  &&POST_OPS_DOWNSCALE_1x48
-						};
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// Registers to use for accumulating C.
-	__m512i c_int32_0p0 = _mm512_setzero_epi32();
-	__m512i c_int32_0p1 = _mm512_setzero_epi32();
-	__m512i c_int32_0p2 = _mm512_setzero_epi32();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( uint8_t ) ) );
-		a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 4.
-		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-	}
-
-	// Load alpha and beta
-	__m512i selector1 = _mm512_set1_epi32( alpha );
-	__m512i selector2 = _mm512_set1_epi32( beta );
-
-	// Scale by alpha
-	c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-	c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-	c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_epi32( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mullo_epi32( selector2, selector1 );
-		c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-	}
-
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x48:
-	{
-		selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-
-		// c[0,0-15]
-		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-		// c[0, 16-31]
-		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-		// c[0,32-47]
-		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x48:
-	{
-		selector1 = _mm512_setzero_epi32();
-		selector2 =
-			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x48:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_1x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_epi32( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
-}
-#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_n_fringe_amd512vnni.c b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_n_fringe_amd512vnni.c
deleted file mode 100644
index 856dc1355e..0000000000
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_n_fringe_amd512vnni.c
+++ /dev/null
@@ -1,2300 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <immintrin.h>
-#include <string.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_s32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 6xlt16 int8o32 fringe kernel
-LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6xLT16_DISABLE,
-						  &&POST_OPS_BIAS_6xLT16,
-						  &&POST_OPS_RELU_6xLT16,
-						  &&POST_OPS_RELU_SCALE_6xLT16,
-						  &&POST_OPS_DOWNSCALE_6xLT16
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	// For corner cases.
-	int32_t buf0[16];
-	int32_t buf1[16];
-	int32_t buf2[16];
-	int32_t buf3[16];
-	int32_t buf4[16];
-	int32_t buf5[16];
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-		
-		__m512i c_int32_3p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_4p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_5p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 4 rows with 16 extended elements each from B to 1 ZMM
-			// registers. It is to be noted that the B matrix is packed for use
-			// in vnni instructions and each load to ZMM register will have 4
-			// elements along k direction and 16 elements across n directions,
-			// so 4x16 elements to a ZMM register.
-			b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			
-			// Broadcast a[5,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			
-			// Broadcast a[5,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		
-		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-		
-		c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			memcpy( buf0, ( c + ( rs_c * ( ir + 0 ) ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf1, ( c + ( rs_c * ( ir + 1 ) ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf2, ( c + ( rs_c * ( ir + 2 ) ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf3, ( c + ( rs_c * ( ir + 3 ) ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf4, ( c + ( rs_c * ( ir + 4 ) ) ), ( n0_rem * sizeof( int32_t ) ) );
-			memcpy( buf5, ( c + ( rs_c * ( ir + 5 ) ) ), ( n0_rem * sizeof( int32_t ) ) );
-			
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( buf0 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( buf1 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( buf2 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( buf3 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_epi32( buf4 );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_epi32( buf5  );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6xLT16:
-		{
-			memcpy( buf0, ( ( int32_t* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( int32_t ) ) );
-			selector1 = _mm512_loadu_epi32( buf0 );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6xLT16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6xLT16:
-	{
-		memcpy( buf0, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-		selector1 = _mm512_loadu_epi32( buf0 );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_2p0,selector1,2,0);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_3p0,selector1,3,0);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_4p0,selector1,4,0);
-
-		// c[5, 0-15]
-		CVT_MULRND_CVT32_CVT8_LT16(c_int32_5p0,selector1,5,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_6xLT16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( buf0, c_int32_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( buf1, c_int32_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( buf2, c_int32_2p0 );
-
-		// c[3,0-15]
-		_mm512_storeu_epi32( buf3, c_int32_3p0 );
-
-		// c[4,0-15]
-		_mm512_storeu_epi32( buf4, c_int32_4p0 );
-
-		// c[5,0-15]
-		_mm512_storeu_epi32( buf5, c_int32_5p0 );
-
-		// Memcpy partial parts.
-		// c[0,0-15]
-		memcpy( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), buf0, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[1,0-15]
-		memcpy( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), buf1, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[2,0-15]
-		memcpy( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), buf2, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[3,0-15]
-		memcpy( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), buf3, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[4,0-15]
-		memcpy( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), buf4, ( n0_rem * sizeof( int32_t ) ) );
-
-		// c[5,0-15]
-		memcpy( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), buf5, ( n0_rem * sizeof( int32_t ) ) );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_u8s8s32o32_5xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 4 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_u8s8s32o32_4xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 3 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_u8s8s32o32_3xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 2 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_u8s8s32o32_2xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 1 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_u8s8s32o32_1xlt16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-	}
-}
-
-// 6x16 int8o32 fringe kernel
-LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x16_DISABLE,
-						  &&POST_OPS_BIAS_6x16,
-						  &&POST_OPS_RELU_6x16,
-						  &&POST_OPS_RELU_SCALE_6x16,
-						  &&POST_OPS_DOWNSCALE_6x16
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-		
-		__m512i c_int32_3p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_4p0 = _mm512_setzero_epi32();
-
-		__m512i c_int32_5p0 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 4 rows with 16 elements each from B to 1 ZMM registers. It
-			// is to be noted that the B matrix is packed for use in vnni
-			// instructions and each load to ZMM register will have 4 elements
-			// along k direction and 16 elements across n directions, so 4x16
-			// elements to a ZMM register.
-			b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			
-			// Broadcast a[5,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			
-			// Broadcast a[5,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		
-		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-		
-		c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x16:
-		{
-			selector1 =
-					_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-									post_op_c_j );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x16:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x16:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6x16:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		// c[5, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_5p0,selector1,5,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_6x16_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
-
-		// c[3,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
-
-		// c[4,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
-
-		// c[5,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_u8s8s32o32_5x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 4 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_u8s8s32o32_4x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 3 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_u8s8s32o32_3x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 2 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_u8s8s32o32_2x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 1 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_u8s8s32o32_1x16
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-	}
-}
-
-// 6x32 int8o32 fringe kernel
-LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x32_DISABLE,
-						  &&POST_OPS_BIAS_6x32,
-						  &&POST_OPS_RELU_6x32,
-						  &&POST_OPS_RELU_SCALE_6x32,
-						  &&POST_OPS_DOWNSCALE_6x32
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-		__m512i c_int32_0p1 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-		__m512i c_int32_1p1 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-		__m512i c_int32_2p1 = _mm512_setzero_epi32();
-		
-		__m512i c_int32_3p0 = _mm512_setzero_epi32();
-		__m512i c_int32_3p1 = _mm512_setzero_epi32();
-
-		__m512i c_int32_4p0 = _mm512_setzero_epi32();
-		__m512i c_int32_4p1 = _mm512_setzero_epi32();
-
-		__m512i c_int32_5p0 = _mm512_setzero_epi32();
-		__m512i c_int32_5p1 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 4 rows with 32 elements each from B to 2 ZMM registers. It
-			// is to be noted that the B matrix is packed for use in vnni
-			// instructions and each load to ZMM register will have 4 elements
-			// along k direction and 16 elements across n directions, so 4x16
-			// elements to a ZMM register.
-			b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-			b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-			
-			// Broadcast a[4,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-			
-			// Broadcast a[5,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-			b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-			
-			// Broadcast a[3,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-			
-			// Broadcast a[4,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-			
-			// Broadcast a[5,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-		
-		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
-		
-		c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
-		c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[1,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[2,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[3,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[4,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p1 = _mm512_add_epi32( selector1, c_int32_4p1 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			// c[5,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p1 = _mm512_add_epi32( selector1, c_int32_5p1 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x32:
-		{
-			selector1 =
-					_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-					_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 1 * 16 ) );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[1, 16-31]
-			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[2, 16-31]
-			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[3, 16-31]
-			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[4, 16-31]
-			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			// c[5, 16-31]
-			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x32:
-		{
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[1,16-31]
-			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			// c[2,16-31]
-			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-			// c[3,16-31]
-			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-			// c[4,16-31]
-			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
-
-			// c[5,16-31]
-			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x32:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[0, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[1, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			// c[2, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-			// c[3, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-			// c[4, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
-
-			// c[5, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6x32:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		// c[4, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p1,selector2,4,1);
-
-		// c[5, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_5p0,selector1,5,0);
-
-		// c[5, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_5p1,selector2,5,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_6x32_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
-
-		// c[0, 16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
-
-		// c[1,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
-
-		// c[2,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
-
-		// c[3,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
-
-		// c[3,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
-
-		// c[4,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
-
-		// c[4,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
-
-		// c[5,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
-
-		// c[5,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_u8s8s32o32_5x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 4 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_u8s8s32o32_4x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 3 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_u8s8s32o32_3x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 2 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_u8s8s32o32_2x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 1 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_u8s8s32o32_1x32
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-	}
-}
-
-// 6x48 int8o32 fringe kernel
-LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x48_DISABLE,
-						  &&POST_OPS_BIAS_6x48,
-						  &&POST_OPS_RELU_6x48,
-						  &&POST_OPS_RELU_SCALE_6x48,
-						  &&POST_OPS_DOWNSCALE_6x48
-						};
-	dim_t MR = 6;
-	dim_t m_full_pieces = m0 / MR;
-	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
-	dim_t m_partial_pieces = m0 % MR;
-
-	dim_t k_full_pieces = k0 / 4;
-	dim_t k_partial_pieces = k0 % 4;
-
-	uint32_t a_kfringe_buf = 0;
-
-	// B matrix storage.
-	__m512i b0;
-	__m512i b1;
-	__m512i b2;
-
-	// A matrix storage.
-	__m512i a_int32_0;
-
-	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
-	{
-		// Registers to use for accumulating C.
-		__m512i c_int32_0p0 = _mm512_setzero_epi32();
-		__m512i c_int32_0p1 = _mm512_setzero_epi32();
-		__m512i c_int32_0p2 = _mm512_setzero_epi32();
-
-		__m512i c_int32_1p0 = _mm512_setzero_epi32();
-		__m512i c_int32_1p1 = _mm512_setzero_epi32();
-		__m512i c_int32_1p2 = _mm512_setzero_epi32();
-
-		__m512i c_int32_2p0 = _mm512_setzero_epi32();
-		__m512i c_int32_2p1 = _mm512_setzero_epi32();
-		__m512i c_int32_2p2 = _mm512_setzero_epi32();
-		
-		__m512i c_int32_3p0 = _mm512_setzero_epi32();
-		__m512i c_int32_3p1 = _mm512_setzero_epi32();
-		__m512i c_int32_3p2 = _mm512_setzero_epi32();
-
-		__m512i c_int32_4p0 = _mm512_setzero_epi32();
-		__m512i c_int32_4p1 = _mm512_setzero_epi32();
-		__m512i c_int32_4p2 = _mm512_setzero_epi32();
-
-		__m512i c_int32_5p0 = _mm512_setzero_epi32();
-		__m512i c_int32_5p1 = _mm512_setzero_epi32();
-		__m512i c_int32_5p2 = _mm512_setzero_epi32();
-
-		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-		{
-			// Load 4 rows with 48 elements each from B to 3 ZMM registers. It
-			// is to be noted that the B matrix is packed for use in vnni
-			// instructions and each load to ZMM register will have 4 elements
-			// along k direction and 16 elements across n directions, so 4x16
-			// elements to a ZMM register.
-			b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-			b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-			b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-			
-			// Broadcast a[1,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
-			
-			// Broadcast a[4,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
-			
-			// Broadcast a[5,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-47] = a[5,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
-			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_0, b2 );
-		}
-		// Handle k remainder.
-		if ( k_partial_pieces > 0 )
-		{
-			b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-			b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-			b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-			// Broadcast a[0,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
-			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
-			
-			// Broadcast a[1,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
-			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
-			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
-			
-			// Broadcast a[2,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-			
-			// Broadcast a[3,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
-			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
-			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
-			
-			// Broadcast a[4,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
-			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
-			
-			// Broadcast a[5,kr:kr+4].
-			memcpy
-			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
-			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-
-			// Perform column direction mat-mul with k = 4.
-			// c[5,0-47] = a[5,kr:kr+4]*b[kr:kr+4,0-47]
-			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
-			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
-			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_0, b2 );
-		}
-
-		// Load alpha and beta
-		__m512i selector1 = _mm512_set1_epi32( alpha );
-		__m512i selector2 = _mm512_set1_epi32( beta );
-
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
-		
-		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
-		c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
-		
-		c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
-		c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
-		c_int32_5p2 = _mm512_mullo_epi32( selector1, c_int32_5p2 );
-
-		// Scale C by beta.
-		if ( beta != 0 )
-		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-			// c[0,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[1,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-			// c[1,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[2,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
-
-			// c[2,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
-
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[3,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
-
-			// c[3,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p2 = _mm512_add_epi32( selector1, c_int32_3p2 );
-
-			// c[4,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[4,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p1 = _mm512_add_epi32( selector1, c_int32_4p1 );
-
-			// c[4,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p2 = _mm512_add_epi32( selector1, c_int32_4p2 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			// c[5,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p1 = _mm512_add_epi32( selector1, c_int32_5p1 );
-
-			// c[5,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p2 = _mm512_add_epi32( selector1, c_int32_5p2 );
-		}
-
-        // Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_6x48:
-		{
-			selector1 =
-					_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-					_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 1 * 16 ) );
-			a_int32_0 =
-					_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-									post_op_c_j + ( 2 * 16 ) );
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
-
-			// c[0,32-47]
-			c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[1, 16-31]
-			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
-
-			// c[1,32-47]
-			c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
-
-			// c[2, 16-31]
-			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
-
-			// c[2,32-47]
-			c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
-
-			// c[3, 16-31]
-			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
-
-			// c[3,32-47]
-			c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
-
-			// c[4, 16-31]
-			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
-
-			// c[4,32-47]
-			c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			// c[5, 16-31]
-			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
-
-			// c[5,32-47]
-			c_int32_5p2 = _mm512_add_epi32( a_int32_0, c_int32_5p2 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_6x48:
-		{
-			//printf("relu\n");
-			selector1 = _mm512_setzero_epi32();
-
-			// c[0,0-15]
-			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
-
-			// c[0,32-47]
-			c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
-
-			// c[1,0-15]
-			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
-
-			// c[1,16-31]
-			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
-
-			// c[1,32-47]
-			c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
-
-			// c[2,0-15]
-			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
-
-			// c[2,16-31]
-			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
-
-			// c[2,32-47]
-			c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
-
-			// c[3,0-15]
-			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
-
-			// c[3,16-31]
-			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
-
-			// c[3,32-47]
-			c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
-
-			// c[4,0-15]
-			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
-
-			// c[4,16-31]
-			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
-
-			// c[4,32-47]
-			c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
-
-			// c[5,0-15]
-			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
-
-			// c[5,16-31]
-			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
-
-			// c[5,32-47]
-			c_int32_5p2 = _mm512_max_epi32( selector1, c_int32_5p2 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_6x48:
-		{
-			selector1 = _mm512_setzero_epi32();
-			selector2 =
-				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
-
-			// c[0, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
-
-			// c[0, 32-47]
-			RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
-
-			// c[1, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
-
-			// c[1, 32-47]
-			RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
-
-			// c[2, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
-
-			// c[2, 32-47]
-			RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
-
-			// c[3, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
-
-			// c[3, 32-47]
-			RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
-
-			// c[4, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
-
-			// c[4, 32-47]
-			RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
-
-			// c[5, 0-15]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
-
-			// c[5, 16-31]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
-
-			// c[5, 32-47]
-			RELU_SCALE_OP_S32_AVX512(c_int32_5p2)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_6x48:
-	{
-		selector1 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 0 * 16 ) );
-		selector2 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 1 * 16 ) );
-		a_int32_0 =
-			_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-							post_op_c_j + ( 2 * 16 ) );
-
-		// c[0, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
-
-		// c[0, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
-
-		// c[0, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
-
-		// c[1, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
-
-		// c[1, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
-
-		// c[1, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
-
-		// c[2, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
-
-		// c[2, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
-
-		// c[2, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
-
-		// c[3, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
-
-		// c[3, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
-
-		// c[3, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_3p2,a_int32_0,3,2);
-
-		// c[4, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
-
-		// c[4, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p1,selector2,4,1);
-
-		// c[4, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_4p2,a_int32_0,4,2);
-
-		// c[5, 0-15]
-		CVT_MULRND_CVT32_CVT8(c_int32_5p0,selector1,5,0);
-
-		// c[5, 16-31]
-		CVT_MULRND_CVT32_CVT8(c_int32_5p1,selector2,5,1);
-
-		// c[5, 32-47]
-		CVT_MULRND_CVT32_CVT8(c_int32_5p2,a_int32_0,5,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_6x48_DISABLE:
-		;
-		
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
-
-		// c[0, 16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
-
-		// c[0,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 );
-
-		// c[1,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
-
-		// c[1,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
-
-		// c[1,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 );
-
-		// c[2,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
-
-		// c[2,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
-
-		// c[2,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 );
-
-		// c[3,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
-
-		// c[3,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
-
-		// c[3,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 );
-
-		// c[4,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
-
-		// c[4,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
-
-		// c[4,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 );
-
-		// c[5,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
-
-		// c[5,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
-
-		// c[5,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 );
-
-		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
-	}
-
-	if ( m_partial_pieces > 0 )
-	{
-		if ( m_partial_pieces == 5 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
-			lpgemm_rowvar_u8s8s32o32_5x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 4 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
-			lpgemm_rowvar_u8s8s32o32_4x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 3 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
-			lpgemm_rowvar_u8s8s32o32_3x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 2 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
-			lpgemm_rowvar_u8s8s32o32_2x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-		else if ( m_partial_pieces == 1 )
-		{
-			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
-			lpgemm_rowvar_u8s8s32o32_1x48
-			(
-			  k0,
-			  a, rs_a, cs_a_use,
-			  b, rs_b, cs_b,
-			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
-			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
-			);
-		}
-	}
-}
-#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h
index b983b0c617..9b1c55046e 100644
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h
+++ b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,14 +35,19 @@
 #ifndef BLIS_GEMM_INT8_PACKA
 #define BLIS_GEMM_INT8_PACKA
 
-void get_packa_k64_u8s8s32o32_strides
+typedef void (*packa_s32) /* Pack-routine function pointer; leading params match packa_k64_u8s8s32o32 below, rest presumably too - TODO confirm */
      (
-       dim_t* rs_a,
-       dim_t* cs_a
+       uint8_t*,
+       const uint8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       dim_t*,
+       dim_t*
      );
 
 void packa_k64_u8s8s32o32
-     ( 
+     (
        uint8_t*       pack_a_buffer_u8s8s32o32,
        const uint8_t* a,
        const dim_t    lda,
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h
index 3f310c0a48..1d69148e3c 100644
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h
+++ b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,14 +45,19 @@ BLIS_INLINE dim_t get_packb_u8s8s32o32_min_NR()
 	return 16;
 }
 
-void get_packb_nr64_u8s8s32o32_strides
+typedef void (*packb_s32) /* Pack-routine function pointer; leading params match packb_nr64_u8s8s32o32 below, rest presumably too - TODO confirm */
      (
-       dim_t* rs_b,
-       dim_t* cs_b
+       int8_t*,
+       const int8_t*,
+       const dim_t,
+       const dim_t,
+       const dim_t,
+       dim_t*,
+       dim_t*
      );
 
 void packb_nr64_u8s8s32o32
-     ( 
+     (
        int8_t*       pack_b_buffer_u8s8s32o32,
        const int8_t* b,
        const dim_t   ldb,
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_s32_kern_macros.h b/addon/aocl_gemm/kernels/u8s8s32/lpgemm_s32_kern_macros.h
deleted file mode 100644
index bc3546736c..0000000000
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_s32_kern_macros.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-	- Redistributions of source code must retain the above copyright
-	  notice, this list of conditions and the following disclaimer.
-	- Redistributions in binary form must reproduce the above copyright
-	  notice, this list of conditions and the following disclaimer in the
-	  documentation and/or other materials provided with the distribution.
-	- Neither the name(s) of the copyright holder(s) nor the names of its
-	  contributors may be used to endorse or promote products derived
-	  from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef LPGEMM_S32_KERN_MACROS_H
-#define LPGEMM_S32_KERN_MACROS_H
-#define S8_MIN  (-128)
-#define S8_MAX  (+127)
-
-#define RELU_SCALE_OP_S32_AVX512(reg) \
-	/* Generate indenx of elements <= 0.*/ \
-	relu_cmp_mask = _mm512_cmple_epi32_mask( reg, selector1 ); \
- \
-	/* Apply scaling on for <= 0 elements.*/ \
-	reg = _mm512_mask_mullo_epi32( reg, relu_cmp_mask, reg, selector2 ); \
-
-#define CVT_MULRND_CVT32_CVT8(reg,selector,m_ind,n_ind) \
-	_mm_storeu_epi8 \
-	( \
-	  ( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + m_ind ) ) + post_op_c_j + ( n_ind * 16 ), \
-	  _mm512_cvtepi32_epi8 \
-	  ( \
-		_mm512_cvtps_epi32 \
-		( \
-		  _mm512_min_ps \
-		  ( \
-			_mm512_max_ps \
-			( \
-			  _mm512_mul_round_ps \
-			  ( \
-				_mm512_cvtepi32_ps( reg ), \
-				( __m512 )selector, \
-				( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \
-			  ) \
-			  , _mm512_set1_ps (( float )S8_MIN) \
-			) \
-			, _mm512_set1_ps (( float )S8_MAX) \
-		  ) \
-		) \
-	  ) \
-	) \
-
-#define CVT_MULRND_CVT32_CVT8_LT16(reg,selector,m_ind,n_ind) \
-	_mm_storeu_epi8 \
-	( \
-	  buf0, \
-	  _mm512_cvtepi32_epi8 \
-	  ( \
-		_mm512_cvtps_epi32 \
-		( \
-		  _mm512_min_ps \
-		  ( \
-			_mm512_max_ps \
-			( \
-			  _mm512_mul_round_ps \
-			  ( \
-				_mm512_cvtepi32_ps( reg ), \
-				( __m512 )selector, \
-				( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \
-			  ) \
-			  , _mm512_set1_ps (( float )S8_MIN) \
-			) \
-			, _mm512_set1_ps (( float )S8_MAX) \
-		  ) \
-		) \
-	  ) \
-	); \
-	memcpy( ( int8_t* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + m_ind ) ) + post_op_c_j + \
-	  ( n_ind * 16 ) , buf0, ( n0_rem * sizeof( int8_t ) ) ); \
-
-#endif // LPGEMM_S32_KERN_MACROS_H
diff --git a/aocl_dtl/aocldtl.c b/aocl_dtl/aocldtl.c
index 6e7ee35102..a9b3db1786 100644
--- a/aocl_dtl/aocldtl.c
+++ b/aocl_dtl/aocldtl.c
@@ -5,7 +5,7 @@
  *               These functions are invoked though macros by
  *               end user.
  *
- * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  *=======================================================================*/
 #include "blis.h"
@@ -129,7 +129,7 @@ void DTL_Initialize(
 
 #if (AOCL_DTL_LOG_ENABLE || AOCL_DTL_DUMP_ENABLE)
     
-    /* Check if DTL logging is requested via envoronment variable */ 
+    /* Check if DTL logging is requested via environment variable */
     gbIsLoggingEnabled = bli_env_get_var( "AOCL_VERBOSE", TRUE );
 #endif
 
diff --git a/aocl_dtl/aocldtl.h b/aocl_dtl/aocldtl.h
index f520518e9c..7f9934ed24 100644
--- a/aocl_dtl/aocldtl.h
+++ b/aocl_dtl/aocldtl.h
@@ -1,195 +1,195 @@
-/*===================================================================
- * File Name :  aocldtl.h
- *
- * Description : This is main interface file for the end user
- *               It provides defination for all macros to be
- *               used by user to add debug/trace information.
- *
- * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
- *
- *==================================================================*/
-
-#ifndef _AOCLDTL_H_
-#define _AOCLDTL_H_
-
-#include "aocldtlcf.h"
-#include "aocltpdef.h"
-#include "aoclflist.h"
-#include "aoclos.h"
-
-#define TRACE_TYPE_FENTRY           (1)
-#define TRACE_TYPE_FEXIT            (2)
-#define TRACE_TYPE_LOG              (3)
-#define TRACE_TYPE_RAW              (4)
-
-/* Type definition for printf */
-#define AOCL_DEBUGPRINT printf
-
-/* Define the AOCL_DTL_INITIALIZE_ENABLE if any of the debug macro
- * are defined */
-#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_DUMP_ENABLE || AOCL_DTL_LOG_ENABLE)
-#define AOCL_DTL_INITIALIZE_ENABLE
-#endif
-
-#if AOCL_DTL_TRACE_ENABLE
-/* Entry macro to trace the flow of control The parameter LogLevel specifies
-      the log level String will preferably contains the function name in which
-      this macro is invoked */
-#define AOCL_DTL_TRACE_ENTRY(LogLevel) \
-    DTL_Trace(LogLevel,                \
-              TRACE_TYPE_FENTRY,       \
-              __FILE__,                \
-              __FUNCTION__,            \
-              __LINE__,                \
-              NULL);
-#else
-/* Dummy macro definition if the AOCL_DTL_TRACE_ENABLE macro is not enabled */
-#define AOCL_DTL_TRACE_ENTRY(LogLevel)
-#endif
-
-#if AOCL_DTL_TRACE_ENABLE
-/* Exit macro to trace the flow of control The parameter LogLevel specifies
-      log level String will preferably contains the function name in which this
-      macro is invoked */
-#define AOCL_DTL_TRACE_EXIT(LogLevel) \
-    DTL_Trace(LogLevel,               \
-              TRACE_TYPE_FEXIT,       \
-              __FILE__,               \
-              __FUNCTION__,           \
-              __LINE__,               \
-              NULL);
-
-#define AOCL_DTL_TRACE_EXIT_ERR(LogLevel, Message) \
-    DTL_Trace(LogLevel,                            \
-              TRACE_TYPE_FEXIT,                    \
-              __FILE__,                            \
-              __FUNCTION__,                        \
-              __LINE__,                            \
-              Message);
-#else
-/* Dummy macro definition if the AOCL_DTL_TRACE_ENABLE macro is not enabled */
-#define AOCL_DTL_TRACE_EXIT(LogLevel)
-#define AOCL_DTL_TRACE_EXIT_ERR(LogLevel, Message)
-#endif
-
-#if AOCL_DTL_DUMP_ENABLE
-/* Macro to Dump the DATA The parameters  Buffer contains the data to be
-      dumped BufferSize specifies the no. of bytes to be dumped DataType
-      specifies the data type of Buffer */
-#define AOCL_DTL_DUMP(LogLevel, Buffer, BufferSize, DataType, String, OutputType) \
-    /* Call the Dump function to Dump the DATA */                                 \
-    DTL_DumpData(LogLevel,                                                        \
-                 Buffer,                                                          \
-                 BufferSize,                                                      \
-                 DataType,                                                        \
-                 String,                                                          \
-                 OutputType);
-#else
-/* Dummy macro definition if the AOCL_DTL_DUMP_ENABLE macro is not enabled */
-#define AOCL_DTL_DUMP(Buffer, BufferSize, DataType, String, OutputType)
-
-#endif
-
-#if AOCL_DTL_LOG_ENABLE
-/* Macro to log the Data */
-#define AOCL_DTL_LOG(LogLevel, Message) \
-    DTL_Trace(LogLevel,                 \
-              TRACE_TYPE_LOG,           \
-              __FILE__,                 \
-              __FUNCTION__,             \
-              __LINE__,                 \
-              Message);
-#else
-/* Dummy macro definition if the AOCL_DTL_LOG_ENABLE macro is not enabled */
-#define AOCL_DTL_LOG(LogLevel, Message)
-#endif
-
-#if AOCL_DTL_LOG_ENABLE
-
-void AOCL_DTL_start_perf_timer(void);
-uint64 AOCL_DTL_get_time_spent(void);
-
-/*
- * Logging of inputs can be enabled by two methods:
- *
- * 1. Using environment variable AOCL_VERBOSE.
- * 2. APIs
- * 
- * The API takes precedence over environment variable.
- * 
- * The global flag is maintain in the code to track the final
- * state of the logging feature.
- */
-extern Bool gbIsLoggingEnabled;
-
-/* API to enable logging at runtime */
-#define AOCL_DTL_Enable_Logs() \
-    /* Initialize DTL if not alredy done so */ \
-    AOCL_DTL_INITIALIZE(AOCL_DTL_TRACE_LEVEL); \
-    gbIsLoggingEnabled = TRUE;
-
-/* API to disable logging at runtime */
-#define AOCL_DTL_Disable_Logs() \
-    /* Initialize DTL if not alredy done so */ \
-    AOCL_DTL_INITIALIZE(AOCL_DTL_TRACE_LEVEL); \
-    gbIsLoggingEnabled = FALSE;
-
-/* Macro to log the Data */
-#define AOCL_DTL_START_PERF_TIMER() \
-    AOCL_DTL_start_perf_timer()
-#else
-/* Dummy macro definition if the AOCL_DTL_LOG_ENABLE macro is not enabled */
-#define AOCL_DTL_START_PERF_TIMER()
-#endif
-
-/* Macro to initialize the prerequisite for debuging */
-#ifdef AOCL_DTL_INITIALIZE_ENABLE
-#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL) \
-    DTL_Initialize(CURRENT_LOG_LEVEL);
-#else
-/* Dummy macro definition if the AOCL_DTL_INITIALIZE macro is not enabled */
-#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL)
-#endif
-
-/* Macro for uninitializing the prerequisite */
-#ifdef AOCL_DTL_INITIALIZE_ENABLE
-#define AOCL_DTL_UNINITIALIZE() \
-    DTL_Uninitialize();
-#else
-/* Dummy macro definition if the AOCL_DTL_INITIALIZE macro is not enabled */
-#define AOCL_DTL_UNINITIALIZE()
-#endif
-
-#ifdef AOCL_DTL_INITIALIZE_ENABLE
-/* Prototypes for initializing and uninitializing the debug functions */
-void DTL_Initialize(
-    uint32 ui32CurrentLogLevel);
-void DTL_Uninitialize(void);
-#endif
-
-#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_LOG_ENABLE)
-/* Debug trace Function protoypes */
-void DTL_Trace(
-    uint8 ui8LogLevel,
-    uint8 ui8LogType,
-    const int8 *pi8FileName,
-    const int8 *pi8FunctionName,
-    uint32 ui32LineNumber,
-    const int8 *pi8Message);
-
-#endif
-
-#if AOCL_DTL_DUMP_ENABLE
-/* Function Prototype for dumping the data */
-void DTL_DumpData(
-    uint8 ui8LogLevel,
-    void *pvBuffer,
-    uint32 ui32BufferSize,
-    uint8 ui8DataType,
-    int8 *pi8Message,
-    int8 i8OutputType);
-#endif
-
-#endif /* _AOCLDTL_H_ */
-
-/* --------------- End of aocldtl.h ----------------- */
+/*===================================================================
+ * File Name :  aocldtl.h
+ *
+ * Description : This is main interface file for the end user
+ *               It provides definitions for all macros to be
+ *               used by user to add debug/trace information.
+ *
+ * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ *==================================================================*/
+
+#ifndef _AOCLDTL_H_
+#define _AOCLDTL_H_
+
+#include "aocldtlcf.h"
+#include "aocltpdef.h"
+#include "aoclflist.h"
+#include "aoclos.h"
+
+#define TRACE_TYPE_FENTRY           (1)
+#define TRACE_TYPE_FEXIT            (2)
+#define TRACE_TYPE_LOG              (3)
+#define TRACE_TYPE_RAW              (4)
+
+/* Alias for printf (a macro, not a type definition) */
+#define AOCL_DEBUGPRINT printf
+
+/* Define AOCL_DTL_INITIALIZE_ENABLE if any of the trace, dump or log
+ * enable macros evaluates to non-zero */
+#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_DUMP_ENABLE || AOCL_DTL_LOG_ENABLE)
+#define AOCL_DTL_INITIALIZE_ENABLE
+#endif
+
+#if AOCL_DTL_TRACE_ENABLE
+/* Function-entry trace macro. LogLevel is forwarded to DTL_Trace together
+      with TRACE_TYPE_FENTRY, __FILE__, __FUNCTION__ and __LINE__; no message
+      is passed (NULL). The expansion already ends with a semicolon. */
+#define AOCL_DTL_TRACE_ENTRY(LogLevel) \
+    DTL_Trace(LogLevel,                \
+              TRACE_TYPE_FENTRY,       \
+              __FILE__,                \
+              __FUNCTION__,            \
+              __LINE__,                \
+              NULL);
+#else
+/* Expands to nothing when AOCL_DTL_TRACE_ENABLE is not enabled */
+#define AOCL_DTL_TRACE_ENTRY(LogLevel)
+#endif
+
+#if AOCL_DTL_TRACE_ENABLE
+/* Function-exit trace macros. Both forward LogLevel to DTL_Trace with
+      TRACE_TYPE_FEXIT, __FILE__, __FUNCTION__ and __LINE__. AOCL_DTL_TRACE_EXIT
+      passes NULL as the message; AOCL_DTL_TRACE_EXIT_ERR passes Message. */
+#define AOCL_DTL_TRACE_EXIT(LogLevel) \
+    DTL_Trace(LogLevel,               \
+              TRACE_TYPE_FEXIT,       \
+              __FILE__,               \
+              __FUNCTION__,           \
+              __LINE__,               \
+              NULL);
+
+#define AOCL_DTL_TRACE_EXIT_ERR(LogLevel, Message) \
+    DTL_Trace(LogLevel,                            \
+              TRACE_TYPE_FEXIT,                    \
+              __FILE__,                            \
+              __FUNCTION__,                        \
+              __LINE__,                            \
+              Message);
+#else
+/* Both exit macros expand to nothing when AOCL_DTL_TRACE_ENABLE is not enabled */
+#define AOCL_DTL_TRACE_EXIT(LogLevel)
+#define AOCL_DTL_TRACE_EXIT_ERR(LogLevel, Message)
+#endif
+
+#if AOCL_DTL_DUMP_ENABLE
+/* Macro to dump data. Buffer points to the data to be dumped, BufferSize
+      specifies the no. of bytes to be dumped and DataType specifies the data
+      type of Buffer. Expands to a call to DTL_DumpData(). */
+#define AOCL_DTL_DUMP(LogLevel, Buffer, BufferSize, DataType, String, OutputType) \
+    /* Call the Dump function to Dump the DATA */                                 \
+    DTL_DumpData(LogLevel,                                                        \
+                 Buffer,                                                          \
+                 BufferSize,                                                      \
+                 DataType,                                                        \
+                 String,                                                          \
+                 OutputType);
+#else
+/* Dummy macro definition if the AOCL_DTL_DUMP_ENABLE macro is not enabled.
+   It takes the same six parameters as the enabled form so call sites compile. */
+#define AOCL_DTL_DUMP(LogLevel, Buffer, BufferSize, DataType, String, OutputType)
+#endif
+
+#if AOCL_DTL_LOG_ENABLE
+/* Macro to log the Data */
+#define AOCL_DTL_LOG(LogLevel, Message) \
+    DTL_Trace(LogLevel,                 \
+              TRACE_TYPE_LOG,           \
+              __FILE__,                 \
+              __FUNCTION__,             \
+              __LINE__,                 \
+              Message);
+#else
+/* Dummy macro definition if the AOCL_DTL_LOG_ENABLE macro is not enabled */
+#define AOCL_DTL_LOG(LogLevel, Message)
+#endif
+
+#if AOCL_DTL_LOG_ENABLE
+
+void AOCL_DTL_start_perf_timer(void);
+uint64 AOCL_DTL_get_time_spent(void);
+
+/*
+ * Logging of inputs can be enabled by two methods:
+ *
+ * 1. Using environment variable AOCL_VERBOSE.
+ * 2. Using the AOCL_DTL_Enable_Logs() / AOCL_DTL_Disable_Logs() APIs.
+ *
+ * The API takes precedence over environment variable.
+ *
+ * A global flag is maintained in the code to track the final
+ * state of the logging feature.
+ */
+extern Bool gbIsLoggingEnabled;
+
+/* API to enable logging at runtime (expands to more than one statement) */
+#define AOCL_DTL_Enable_Logs() \
+    /* Initialize DTL if not already done so */ \
+    AOCL_DTL_INITIALIZE(AOCL_DTL_TRACE_LEVEL); \
+    gbIsLoggingEnabled = TRUE;
+
+/* API to disable logging at runtime (expands to more than one statement) */
+#define AOCL_DTL_Disable_Logs() \
+    /* Initialize DTL if not already done so */ \
+    AOCL_DTL_INITIALIZE(AOCL_DTL_TRACE_LEVEL); \
+    gbIsLoggingEnabled = FALSE;
+
+/* Macro to start the performance timer */
+#define AOCL_DTL_START_PERF_TIMER() \
+    AOCL_DTL_start_perf_timer()
+#else
+/* Dummy macro definition if the AOCL_DTL_LOG_ENABLE macro is not enabled */
+#define AOCL_DTL_START_PERF_TIMER()
+#endif
+
+/* Macro to initialize the prerequisites for debugging */
+#ifdef AOCL_DTL_INITIALIZE_ENABLE
+#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL) \
+    DTL_Initialize(CURRENT_LOG_LEVEL);
+#else
+/* Dummy macro definition if the AOCL_DTL_INITIALIZE macro is not enabled */
+#define AOCL_DTL_INITIALIZE(CURRENT_LOG_LEVEL)
+#endif
+
+/* Macro for uninitializing the prerequisite */
+#ifdef AOCL_DTL_INITIALIZE_ENABLE
+#define AOCL_DTL_UNINITIALIZE() \
+    DTL_Uninitialize();
+#else
+/* Dummy macro definition if the AOCL_DTL_INITIALIZE macro is not enabled */
+#define AOCL_DTL_UNINITIALIZE()
+#endif
+
+#ifdef AOCL_DTL_INITIALIZE_ENABLE
+/* Prototypes for initializing and uninitializing the debug functions */
+void DTL_Initialize(
+    uint32 ui32CurrentLogLevel);
+void DTL_Uninitialize(void);
+#endif
+
+#if (AOCL_DTL_TRACE_ENABLE || AOCL_DTL_LOG_ENABLE)
+/* Debug trace function prototypes */
+void DTL_Trace(
+    uint8 ui8LogLevel,
+    uint8 ui8LogType,
+    const int8 *pi8FileName,
+    const int8 *pi8FunctionName,
+    uint32 ui32LineNumber,
+    const int8 *pi8Message);
+
+#endif
+
+#if AOCL_DTL_DUMP_ENABLE
+/* Function Prototype for dumping the data */
+void DTL_DumpData(
+    uint8 ui8LogLevel,
+    void *pvBuffer,
+    uint32 ui32BufferSize,
+    uint8 ui8DataType,
+    int8 *pi8Message,
+    int8 i8OutputType);
+#endif
+
+#endif /* _AOCLDTL_H_ */
+
+/* --------------- End of aocldtl.h ----------------- */
diff --git a/aocl_dtl/aocldtlcf.h b/aocl_dtl/aocldtlcf.h
index 1f44f54405..408f38c516 100644
--- a/aocl_dtl/aocldtlcf.h
+++ b/aocl_dtl/aocldtlcf.h
@@ -1,77 +1,77 @@
-/*===================================================================
- * File Name :  aocldtlcf.h
- *
- * Description : This is configuration file for debug and trace
- *               libaray, all debug features (except auto trace)
- *               can be enabled/disabled in this file.
- *
- * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
- *
- *==================================================================*/
-
-#ifndef _AOCLDTLCF_H_
-#define _AOCLDTLCF_H_
-
-/* Macro for tracing the log If the user wants to enable tracing he has to
-   enable this macro by making it to 1 else 0 */
-#define AOCL_DTL_TRACE_ENABLE       0
-
-/* Macro for dumping the log If the user wants to enable dumping he has to
-   enable this macro by making it to 1 else 0 */
-#define AOCL_DTL_DUMP_ENABLE        0
-
-/* Macro for dumping the log If the user wants to enable input logs he has to
-   enable this macro by making it to 1 else 0 */
-#define AOCL_DTL_LOG_ENABLE         0
-
-/* Select the trace level till which you want to log the data */
-/* By default it will log for all levels */
-#define AOCL_DTL_TRACE_LEVEL         AOCL_DTL_LEVEL_TRACE_5
-
-/* user has to explicitly use the below macros to identify
-   ciriticality of the logged message */
-#define AOCL_DTL_LEVEL_ALL          (15)
-#define AOCL_DTL_LEVEL_TRACE_9      (14)
-#define AOCL_DTL_LEVEL_TRACE_8      (13)
-#define AOCL_DTL_LEVEL_TRACE_7      (12)     /* Kernels */
-#define AOCL_DTL_LEVEL_TRACE_6      (11)
-#define AOCL_DTL_LEVEL_TRACE_5      (10)
-#define AOCL_DTL_LEVEL_TRACE_4      (9)
-#define AOCL_DTL_LEVEL_TRACE_3      (8)
-#define AOCL_DTL_LEVEL_TRACE_2      (7)
-#define AOCL_DTL_LEVEL_TRACE_1      (6)       /* BLIS/BLAS API */
-#define AOCL_DTL_LEVEL_VERBOSE      (5)
-#define AOCL_DTL_LEVEL_INFO         (4)
-#define AOCL_DTL_LEVEL_MINOR        (3)
-#define AOCL_DTL_LEVEL_MAJOR        (2)
-#define AOCL_DTL_LEVEL_CRITICAL     (1)
-
-
-#define AOCL_DTL_TRACE_FILE         "aocldtl_trace.txt"
-#define AOCL_DTL_AUTO_TRACE_FILE    "aocldtl_auto_trace.rawfile"
-#define AOCL_DTL_LOG_FILE           "aocldtl_log.txt"
-
-/* The use can use below three macros for different data type while dumping data
- * or specify the size of data type in bytes macro for character data type */
-#define AOCL_CHAR_DATA_TYPE         (1)
-
-/* macro for short data type */
-#define AOCL_UINT16_DATA_TYPE       (2)
-
-/* macro for String data type */
-#define AOCL_STRING_DATA_TYPE       (3)
-
-/* macro for uint32 data type */
-#define AOCL_UINT32_DATA_TYPE       (4)
-
-/* macro for printing Hex values */
-#define AOCL_LOG_HEX_VALUE          ('x')
-
-/* macro for printing Decimal values */
-#define AOCL_LOG_DECIMAL_VALUE      ('d')
-
-
-
-#endif /* _AOCLDTLCF_H_ */
-
-/* --------------- End of aocldtlcf.h ----------------- */
+/*===================================================================
+ * File Name :  aocldtlcf.h
+ *
+ * Description : This is configuration file for debug and trace
+ *               library, all debug features (except auto trace)
+ *               can be enabled/disabled in this file.
+ *
+ * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ *==================================================================*/
+
+#ifndef _AOCLDTLCF_H_
+#define _AOCLDTLCF_H_
+
+/* Macro for tracing the log. To enable tracing, set this macro to 1;
+   otherwise set it to 0. */
+#define AOCL_DTL_TRACE_ENABLE       0
+
+/* Macro for dumping the log. To enable dumping, set this macro to 1;
+   otherwise set it to 0. */
+#define AOCL_DTL_DUMP_ENABLE        0
+
+/* Macro for logging the inputs. To enable input logs, set this macro to 1;
+   otherwise set it to 0. */
+#define AOCL_DTL_LOG_ENABLE         0
+
+/* Select the trace level till which you want to log the data */
+/* By default the trace level is AOCL_DTL_LEVEL_TRACE_5 */
+#define AOCL_DTL_TRACE_LEVEL         AOCL_DTL_LEVEL_TRACE_5
+
+/* The user has to explicitly use the macros below to identify the
+   criticality of the logged message */
+#define AOCL_DTL_LEVEL_ALL          (15)
+#define AOCL_DTL_LEVEL_TRACE_9      (14)
+#define AOCL_DTL_LEVEL_TRACE_8      (13)
+#define AOCL_DTL_LEVEL_TRACE_7      (12)     /* Kernels */
+#define AOCL_DTL_LEVEL_TRACE_6      (11)
+#define AOCL_DTL_LEVEL_TRACE_5      (10)
+#define AOCL_DTL_LEVEL_TRACE_4      (9)
+#define AOCL_DTL_LEVEL_TRACE_3      (8)
+#define AOCL_DTL_LEVEL_TRACE_2      (7)
+#define AOCL_DTL_LEVEL_TRACE_1      (6)       /* BLIS/BLAS API */
+#define AOCL_DTL_LEVEL_VERBOSE      (5)
+#define AOCL_DTL_LEVEL_INFO         (4)
+#define AOCL_DTL_LEVEL_MINOR        (3)
+#define AOCL_DTL_LEVEL_MAJOR        (2)
+#define AOCL_DTL_LEVEL_CRITICAL     (1)
+
+
+#define AOCL_DTL_TRACE_FILE         "aocldtl_trace.txt"
+#define AOCL_DTL_AUTO_TRACE_FILE    "aocldtl_auto_trace.rawfile"
+#define AOCL_DTL_LOG_FILE           "aocldtl_log.txt"
+
+/* The user can use the macros below to select the data type while dumping
+ * data, or specify the size of the data type in bytes. Character data type: */
+#define AOCL_CHAR_DATA_TYPE         (1)
+
+/* macro for short data type */
+#define AOCL_UINT16_DATA_TYPE       (2)
+
+/* macro for String data type */
+#define AOCL_STRING_DATA_TYPE       (3)
+
+/* macro for uint32 data type */
+#define AOCL_UINT32_DATA_TYPE       (4)
+
+/* macro for printing Hex values */
+#define AOCL_LOG_HEX_VALUE          ('x')
+
+/* macro for printing Decimal values */
+#define AOCL_LOG_DECIMAL_VALUE      ('d')
+
+
+
+#endif /* _AOCLDTLCF_H_ */
+
+/* --------------- End of aocldtlcf.h ----------------- */
diff --git a/aocl_dtl/aoclfal.c b/aocl_dtl/aoclfal.c
index a317e69cbd..1eadf99b49 100644
--- a/aocl_dtl/aoclfal.c
+++ b/aocl_dtl/aoclfal.c
@@ -1,265 +1,265 @@
-/*===================================================================
- * File Name :  aoclfal.c
- * 
- * Description : Platform/os independed file handling API's
- *
- * Copyright (C) 2020, Advanced Micro Devices, Inc
- * 
- *==================================================================*/
-
-#include "aocltpdef.h"
-#include "aocldtl.h"
-#include "aoclfal.h"
-
-
-
-/* Disable instrumentation for following function, since they are called from 
- * Auto Generated execution trace handlers. */
-
-/* The FAL function declaration */
-int32 AOCL_FAL_Close(
-    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
-
-int32 AOCL_FAL_Error(
-    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
-
-AOCL_FAL_FILE *AOCL_FAL_Open(
-    const int8 *pchFileName,
-    const int8 *pchMode) __attribute__((no_instrument_function));
-
-int32 AOCL_FAL_Read(
-    void *pvBuffer,
-    int32 i32Size,
-    int32 i32Count,
-    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
-
-int32 AOCL_FAL_Write(
-    const void *pvBuffer,
-    int32 i32Size,
-    int32 iCount,
-    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
-
-/*=============================================================================
-*  Function Name       :   AOCL_FAL_Open
-*  Description         :   Used for opening a file specified by name
-*  Input Parameter(s)  :   int8 *pchFileName - Stores the file name (path)
-*                          int8 *pchMode - Specify the mode for opening file
-*  Output Parameter(s) :   None
-*  Return parameter(s) :   AOCL_FAL_FILE - If the file is opened successfully
-*                          NULL - If there is any error while opening file
-*============================================================================*/
-AOCL_FAL_FILE *AOCL_FAL_Open(
-    const int8 *pchFileName,
-    const int8 *pchMode)
-{
-    AOCL_FAL_FILE *fpFileOpen = NULL;
-    /* Open the file with provided by specified path and mode in which it should
-      be opened. Refer to FILE I/O operation help for getting mode types */
-    fpFileOpen = fopen(pchFileName, pchMode);
-    /* If the file is not opened then NULL value should be returned */
-    if (NULL == fpFileOpen)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Cannot open file: AOCL_FAL_Open()");
-    }
-    return fpFileOpen;
-} /* end of AOCL_FAL_Open */
-
-/*=============================================================================
-*  Function Name       :   AOCL_FAL_Close
-*  Description         :   Used for closing a file specified by file pointer
-*  Input Parameter(s)  :   AOCL_FAL_FILE *fpFilePointer - File pointer
-*  Output Parameter(s) :   None
-*  Return parameter(s) :   0 - If the file is closed successfully
-*                          AOCL_FAL_CLOSE_ERROR - For any error while closing file
-*
-*============================================================================*/
-int32 AOCL_FAL_Close(
-    AOCL_FAL_FILE *fpFilePointer)
-{
-    /* Return value for the file close */
-    int32 i32RetVal;
-    i32RetVal = AOCL_FAL_CLOSE_ERROR;
-
-    /* Check whether the file pointer passed is valid or not */
-    if (NULL == fpFilePointer)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Can not close file: AOCL_FAL_Close()");
-        return i32RetVal;
-    }
-
-    /* Close the file using the FILE pointer passed */
-    i32RetVal = fclose(fpFilePointer);
-
-    /* If the return value is non zero then it indicates an error */
-    if (i32RetVal)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
-                     "Can't close file, Invalid file pointer passed");
-        return i32RetVal;
-    }
-
-    /* On successful closing of the file, function should return 0 */
-    return i32RetVal;
-
-} /* End of AOCL_FAL_Close */
-
-/*=============================================================================
-*  Function Name       :   AOCL_FAL_Read
-*  Description         :   Used for reading a file specified by file pointer.
-*                          This function reads the specified number of bytes
-*                          from the file into the buffer specified. The bytes
-*                          read are returned by this function.
-*  Input Parameter(s)  :   int32 i32Size - Item size in bytes
-*                          int32 i32Count - Maximum number of items to be read
-*                          AOCL_FAL_FILE *fpFilePointer - File ptr to read from
-*  Output Parameter(s) :   void *pvBuffer - Storage location of data
-*  Return parameter(s) :   i32RetVal - Number of bytes read if successful
-*                          AOCL_FAL_READ_ERROR - In case of error while reading
-*============================================================================*/
-int32 AOCL_FAL_Read(
-    void *pvBuffer,
-    int32 i32Size,
-    int32 i32Count,
-    AOCL_FAL_FILE *fpFilePointer)
-{
-    /* Return value for the file read */
-    int32 i32RetVal;
-    i32RetVal = AOCL_FAL_READ_ERROR;
-
-    /* Check pointer used for pointing the storage location data is valid */
-    if (NULL == pvBuffer)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
-                     "Can not read the file, Buffer pointer is NULL");
-        return i32RetVal;
-    }
-
-    /* Check whether file pointer passed is valid */
-    if (NULL == fpFilePointer)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
-                     "Can not read the file, Buffer pointer is NULL");
-        return i32RetVal;
-    }
-
-    /* Read the file using file pointer */
-    i32RetVal = fread(pvBuffer, i32Size, i32Count, fpFilePointer);
-
-    if (i32RetVal != i32Count)
-    {
-        /* Check whether this is an end of file The AOCL_FAL_Error() will return
-         non-zero value to indicate an error */
-        if (AOCL_FAL_Error(fpFilePointer)) /* AOCL_FAL_EndOfFile (fpFilePointer) */
-        {
-            AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
-                         "There is an error condition while file read");
-            i32RetVal = AOCL_FAL_READ_ERROR;
-        }
-        /* This is condition where file read has encountered an end of file */
-        else
-        {
-            AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "End of file...");
-        }
-    }
-
-    /* The number of bytes read by the file read operation.
-    * This value may be less than the actual count, due to end of file
-    * or an error while reading the file */
-    return i32RetVal;
-
-} /* End of AOCL_FAL_Read */
-
-/*=============================================================================
-*  Function Name       :   AOCL_FAL_Write
-*  Description         :   Used for writing data to a file specified by file
-*                          pointer. The number of bytes written to file are
-*                          written by this function.
-*  Input Parameter(s)  :   const void *pvBuffer - Pointer to data location from
-*                                                 where the data to be copied
-                           int32 i32Size - Item size in bytes
-*                          int32 i32Count - Maximum number of items to be
-*                                           written
-*                          AOCL_FAL_FILE *fpFilePointer - File pointer to write to
-*  Output Parameter(s) :   None
-*  Return parameter(s) :   i32RetVal - Number of bytes written if successful
-*                          AOCL_FAL_WRITE_ERROR - In case of error while writing
-*============================================================================*/
-int32 AOCL_FAL_Write(
-    const void *pvBuffer,
-    int32 i32Size,
-    int32 iCount,
-    AOCL_FAL_FILE *fpFilePointer)
-{
-    /* Return value for write operation */
-    int32 i32RetVal;
-    i32RetVal = AOCL_FAL_WRITE_ERROR;
-    /* Check pointer used for pointing the storage location data is valid */
-    if (NULL == pvBuffer)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Can not perform file write");
-        return i32RetVal;
-    }
-
-    /* Check whether the file pointer passed is valid or not */
-    if (NULL == fpFilePointer)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Can not perform file write");
-        return i32RetVal;
-    }
-
-    /* Write into the file specified by the file pointer */
-    i32RetVal = fwrite(pvBuffer, i32Size, iCount, fpFilePointer);
-
-    /* If the number of bytes written into the file are less than specified
-    * bytes then it is an error while file writing */
-    if (i32RetVal != iCount)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "File write operation error");
-        i32RetVal = AOCL_FAL_WRITE_ERROR;
-    }
-
-    /* The return value of the file write operation */
-    return i32RetVal;
-
-} /* End of AOCL_FAL_Write */
-
-/*=============================================================================
-*  Function Name       :   AOCL_FAL_Error
-*  Description         :   Used for testing an error on the file specified
-*  Input Parameter(s)  :   AOCL_FAL_FILE *fpFilePointer - File pointer
-*  Output Parameter(s) :   None
-*  Return parameter(s) :   non-zero - Indicates an end of file
-*                          0 - Indicates that function is successful
-*                          non-zero - Indicates that there is some error
-*                          AOCL_FAL_ERROR - Indicates error during the operation
-*============================================================================*/
-int32 AOCL_FAL_Error(
-    AOCL_FAL_FILE *fpFilePointer)
-{
-    /* Used for storing the return value for ferror function */
-    int32 i32RetVal;
-    i32RetVal = AOCL_FAL_FERROR;
-
-    /* Check whether the file pointer is NULL */
-    if (NULL == fpFilePointer)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Invalid file pointer is passed");
-        return i32RetVal;
-    }
-
-    /* Call the ferror function to get an error on the file */
-    i32RetVal = ferror(fpFilePointer);
-
-    /* Check for the return value, it non-zero there is an error */
-    if (i32RetVal)
-    {
-        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "The file has some error");
-        i32RetVal = AOCL_FAL_FERROR;
-    }
-
-    /* In case of success, this function should return 0 */
-    return i32RetVal;
-
-} /* End of AOCL_FAL_Error */
-
-/* ------------------- End of aoclfal.c ----------------------- */
+/*===================================================================
+ * File Name :  aoclfal.c
+ * 
+ * Description : Platform/OS independent file handling APIs
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc
+ * 
+ *==================================================================*/
+
+#include "aocltpdef.h"
+#include "aocldtl.h"
+#include "aoclfal.h"
+
+
+
+/* Disable instrumentation for the following functions, since they are called
+ * from auto-generated execution trace handlers. */
+
+/* The FAL function declaration */
+int32 AOCL_FAL_Close(
+    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
+
+int32 AOCL_FAL_Error(
+    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
+
+AOCL_FAL_FILE *AOCL_FAL_Open(
+    const int8 *pchFileName,
+    const int8 *pchMode) __attribute__((no_instrument_function));
+
+int32 AOCL_FAL_Read(
+    void *pvBuffer,
+    int32 i32Size,
+    int32 i32Count,
+    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
+
+int32 AOCL_FAL_Write(
+    const void *pvBuffer,
+    int32 i32Size,
+    int32 iCount,
+    AOCL_FAL_FILE *fpFilePointer) __attribute__((no_instrument_function));
+
+/*=============================================================================
+*  Function Name       :   AOCL_FAL_Open
+*  Description         :   Opens the file specified by name using fopen()
+*  Input Parameter(s)  :   const int8 *pchFileName - The file name (path)
+*                          const int8 *pchMode - Mode string passed to fopen()
+*  Output Parameter(s) :   None
+*  Return parameter(s) :   AOCL_FAL_FILE * - If the file is opened successfully
+*                          NULL - If there is any error while opening file
+*============================================================================*/
+AOCL_FAL_FILE *AOCL_FAL_Open(
+    const int8 *pchFileName,
+    const int8 *pchMode)
+{
+    AOCL_FAL_FILE *fpFileOpen = NULL;
+    /* Open the file at the given path in the requested mode. See the fopen()
+      documentation for the supported mode strings */
+    fpFileOpen = fopen(pchFileName, pchMode);
+    /* On failure fopen() gives NULL; log it and hand the NULL to the caller */
+    if (NULL == fpFileOpen)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Cannot open file: AOCL_FAL_Open()");
+    }
+    return fpFileOpen;
+} /* end of AOCL_FAL_Open */
+
+/*=============================================================================
+*  Function Name       :   AOCL_FAL_Close
+*  Description         :   Used for closing a file specified by file pointer
+*  Input Parameter(s)  :   AOCL_FAL_FILE *fpFilePointer - File pointer
+*  Output Parameter(s) :   None
+*  Return parameter(s) :   0 - If the file is closed successfully
+*                          AOCL_FAL_CLOSE_ERROR - If fpFilePointer is NULL
+*                          non-zero fclose() result - If fclose() fails
+*============================================================================*/
+int32 AOCL_FAL_Close(
+    AOCL_FAL_FILE *fpFilePointer)
+{
+    /* Return value for the file close; defaults to the error code */
+    int32 i32RetVal;
+    i32RetVal = AOCL_FAL_CLOSE_ERROR;
+
+    /* Check whether the file pointer passed is valid or not */
+    if (NULL == fpFilePointer)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Can not close file: AOCL_FAL_Close()");
+        return i32RetVal;
+    }
+
+    /* Close the file using the FILE pointer passed */
+    i32RetVal = fclose(fpFilePointer);
+
+    /* A non-zero fclose() result indicates an error; it is returned as-is */
+    if (i32RetVal)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
+                     "Can't close file, Invalid file pointer passed");
+        return i32RetVal;
+    }
+
+    /* On successful closing of the file, function should return 0 */
+    return i32RetVal;
+
+} /* End of AOCL_FAL_Close */
+
+/*=============================================================================
+*  Function Name       :   AOCL_FAL_Read
+*  Description         :   Used for reading a file specified by file pointer.
+*                          This function reads up to i32Count items of i32Size
+*                          bytes each from the file into the buffer specified.
+*                          The number of items read is returned.
+*  Input Parameter(s)  :   int32 i32Size - Item size in bytes
+*                          int32 i32Count - Maximum number of items to be read
+*                          AOCL_FAL_FILE *fpFilePointer - File ptr to read from
+*  Output Parameter(s) :   void *pvBuffer - Storage location of data
+*  Return parameter(s) :   i32RetVal - Number of items read if successful
+*                          AOCL_FAL_READ_ERROR - In case of error while reading
+*============================================================================*/
+int32 AOCL_FAL_Read(
+    void *pvBuffer,
+    int32 i32Size,
+    int32 i32Count,
+    AOCL_FAL_FILE *fpFilePointer)
+{
+    /* Return value for the file read; stays an error until fread() is called */
+    int32 i32RetVal;
+    i32RetVal = AOCL_FAL_READ_ERROR;
+
+    /* Check pointer used for pointing the storage location data is valid */
+    if (NULL == pvBuffer)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
+                     "Can not read the file, Buffer pointer is NULL");
+        return i32RetVal;
+    }
+
+    /* Check whether file pointer passed is valid */
+    if (NULL == fpFilePointer)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
+                     "Can not read the file, File pointer is NULL");
+        return i32RetVal;
+    }
+
+    /* Read the file using file pointer; fread() returns the item count */
+    i32RetVal = fread(pvBuffer, i32Size, i32Count, fpFilePointer);
+
+    if (i32RetVal != i32Count)
+    {
+        /* A short read is either an error or end of file. AOCL_FAL_Error()
+         returns a non-zero value when ferror() reports an error */
+        if (AOCL_FAL_Error(fpFilePointer)) /* AOCL_FAL_EndOfFile (fpFilePointer) */
+        {
+            AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR,
+                         "There is an error condition while file read");
+            i32RetVal = AOCL_FAL_READ_ERROR;
+        }
+        /* No error is flagged, so the short read is treated as end of file */
+        else
+        {
+            AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "End of file...");
+        }
+    }
+
+    /* The number of items read by the file read operation.
+    * This value may be less than the requested count due to end of file;
+    * on a read error AOCL_FAL_READ_ERROR is returned instead */
+    return i32RetVal;
+
+} /* End of AOCL_FAL_Read */
+
+/*=============================================================================
+*  Function Name       :   AOCL_FAL_Write
+*  Description         :   Used for writing data to a file specified by file
+*                          pointer. The number of items written to the file
+*                          is returned by this function.
+*  Input Parameter(s)  :   const void *pvBuffer - Pointer to data location from
+*                                                 where the data to be copied
+*                          int32 i32Size - Item size in bytes
+*                          int32 iCount - Maximum number of items to be
+*                                         written
+*                          AOCL_FAL_FILE *fpFilePointer - File pointer to write to
+*  Output Parameter(s) :   None
+*  Return parameter(s) :   i32RetVal - Number of items written if successful
+*                          AOCL_FAL_WRITE_ERROR - In case of error while writing
+*============================================================================*/
+int32 AOCL_FAL_Write(
+    const void *pvBuffer,
+    int32 i32Size,
+    int32 iCount,
+    AOCL_FAL_FILE *fpFilePointer)
+{
+    /* Return value for write operation; defaults to the error code */
+    int32 i32RetVal;
+    i32RetVal = AOCL_FAL_WRITE_ERROR;
+    /* Check pointer used for pointing the storage location data is valid */
+    if (NULL == pvBuffer)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Can not perform file write");
+        return i32RetVal;
+    }
+
+    /* Check whether the file pointer passed is valid or not */
+    if (NULL == fpFilePointer)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Can not perform file write");
+        return i32RetVal;
+    }
+
+    /* Write into the file specified by the file pointer */
+    i32RetVal = fwrite(pvBuffer, i32Size, iCount, fpFilePointer);
+
+    /* If the number of items written into the file differs from the
+    * requested count then it is an error while file writing */
+    if (i32RetVal != iCount)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "File write operation error");
+        i32RetVal = AOCL_FAL_WRITE_ERROR;
+    }
+
+    /* The return value of the file write operation */
+    return i32RetVal;
+
+} /* End of AOCL_FAL_Write */
+
+/*=============================================================================
+*  Function Name       :   AOCL_FAL_Error
+*  Description         :   Used for testing an error on the file specified
+*  Input Parameter(s)  :   AOCL_FAL_FILE *fpFilePointer - File pointer
+*  Output Parameter(s) :   None
+*  Return parameter(s) :   0 - ferror() reports no error on the file
+*                          AOCL_FAL_FERROR - fpFilePointer is NULL, or
+*                                            ferror() reports an error on
+*                                            the file
+*============================================================================*/
+int32 AOCL_FAL_Error(
+    AOCL_FAL_FILE *fpFilePointer)
+{
+    /* Return value; defaults to the error code for the NULL-pointer path */
+    int32 i32RetVal;
+    i32RetVal = AOCL_FAL_FERROR;
+
+    /* Check whether the file pointer is NULL */
+    if (NULL == fpFilePointer)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "Invalid file pointer is passed");
+        return i32RetVal;
+    }
+
+    /* Call the ferror function to get an error on the file */
+    i32RetVal = ferror(fpFilePointer);
+
+    /* A non-zero ferror() result is normalized to AOCL_FAL_FERROR */
+    if (i32RetVal)
+    {
+        AOCL_DTL_LOG(AOCL_DTL_LEVEL_MAJOR, "The file has some error");
+        i32RetVal = AOCL_FAL_FERROR;
+    }
+
+    /* In case of success, this function should return 0 */
+    return i32RetVal;
+
+} /* End of AOCL_FAL_Error */
+
+/* ------------------- End of aoclfal.c ----------------------- */
diff --git a/aocl_dtl/aoclfal.h b/aocl_dtl/aoclfal.h
index 9b8074528d..401ed4c355 100644
--- a/aocl_dtl/aoclfal.h
+++ b/aocl_dtl/aoclfal.h
@@ -1,50 +1,50 @@
-/*===================================================================
- * File Name :  aoclfal.h
- * 
- * Description : Interfaces for platform/os independed file 
- *               handling API's
- *
- * Copyright (C) 2020, Advanced Micro Devices, Inc
- * 
- *==================================================================*/
-
-#ifndef _AOCL_FAL_H_
-#define _AOCL_FAL_H_
-
-/* The possible error values of FAL */
-#define AOCL_FAL_SUCCESS             0
-#define AOCL_FAL_CLOSE_ERROR        -1
-#define AOCL_FAL_READ_ERROR         -2
-#define AOCL_FAL_WRITE_ERROR        -3
-#define AOCL_FAL_EOF_ERROR          -6
-#define AOCL_FAL_FERROR             -7
-
-/* The type definition for FILE */
-#define AOCL_FAL_FILE FILE
-
-/* The FAL function declaration */
-int32 AOCL_FAL_Close(
-    AOCL_FAL_FILE *fpFilePointer);
-
-int32 AOCL_FAL_Error(
-    AOCL_FAL_FILE *fpFilePointer);
-
-AOCL_FAL_FILE *AOCL_FAL_Open(
-    const int8 *pchFileName,
-    const int8 *pchMode);
-
-int32 AOCL_FAL_Read(
-    void *pvBuffer,
-    int32 i32Size,
-    int32 i32Count,
-    AOCL_FAL_FILE *fpFilePointer);
-
-int32 AOCL_FAL_Write(
-    const void *pvBuffer,
-    int32 i32Size,
-    int32 iCount,
-    AOCL_FAL_FILE *fpFilePointer);
-
-#endif /* _AOCL_FAL_H_ */
-
-/* --------------- End of aoclfal.h ----------------- */
+/*===================================================================
+ * File Name :  aoclfal.h
+ * 
+ * Description : Interfaces for platform/OS-independent file
+ *               handling APIs
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc
+ * 
+ *==================================================================*/
+
+#ifndef _AOCL_FAL_H_
+#define _AOCL_FAL_H_
+
+/* Status codes used by the FAL functions: 0 is success, negative values are errors (-4 and -5 are not defined here) */
+#define AOCL_FAL_SUCCESS             0
+#define AOCL_FAL_CLOSE_ERROR        -1
+#define AOCL_FAL_READ_ERROR         -2
+#define AOCL_FAL_WRITE_ERROR        -3  /* set by AOCL_FAL_Write when its result differs from iCount */
+#define AOCL_FAL_EOF_ERROR          -6
+#define AOCL_FAL_FERROR             -7  /* returned by AOCL_FAL_Error for a NULL pointer or a non-zero ferror() */
+
+/* FAL file handle: an alias for the C stdio FILE type */
+#define AOCL_FAL_FILE FILE
+
+/* FAL function declarations. NOTE(review): this header includes nothing itself, so FILE, int32 and int8 must already be declared by the includer (aocltpdef.h includes <stdio.h> and defines both typedefs) */
+int32 AOCL_FAL_Close(
+    AOCL_FAL_FILE *fpFilePointer);
+
+int32 AOCL_FAL_Error(
+    AOCL_FAL_FILE *fpFilePointer);
+
+AOCL_FAL_FILE *AOCL_FAL_Open(
+    const int8 *pchFileName,
+    const int8 *pchMode);
+
+int32 AOCL_FAL_Read(
+    void *pvBuffer,
+    int32 i32Size,
+    int32 i32Count,
+    AOCL_FAL_FILE *fpFilePointer);
+
+int32 AOCL_FAL_Write(
+    const void *pvBuffer,
+    int32 i32Size,
+    int32 iCount,
+    AOCL_FAL_FILE *fpFilePointer);
+
+#endif /* _AOCL_FAL_H_ */
+
+/* --------------- End of aoclfal.h ----------------- */
diff --git a/aocl_dtl/aocltpdef.h b/aocl_dtl/aocltpdef.h
index 7c08455369..d842fffbac 100644
--- a/aocl_dtl/aocltpdef.h
+++ b/aocl_dtl/aocltpdef.h
@@ -1,42 +1,42 @@
-
-/*===================================================================
- * File Name :  aocltpdef.h
- *
- * Description : Abstraction for various datatypes used by DTL.
- *
- * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
- *
- *==================================================================*/
-#ifndef AOCL_TYPEDEF_H_
-#define AOCL_TYPEDEF_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <memory.h>
-#include <time.h>
-#include <math.h>
-#ifndef _WIN32
-#include <sys/types.h>
-#else
-typedef int pid_t;
-#endif
-
-typedef double                  Double;
-typedef float                   Float;
-typedef void                    Void;
-typedef unsigned char           uint8;
-typedef unsigned short int      uint16;
-typedef unsigned int            uint32;
-typedef unsigned long           uint64;
-typedef uint8                   *STRING;
-typedef unsigned char           Bool;
-typedef char                    int8;
-typedef signed long int         int32;
-typedef short int               int16;
-
-typedef Void                    *AOCL_HANDLE;
-typedef pid_t                   AOCL_TID;
-
-#endif /*AOCL_TYPEDEF_H_ */
-
-/* --------------- End of aocltpdef.h ----------------- */
+
+/*===================================================================
+ * File Name :  aocltpdef.h
+ *
+ * Description : Abstraction for various datatypes used by DTL.
+ *
+ * Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ *==================================================================*/
+#ifndef AOCL_TYPEDEF_H_
+#define AOCL_TYPEDEF_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <time.h>
+#include <math.h>
+#ifndef _WIN32
+#include <sys/types.h> /* supplies pid_t on non-Windows builds */
+#else
+typedef int pid_t; /* Windows branch: pid_t is declared locally as int */
+#endif
+
+typedef double                  Double;
+typedef float                   Float;
+typedef void                    Void;
+typedef unsigned char           uint8;
+typedef unsigned short int      uint16;
+typedef unsigned int            uint32;
+typedef unsigned long           uint64; /* NOTE(review): unsigned long is 64-bit only on LP64 targets (32-bit on LLP64 Windows) - confirm intent */
+typedef uint8                   *STRING; /* pointer to unsigned bytes, not to plain char */
+typedef unsigned char           Bool;
+typedef char                    int8; /* plain char: signedness is implementation-defined */
+typedef signed long int         int32; /* NOTE(review): long is 64-bit on LP64 targets, so this is wider than 32 bits there - confirm intent */
+typedef short int               int16;
+
+typedef Void                    *AOCL_HANDLE; /* opaque handle: a void pointer */
+typedef pid_t                   AOCL_TID; /* id type built on pid_t */
+
+#endif /*AOCL_TYPEDEF_H_ */
+
+/* --------------- End of aocltpdef.h ----------------- */
diff --git a/aocl_dtl/test_dtl.c b/aocl_dtl/test_dtl.c
index 978f4ac44b..08ff3296c3 100644
--- a/aocl_dtl/test_dtl.c
+++ b/aocl_dtl/test_dtl.c
@@ -1,96 +1,96 @@
-/*===================================================================
- * File Name :  test_dtl.c
- * 
- * Description : Unit test cases for dtl.
- *
- * Copyright (C) 2020, Advanced Micro Devices, Inc
- * 
- *==================================================================*/
-
-#if 0 // Disable this for normal build.
-
-#include "aocltpdef.h"
-#include "aocldtl.h"
-
-int aocl_allocate(double**A, double** B, double** C, int N)
-{
-	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-
-	*A = (double*)malloc(sizeof(double) * N);
-	if (*A == NULL)
-	{
-		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Error allocating memory to A");
-		return 1;
-	}
-
-	*B = (double*)malloc(sizeof(double) * N);
-	if (*B == NULL)
-	{
-		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Error allocating memory to B");
-		return 1;
-	}
-
-	*C = (double*)malloc(sizeof(double) * N);
-	if (*C == NULL)
-	{
-		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Error allocating memory to C");
-		return 1;
-	}
-
-	for (int i = 0; i < N; i++)
-	{
-		(*A)[i] = (double)((i + 1) * 1.0);
-		(*B)[i] = (double)((i - 1) * 1.0);
-		(*C)[i] = (double)((i) * 1.0);
-	}
-
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO, " aocl_allocate()");
-	return 0;
-}
-
-void sumV(double* A, double* B, double* C, int N)
-{
-	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-	if ((A == NULL) || (B == NULL) || (C == NULL))
-	{
-		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Invalid Pointers");
-		return;
-	}
-	for (int i = 0; i < N; i++)
-	{
-		C[i] += A[i] + B[i];
-	}
-
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
-}
-
-int main(void)
-{
-	int status = 0;
-	double* A = NULL;
-	double* B = NULL;
-	double* C = NULL;
-
-	printf("Initializing\n");
-	AOCL_DTL_INITIALIZE(AOCL_DTL_LEVEL_ALL);
-
-	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-
-	status = aocl_allocate(&A, &B, &C, 120);
-	if (status != 0)
-	{
-		printf("Error allocating memory\n");
-
-		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_CRITICAL, "Error in function aocl_allocate()");
-		AOCL_DTL_UNINITIALIZE();
-		exit(1);
-	}
-
-	sumV(A, B, C, 120);
-	
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
-	AOCL_DTL_UNINITIALIZE();
-
-	return 0;
-}
-#endif
+/*===================================================================
+ * File Name :  test_dtl.c
+ * 
+ * Description : Unit test cases for dtl.
+ *
+ * Copyright (C) 2020, Advanced Micro Devices, Inc
+ * 
+ *==================================================================*/
+
+#if 0 // Disable this for normal build.
+
+#include "aocltpdef.h"
+#include "aocldtl.h"
+
+int aocl_allocate(double**A, double** B, double** C, int N) /* Allocates three N-element double arrays, fills them with i+1, i-1 and i; returns 0 on success, 1 if any malloc fails. */
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
+
+	*A = (double*)malloc(sizeof(double) * N);
+	if (*A == NULL)
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Error allocating memory to A");
+		return 1;
+	}
+
+	*B = (double*)malloc(sizeof(double) * N);
+	if (*B == NULL)
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Error allocating memory to B");
+		return 1; /* NOTE(review): *A is not freed on this path */
+	}
+
+	*C = (double*)malloc(sizeof(double) * N);
+	if (*C == NULL)
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Error allocating memory to C");
+		return 1; /* NOTE(review): *A and *B are not freed on this path */
+	}
+
+	for (int i = 0; i < N; i++) /* deterministic fill: A[i]=i+1, B[i]=i-1, C[i]=i */
+	{
+		(*A)[i] = (double)((i + 1) * 1.0);
+		(*B)[i] = (double)((i - 1) * 1.0);
+		(*C)[i] = (double)((i) * 1.0);
+	}
+
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO, " aocl_allocate()"); /* NOTE(review): the other AOCL_DTL_TRACE_EXIT calls in this file pass only the level - confirm the macro's arity */
+	return 0;
+}
+
+void sumV(double* A, double* B, double* C, int N) /* Accumulates C[i] += A[i] + B[i] for the first N elements. */
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
+	if ((A == NULL) || (B == NULL) || (C == NULL)) /* nothing is computed if any array is missing */
+	{
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_MAJOR, "Invalid Pointers");
+		return;
+	}
+	for (int i = 0; i < N; i++)
+	{
+		C[i] += A[i] + B[i];
+	}
+
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
+}
+
+int main(void) /* Test driver: initialises DTL, allocates three 120-element vectors, runs sumV, then uninitialises DTL. */
+{
+	int status = 0;
+	double* A = NULL;
+	double* B = NULL;
+	double* C = NULL;
+
+	printf("Initializing\n");
+	AOCL_DTL_INITIALIZE(AOCL_DTL_LEVEL_ALL);
+
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
+
+	status = aocl_allocate(&A, &B, &C, 120);
+	if (status != 0) /* non-zero means one of the three allocations failed */
+	{
+		printf("Error allocating memory\n");
+
+		AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_CRITICAL, "Error in function aocl_allocate()");
+		AOCL_DTL_UNINITIALIZE();
+		exit(1);
+	}
+
+	sumV(A, B, C, 120);
+	
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
+	AOCL_DTL_UNINITIALIZE();
+
+	return 0; /* NOTE(review): A, B and C are never passed to free() before returning */
+}
+#endif
diff --git a/bench/Makefile b/bench/Makefile
index 0203d5a5b0..751f7129a5 100755
--- a/bench/Makefile
+++ b/bench/Makefile
@@ -303,4 +303,4 @@ endif
 clean: cleanx
 
 cleanx:
-	- $(RM_F) *.o *.x
\ No newline at end of file
+	- $(RM_F) *.o *.x
diff --git a/bench/bench_amaxv.c b/bench/bench_amaxv.c
index 2a0e578975..eb37319b6f 100644
--- a/bench/bench_amaxv.c
+++ b/bench/bench_amaxv.c
@@ -247,4 +247,4 @@ int main( int argc, char** argv )
     fclose(fout);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/bench/bench_aocl_gemm/Makefile b/bench/bench_aocl_gemm/Makefile
index 91b3a7b587..897a982ba3 100755
--- a/bench/bench_aocl_gemm/Makefile
+++ b/bench/bench_aocl_gemm/Makefile
@@ -106,7 +106,8 @@ CFLAGS         += -I$(TEST_SRC_PATH)
 all: blis
 
 blis: \
-       bench_lpgemm_blis.x
+       bench_lpgemm_blis.x \
+       bench_lpgemm_utils_blis.x
 
 
 # --Object file rules --
diff --git a/bench/bench_aocl_gemm/bench_input.txt b/bench/bench_aocl_gemm/bench_input.txt
index d8b8226a13..9034a0d550 100644
--- a/bench/bench_aocl_gemm/bench_input.txt
+++ b/bench/bench_aocl_gemm/bench_input.txt
@@ -1,3 +1,199 @@
+u r p 480 20 2050 2050 20 20
+u r p 481 20 2050 2050 20 20
+u r p 482 20 2050 2050 20 20
+u r p 483 20 2050 2050 20 20
+u r R 484 20 2050 2050 20 20
+u r R 485 20 2050 2050 20 20
+u r R 480 39 2050 2050 39 39
+u r R 481 39 2050 2050 39 39
+u r R 482 39 2050 2050 39 39
+u r R 483 39 2050 2050 39 39
+u r R 484 39 2050 2050 39 39
+u r p 485 39 2050 2050 39 39
+u r p 480 50 2050 2050 50 50
+u r p 481 50 2050 2050 50 50
+u r p 482 50 2050 2050 50 50
+u r p 483 50 2050 2050 50 50
+u r p 484 50 2050 2050 50 50
+u r p 485 50 2050 2050 50 50
+u r R 480 1108 2050 2050 1108 1108
+u r R 481 1108 2050 2050 1108 1108
+u r R 482 1108 2050 2050 1108 1108
+u r R 483 1108 2050 2050 1108 1108
+u r R 484 1108 2050 2050 1108 1108
+u r R 485 1108 2050 2050 1108 1108
+u r R 480 1127 2050 2050 1127 1127
+u r R 481 1127 2050 2050 1127 1127
+u r R 482 1127 2050 2050 1127 1127
+u r R 483 1127 2050 2050 1127 1127
+u r p 484 1127 2050 2050 1127 1127
+u r p 485 1127 2050 2050 1127 1127
+u r p 480 1138 2050 2050 1138 1138
+u r p 481 1138 2050 2050 1138 1138
+u r p 482 1138 2050 2050 1138 1138
+u r p 483 1138 2050 2050 1138 1138
+u r p 484 1138 2050 2050 1138 1138
+u r p 485 1138 2050 2050 1138 1138
+u r p 1 1 3 3 1 1
+u r p 1 9 3 3 9 9
+u r p 1 2048 3 3 2048 2048
+u r p 1 2048 5192 5192 2048 2048
+u r p 9 1 3 3 1 1
+u r p 576 1 3500 3500 1 1
+u r p 1 1 1 1 1 1
+u r p 102 1088 1024 1024 1088 1088
+u r p 102 2048 1024 1024 2048 2048
+u r p 485 656 1024 1024 656 656
+u r p 483 656 1024 1024 656 656
+u r p 81 128 3 3 128 128
+u r p 1022 512 515 515 512 512
+u r p 74 512 515 515 512 512
+u r p 253 2048 515 515 2048 2048
+u r p 8192 1040 515 515 1040 1040
+u r p 10 1029 515 515 1029 1029
+u r p 24 1040 2050 2050 1040 1040
+u r p 1024 1029 2050 2050 1029 1029
+u r p 480 660 2050 2050 660 660
+u r p 481 660 2050 2050 660 660
+u r p 482 660 2050 2050 660 660
+u r p 483 660 2050 2050 660 660
+u r p 484 660 2050 2050 660 660
+u r p 485 660 2050 2050 660 660
+u r p 480 679 2050 2050 679 679
+u r p 481 679 2050 2050 679 679
+u r p 482 679 2050 2050 679 679
+u r p 483 679 2050 2050 679 679
+u r p 484 679 2050 2050 679 679
+u r p 485 679 2050 2050 679 679
+u r p 480 690 2050 2050 690 690
+u r p 481 690 2050 2050 690 690
+u r p 482 690 2050 2050 690 690
+u r p 483 690 2050 2050 690 690
+u r p 484 690 2050 2050 690 690
+u r p 485 690 2050 2050 690 690
+u r p 480 660 2048 2048 660 660
+u r p 481 660 2048 2048 660 660
+u r p 482 660 2048 2048 660 660
+u r p 483 660 2048 2048 660 660
+u r p 484 660 2048 2048 660 660
+u r p 485 660 2048 2048 660 660
+u r p 480 679 2048 2048 679 679
+u r p 481 679 2048 2048 679 679
+u r p 482 679 2048 2048 679 679
+u r p 483 679 2048 2048 679 679
+u r p 484 679 2048 2048 679 679
+u r p 485 679 2048 2048 679 679
+u r p 480 690 2048 2048 690 690
+u r p 481 690 2048 2048 690 690
+u r p 482 690 2048 2048 690 690
+u r p 483 690 2048 2048 690 690
+u r p 484 690 2048 2048 690 690
+u r p 485 690 2048 2048 690 690
+u r p 480 656 1024 1024 656 656
+u r p 480 128 3 3 128 128
+u r p 1024 512 515 515 512 512
+u r p 1024 2048 1024 1024 2048 2048
+u r p 1024 2048 515 515 2048 2048
+u r p 1024 1040 515 515 1040 1040
+u r p 5 1029 515 515 1029 1029
+u r p 1024 1029 515 515 1029 1029
+u r p 1024 1040 2050 2050 1040 1040
+u r p 1029 1029 2050 2050 1029 1029
+u r R 480 646 2050 2050 646 646
+u r R 481 646 2050 2050 646 646
+u r R 482 646 2050 2050 646 646
+u r R 483 646 2050 2050 646 646
+u r R 484 646 2050 2050 646 646
+u r R 485 646 2050 2050 646 646
+u r R 481 656 2050 2050 656 656
+u r R 482 656 2050 2050 656 656
+u r R 483 656 2050 2050 656 656
+u r R 484 656 2050 2050 656 656
+u r p 485 656 2050 2050 656 656
+u r p 480 672 2050 2050 672 672
+u r p 481 672 2050 2050 672 672
+u r p 482 672 2050 2050 672 672
+u r p 483 672 2050 2050 672 672
+u r p 484 672 2050 2050 672 672
+u r p 485 672 2050 2050 672 672
+u r p 480 688 2050 2050 688 688
+u r p 481 688 2050 2050 688 688
+u r r 482 688 2050 2050 688 688
+u r r 483 688 2050 2050 688 688
+u r r 484 688 2050 2050 688 688
+u r r 485 688 2050 2050 688 688
+u r r 1024 512 64 64 512 512
+u r r 16 256 512 512 256 256
+u r r 480 640 512 512 640 640
+u r r 64 768 512 512 768 768
+u r r 128 128 128 128 128 128
+u r r 1024 64 512 512 64 64
+u r r 1024 256 32 32 256 256
+u r r 1024 512 64 64 512 512
+u r r 480 640 512 512 640 640
+u r p 1024 32 256 256 32 32
+u r P 1024 64 512 512 64 64
+u r P 64 800 320 320 800 800
+u r P 64 768 512 512 768 768
+u r P 16 256 512 512 256 256
+u r P 128 128 128 128 128 128
+u r P 256 512 256 256 512 512
+u r P 1024 1024 1024 1024 1024 1024
+u r P 480 640 1024 1024 640 640
+u r P 480 640 256 256 640 640
+u r P 8 64 32 32 64 64
+u r P 9 64 32 32 64 64
+u r P 10 128 64 64 128 128
+u r P 8 8 8 8 8 8
+u r P 12 12 12 12 12 12
+u r P 25 25 25 25 25 25
+u r P 25 25 20 20 25 25
+u r r 4096 256 5 5 256 256
+u r r 3000 256 128 128 256 256
+u r r 4096 1024 512 512 1024 1024
+u r r 144 256 5 5 256 256
+u r r 144 256 128 128 256 256
+u r r 144 1024 512 512 1024 1024
+u r r 480 688 256 256 688 688
+u r r 480 640 512 512 640 640
+u r r 480 640 1024 1024 640 640
+u r r 64 800 320 320 800 800
+u r r 64 768 512 512 768 768
+u r r 16 256 512 512 256 256
+u r r 128 128 128 128 128 128
+u r r 256 512 256 256 512 512
+u r r 1024 1024 1024 1024 1024 1024
+u r r 1024 32 256 256 32 32
+u r r 1024 64 512 512 64 64
+u r r 1024 256 32 32 256 256
+u r r 1024 512 64 64 512 512
+u r r 512 32 256 256 32 32
+u r r 512 768 512 512 768 768
+u r r 512 256 32 32 256 256
+u r r 512 512 64 64 512 512
+u r r 512 256 768 768 256 256
+u r r 768 768 1024 1024 768 768
+u r r 768 768 768 768 768 768
+u r r 2048 2048 2048 2048 2048 2048
+u r r 4096 4096 4096 4096 4096 4096
+f c p 2482 1127 2050 2482 2050 2482
+f c p 2483 1127 2050 2483 2050 2483
+f c p 2484 1127 2050 2484 2050 2484
+f c p 2485 1127 2050 2485 2050 2485
+f c p 480 1138 2050 480 2050 480
+f c p 481 1138 2050 481 2050 481
+f c p 482 1138 2050 482 2050 482
+f c p 483 1138 2050 483 2050 483
+f c p 484 1138 2050 484 2050 484
+f c p 485 1138 2050 485 2050 485
+f c p 1 1 3 1 3 1
+f c p 1 9 3 1 3 1
+f c p 1 2048 3 1 3 1
+f c p 1 2048 5192 1 5192 1
+f c p 9 1 3 9 3 9
+f c p 576 1 3500 576 3500 576
+f c p 1 1 1 1 1 1
+f c p 102 1088 1024 102 1024 102
 b r r 480 20 2050 2050 20 20
 b r r 481 20 2050 2050 20 20
 b r r 482 20 2050 2050 20 20
diff --git a/bench/bench_aocl_gemm/bench_lpgemm.c b/bench/bench_aocl_gemm/bench_lpgemm.c
index 92b7a7a1a6..7dd049b159 100644
--- a/bench/bench_aocl_gemm/bench_lpgemm.c
+++ b/bench/bench_aocl_gemm/bench_lpgemm.c
@@ -1,3 +1,37 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -5,6 +39,7 @@
 #include <time.h>
 #include <float.h>
 #include <unistd.h>
+#include <math.h>
 
 #include "blis.h"
 
@@ -21,11 +56,39 @@ int32_t global_n_repeat = 0;
 
 char global_dscale_out = 'n';
 
+dim_t num_eltwise = 0; // To keep track of eltwise operations.
+
 #define _XSTR(str) #str
 #define XSTR(str) _XSTR(str)
 
 #define GEN_FUNC_NAME(prototype,ctype) prototype ## ctype
 
+inline void float_to_bf16( float* float_value, bfloat16* bf16_val )
+{
+	/* Truncating float->bf16 conversion: copies sizeof(bfloat16) bytes starting at byte offset 2 of the float (no rounding).
+	NOTE(review): those are the most significant bytes only on a little-endian target - confirm. */
+	memcpy( ( bf16_val ), (char *)( float_value ) + 2, sizeof ( bfloat16 ) );
+}
+
+inline float bf16_to_float /* Widens a bf16 bit pattern to float by placing its 16 bits in the upper half of a 32-bit word. */
+     (
+       bfloat16 bf16_val
+     )
+{
+	uint32_t inter_temp = ( ( uint32_t )( uint16_t )( *( ( int16_t* ) &bf16_val ) ) ) << 16; /* unsigned widen: no sign extension, so no left shift of a negative value */
+	float float_value;
+	memcpy( &float_value, &inter_temp, sizeof( float_value ) ); /* bit copy instead of reading an integer object through a float* (strict aliasing) */
+	return float_value;
+}
+
+inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size ) /* Converts the first `size` floats of array into array_bf16, one float_to_bf16 call per element. */
+{
+	for (int i=0; i< size; i++)
+	{
+		float_to_bf16( ( array + i ), ( array_bf16 + i ) );
+	}
+}
+
 #define GEN_FILL_ARRAY_FUNC(ctype) \
 void fill_array_ ## ctype ( void* arr, dim_t size ) \
 { \
@@ -38,21 +101,21 @@ void fill_array_ ## ctype ( void* arr, dim_t size ) \
 
 GEN_FILL_ARRAY_FUNC(uint8_t)
 GEN_FILL_ARRAY_FUNC(int8_t)
+GEN_FILL_ARRAY_FUNC(int16_t)
 GEN_FILL_ARRAY_FUNC(float)
 GEN_FILL_ARRAY_FUNC(int32_t)
 
-inline void float_to_bf16( float* float_value, bfloat16* bf16_val )
+void fill_array_bfloat16( void* arr, dim_t size ) /* Fills the first `size` bf16 elements of arr with the bf16 form of 2.0 via a temporary float buffer. */
 {
-	/*Set offset 2 to copy most significant 2 bytes of float 
-	to convert float values to bf16 values*/
-	memcpy( ( bf16_val ), (char *)( float_value ) + 2, sizeof ( bfloat16 ) );
-}
-
-inline void convert_float_arr_to_bf16( float* array, bfloat16* array_bf16, int size )
-{
-	for (int i=0; i< size; i++)
+	float* c_float = ( float* ) bli_malloc_user( sizeof( float ) * size ); /* float staging buffer */
+	if ( c_float != NULL ) /* guard before first use; arr is left untouched when the allocation fails */
 	{
-		float_to_bf16( ( array + i ), ( array_bf16 + i ) );
+		for ( dim_t i = 0; i < size; ++i )
+		{
+			c_float[i] = 2.0;
+		}
+		convert_float_arr_to_bf16( c_float, arr, size );
+		bli_free_user( c_float );
 	}
 }
 
@@ -178,6 +241,10 @@ GEN_BLIS_MAT_MUL_FUNC(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 GEN_BLIS_MAT_MUL_FUNC(bfloat16,bfloat16,float,float,bf16bf16f32of32)
 GEN_BLIS_MAT_MUL_FUNC(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 GEN_BLIS_MAT_MUL_FUNC(float,float,float,float,f32f32f32of32)
+GEN_BLIS_MAT_MUL_FUNC(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
+GEN_BLIS_MAT_MUL_FUNC(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
+GEN_BLIS_MAT_MUL_FUNC(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
+GEN_BLIS_MAT_MUL_FUNC(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
 
 double get_gflops
      (
@@ -234,7 +301,7 @@ void mat_mul_bench_driver_ ## BLAS_SFX \
 	{ \
 		if ( bench_mode == 'a' ) \
 		{ \
-			memset( ( void* ) c, 0, sizeof( C_type ) * m * n ); \
+			GEN_FUNC_NAME(fill_array_,C_type)( c, ( m * n ) ); \
 		} \
  \
 		struct timespec tstart={0,0}, tend={0,0}; \
@@ -269,6 +336,10 @@ GEN_MAT_MUL_BENCH_DRV_FUNC(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 GEN_MAT_MUL_BENCH_DRV_FUNC(bfloat16,bfloat16,float,float,bf16bf16f32of32)
 GEN_MAT_MUL_BENCH_DRV_FUNC(bfloat16,bfloat16,bfloat16,float,bf16bf16f32obf16)
 GEN_MAT_MUL_BENCH_DRV_FUNC(float,float,float,float,f32f32f32of32)
+GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
+GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
+GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
+GEN_MAT_MUL_BENCH_DRV_FUNC(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
 
 int max (int a, int b)
 {
@@ -280,33 +351,32 @@ int min (int a, int b)
 	return ( a < b ? a : b );
 }
 
-#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(C_type,ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \
-inline C_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \
+#define GEN_MAT_MUL_ACC_CHK_DOWNSCALE(ACCUM_type,SCALE_type,BLAS_DOWNSCALE_SFX) \
+inline ACCUM_type mat_mul_accuracy_check_downscale_ ## BLAS_DOWNSCALE_SFX \
      (\
        ACCUM_type temp_accum,\
-       C_type out_temp_accum, \
        aocl_post_op*  post_op, \
        dim_t j \
      )\
 {\
-	out_temp_accum = ( C_type ) min ( max ( nearbyintf( ( SCALE_type )temp_accum * \
+	ACCUM_type out_temp_accum = ( ACCUM_type ) min ( max ( nearbyintf( ( SCALE_type )temp_accum * \
 		( *( ( SCALE_type* )post_op->sum.scale_factor + j ) ) ), S8_MIN ), S8_MAX ) ; \
 	return 	out_temp_accum; \
 }\
 
-GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int16_t,float,u8s8s16os8)
-GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int8_t,int32_t,float,u8s8s32os8)
+GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,u8s8s16os8)
+GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,u8s8s32os8)
+GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int32_t,float,s8s8s32os8)
+GEN_MAT_MUL_ACC_CHK_DOWNSCALE(int16_t,float,s8s8s16os8)
 
-inline bfloat16 mat_mul_accuracy_check_downscale_bf16bf16f32obf16
+inline float mat_mul_accuracy_check_downscale_bf16bf16f32obf16
      (
-       float temp_accum, 
-       bfloat16 out_temp_accum, 
-       aocl_post_op*  post_op, 
+       float temp_accum,
+       aocl_post_op*  post_op,
        dim_t j
      )
 {
-	float_to_bf16( ( &temp_accum ), ( &out_temp_accum ) );
-	return out_temp_accum;
+	return temp_accum;
 }
 
 #define GEN_MAT_MUL_ACC_CHK_ACCUM(A_type, B_type, C_type,ACCUM_type,BLAS_SFX) \
@@ -345,77 +415,167 @@ GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int16_t,int16_t,u8s8s16os16)
 GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int8_t,int32_t,u8s8s32os8)
 GEN_MAT_MUL_ACC_CHK_ACCUM(uint8_t,int8_t,int32_t,int32_t,u8s8s32os32)
 GEN_MAT_MUL_ACC_CHK_ACCUM(float,float,float,float,f32f32f32of32)
+GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int8_t,int32_t,s8s8s32os8)
+GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int32_t,int32_t,s8s8s32os32)
+GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int8_t,int16_t,s8s8s16os8)
+GEN_MAT_MUL_ACC_CHK_ACCUM(int8_t,int8_t,int16_t,int16_t,s8s8s16os16)
 
-inline float bf16_to_float 
-     (
-       bfloat16 bf16_val
-     )
-{
-	int32_t inter_temp = *( ( int16_t* ) &bf16_val );
-	inter_temp = inter_temp << 16; 
-	float float_value = *( float* ) ( &inter_temp );
-	return float_value;
-}
-	
 inline float mat_mul_accuracy_check_accum_bf16bf16f32of32
      (
-       bfloat16* a, 
-       bfloat16* b, 
-       float* c_ref, 
+       bfloat16* a,
+       bfloat16* b,
+       float* c_ref,
        float temp_accum,
-       float  alpha, 
-       float beta, 
-       dim_t rs_a, 
+       float  alpha,
+       float beta,
+       dim_t rs_a,
        dim_t rs_b,
-       dim_t cs_a, 
-       dim_t cs_b, 
+       dim_t cs_a,
+       dim_t cs_b,
        dim_t rs_c_ref,
-       dim_t cs_c_ref, 
-       dim_t i, 
-       dim_t j, 
-       dim_t k 
+       dim_t cs_c_ref,
+       dim_t i,
+       dim_t j,
+       dim_t k
      )
 {
-	for ( dim_t p = 0; p < k; ++p) 
-	{ 
-		float a_float = bf16_to_float( *( a + i * rs_a + p * cs_a ) ); 
-		float b_float = bf16_to_float( *( b + p * rs_b + j * cs_b ) ); 
-		temp_accum += ( ( a_float ) * ( b_float ) ); 
-	} 
-	temp_accum = ( beta * ( * (c_ref + ( rs_c_ref * i ) + ( cs_c_ref * j ) ) ) ) 
-			             + ( alpha * temp_accum ); 
-	return temp_accum; 					 
+	for ( dim_t p = 0; p < k; ++p)
+	{
+		float a_float = bf16_to_float( *( a + i * rs_a + p * cs_a ) );
+		float b_float = bf16_to_float( *( b + p * rs_b + j * cs_b ) );
+		temp_accum += ( ( a_float ) * ( b_float ) );
+	}
+	temp_accum = ( beta * ( * (c_ref + ( rs_c_ref * i ) + ( cs_c_ref * j ) ) ) )
+			             + ( alpha * temp_accum );
+	return temp_accum;
 }
 
 inline float mat_mul_accuracy_check_accum_bf16bf16f32obf16
      (
-       bfloat16* a, 
-       bfloat16* b, 
-       bfloat16* c_ref, 
+       bfloat16* a,
+       bfloat16* b,
+       bfloat16* c_ref,
        float temp_accum,
-       float  alpha, 
-       float beta, 
-       dim_t rs_a, 
+       float  alpha,
+       float beta,
+       dim_t rs_a,
        dim_t rs_b,
-       dim_t cs_a, 
-       dim_t cs_b, 
+       dim_t cs_a,
+       dim_t cs_b,
        dim_t rs_c_ref,
-       dim_t cs_c_ref,  
-       dim_t i, 
-       dim_t j, 
-       dim_t k  
+       dim_t cs_c_ref,
+       dim_t i,
+       dim_t j,
+       dim_t k
      )
 {
-	for ( dim_t p = 0; p < k; ++p) 
-	{ 
-		float a_float = bf16_to_float( *( a + i*rs_a + p*cs_a ) ); 
-		float b_float = bf16_to_float( *( b + p*rs_b + j*cs_b ) ); 
-		temp_accum += ( ( a_float ) * ( b_float ) ); 
-	}  
-	float c_ref_float = bf16_to_float( *( c_ref + i*rs_c_ref + j*cs_c_ref ) ); 
-	temp_accum = ( beta * ( c_ref_float ) ) + ( alpha * temp_accum ); 
-
-	return temp_accum; 
+	for ( dim_t p = 0; p < k; ++p)
+	{
+		float a_float = bf16_to_float( *( a + i*rs_a + p*cs_a ) );
+		float b_float = bf16_to_float( *( b + p*rs_b + j*cs_b ) );
+		temp_accum += ( ( a_float ) * ( b_float ) );
+	}
+	float c_ref_float = bf16_to_float( *( c_ref + i*rs_c_ref + j*cs_c_ref ) );
+	temp_accum = ( beta * ( c_ref_float ) ) + ( alpha * temp_accum );
+
+	return temp_accum;
+}
+
+#define GEN_GELU_TANH_POSTOP_INT(ACCUM_type,BLAS_SFX) \
+inline ACCUM_type GELU_TANH_post_op_ ## BLAS_SFX \
+     (\
+       ACCUM_type temp_accum \
+     )\
+{ /* reference tanh-form GELU: 0.5*x*(1+tanhf(0.797884*(x+0.044715*x^3))), result held in a float */ \
+	float gelu_reference = 0.5 *(double)temp_accum * (1 + tanhf( 0.797884 * ( (double)temp_accum + \
+					( 0.044715 * ((double)temp_accum * (double)temp_accum * \
+					(double)temp_accum ) ) ) ) ); \
+	temp_accum = round (gelu_reference); /* round to nearest, then convert back to ACCUM_type */ \
+	return temp_accum; \
+}\
+
+GEN_GELU_TANH_POSTOP_INT(int16_t,u8s8s16os8) /* one GELU_TANH_post_op_<suffix> per integer-accumulator suffix */
+GEN_GELU_TANH_POSTOP_INT(int16_t,u8s8s16os16)
+GEN_GELU_TANH_POSTOP_INT(int32_t,u8s8s32os8)
+GEN_GELU_TANH_POSTOP_INT(int32_t,u8s8s32os32)
+GEN_GELU_TANH_POSTOP_INT(int32_t,s8s8s32os8)
+GEN_GELU_TANH_POSTOP_INT(int32_t,s8s8s32os32)
+GEN_GELU_TANH_POSTOP_INT(int16_t,s8s8s16os8)
+GEN_GELU_TANH_POSTOP_INT(int16_t,s8s8s16os16)
+
+#define GEN_GELU_TANH_POSTOP_FLOAT(BLAS_SFX) \
+inline float GELU_TANH_post_op_ ## BLAS_SFX \
+     (\
+       float temp_accum \
+     )\
+{ /* same tanh-form GELU as the integer variant, but the float result is returned without rounding */ \
+	temp_accum = 0.5 *(double)temp_accum * (1 + tanhf( 0.797884 * ( (double)temp_accum + \
+	              ( 0.044715 * ((double)temp_accum * (double)temp_accum * \
+				  (double)temp_accum ) ) ) ) ); \
+	return temp_accum; \
+}\
+
+GEN_GELU_TANH_POSTOP_FLOAT(f32f32f32of32) /* one GELU_TANH_post_op_<suffix> per float-accumulator suffix */
+GEN_GELU_TANH_POSTOP_FLOAT(bf16bf16f32of32)
+GEN_GELU_TANH_POSTOP_FLOAT(bf16bf16f32obf16)
+
+#define GEN_GELU_ERF_POSTOP_INT(ACCUM_type,BLAS_SFX) \
+inline ACCUM_type GELU_ERF_post_op_ ## BLAS_SFX \
+     (\
+       ACCUM_type temp_accum \
+     )\
+{ /* reference erf-form GELU: 0.5*x*(1+erff(x*0.707107)), result held in a float */ \
+	float gelu_reference = 0.5 *(double)temp_accum * (1 + erff( (double)temp_accum * 0.707107 )); \
+	temp_accum = round (gelu_reference); /* round to nearest, then convert back to ACCUM_type */ \
+	return temp_accum; \
+}\
+
+GEN_GELU_ERF_POSTOP_INT(int16_t,u8s8s16os8) /* one GELU_ERF_post_op_<suffix> per integer-accumulator suffix */
+GEN_GELU_ERF_POSTOP_INT(int16_t,u8s8s16os16)
+GEN_GELU_ERF_POSTOP_INT(int32_t,u8s8s32os8)
+GEN_GELU_ERF_POSTOP_INT(int32_t,u8s8s32os32)
+GEN_GELU_ERF_POSTOP_INT(int32_t,s8s8s32os8)
+GEN_GELU_ERF_POSTOP_INT(int32_t,s8s8s32os32)
+GEN_GELU_ERF_POSTOP_INT(int16_t,s8s8s16os8)
+GEN_GELU_ERF_POSTOP_INT(int16_t,s8s8s16os16)
+
+#define GEN_GELU_ERF_POSTOP_FLOAT(BLAS_SFX) \
+inline float GELU_ERF_post_op_ ## BLAS_SFX \
+     (\
+       float temp_accum \
+     )\
+{ /* same erf-form GELU as the integer variant, but the float result is returned without rounding */ \
+	temp_accum = 0.5 *(double)temp_accum * (1 + erff( (double)temp_accum * 0.707107 )); \
+	return temp_accum; \
+}\
+
+GEN_GELU_ERF_POSTOP_FLOAT(f32f32f32of32) /* one GELU_ERF_post_op_<suffix> per float-accumulator suffix */
+GEN_GELU_ERF_POSTOP_FLOAT(bf16bf16f32of32)
+GEN_GELU_ERF_POSTOP_FLOAT(bf16bf16f32obf16)
+
+#define GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(C_type, ACCUM_type) \
+void mat_mul_get_output_type_val ## ACCUM_type ## C_type \
+     ( \
+       C_type* out_temp_accum, \
+       ACCUM_type* temp_accum \
+     ) \
+{ /* stores *temp_accum into *out_temp_accum through a plain C cast to C_type */ \
+	( *out_temp_accum ) = ( C_type )( *temp_accum ); \
+} \
+
+GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int32_t,int32_t) /* generated name is mat_mul_get_output_type_val<ACCUM_type><C_type> */
+GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int8_t,int32_t)
+GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int16_t,int16_t)
+GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(int8_t,int16_t)
+GEN_MAT_MUL_GET_OUTPUT_TYPE_VALUE(float,float)
+
+void mat_mul_get_output_type_valfloatbfloat16 /* float-accumulator / bfloat16-output variant, written by hand rather than via the generator macro */
+     (
+       bfloat16* out_temp_accum,
+       float* temp_accum
+     )
+{
+	float_to_bf16( temp_accum, out_temp_accum ); /* converts through float_to_bf16 instead of a C cast */
+}
 
 #define GEN_MAT_MUL_ACC_CHK_DRV_FUNC(A_type,B_type,C_type,ACCUM_type,SCALE_type,BLAS_SFX,BLAS_DOWNSCALE_SFX) \
@@ -472,59 +632,76 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \
 \
 			if ( post_op != NULL ) \
 			{ \
-				/* Apply bias followed by relu. */ \
-				if ( post_op->seq_vector[0] == BIAS ) \
+				dim_t ele_i = 0; \
+				for ( dim_t op_id = 0; op_id < post_op->seq_length; ++op_id ) \
 				{ \
-					if ( post_op->seq_length >= 1 ) \
+					if ( post_op->seq_vector[op_id] == BIAS ) \
 					{ \
 						temp_accum += ( *( ( ACCUM_type* )post_op->bias.bias + j ) ); \
 					} \
-					if ( ( post_op->seq_length > 1 ) && \
-						 ( post_op->seq_vector[1] == ELTWISE ) ) \
+					else if ( post_op->seq_vector[op_id] == ELTWISE ) \
 					{ \
-						if ( post_op->eltwise.algo.alpha != NULL ) /* PReLU*/ \
+						if ( ( post_op->eltwise + ele_i )->algo.algo_type == \
+								PRELU ) /* PReLU*/ \
 						{ \
 							temp_accum = ( temp_accum > 0 ) ? \
 								temp_accum : \
 								( temp_accum * \
-								*( ( ACCUM_type* ) post_op->eltwise.algo.alpha ) ); \
+								*( ( ACCUM_type* ) ( post_op->eltwise + ele_i )->algo.alpha ) ); \
+							ele_i += 1; \
 						} \
-						else \
+						else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \
+								GELU_TANH ) /* TANH GeLU*/ \
 						{ \
-							temp_accum = ( temp_accum > 0 ) ? temp_accum : 0 ; \
+							temp_accum = GEN_FUNC_NAME(GELU_TANH_post_op_,BLAS_SFX) (temp_accum);\
+							ele_i += 1; \
 						} \
-					} \
-				} \
-				else if ( post_op->seq_vector[0] == ELTWISE ) \
-				{ \
-					if ( post_op->seq_length >= 1 ) \
-					{ \
-						if ( post_op->eltwise.algo.alpha != NULL ) /* PReLU*/ \
+						else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \
+								GELU_ERF ) /* ERF GeLU*/ \
 						{ \
-							temp_accum = ( temp_accum > 0 ) ? \
-									temp_accum : \
-									( temp_accum * *( ( ACCUM_type* ) post_op->eltwise.algo.alpha ) ); \
+							temp_accum = GEN_FUNC_NAME(GELU_ERF_post_op_,BLAS_SFX) (temp_accum);\
+							ele_i += 1; \
 						} \
-						else \
+						else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \
+								RELU ) /* ReLU*/ \
 						{ \
 							temp_accum = ( temp_accum > 0 ) ? temp_accum : 0 ; \
+							ele_i += 1; \
+						} \
+						else if ( ( post_op->eltwise + ele_i )->algo.algo_type == \
+								CLIP ) /* CLIP*/ \
+						{ \
+							temp_accum = \
+								min \
+								( \
+								  max \
+								  ( \
+									temp_accum, \
+									*( ( ACCUM_type* ) \
+									   ( post_op->eltwise + ele_i )->algo.alpha ) \
+								  ), \
+								  *( ( ACCUM_type* ) \
+									 ( post_op->eltwise + ele_i )->algo.beta) \
+								); \
+							ele_i += 1; \
 						} \
+						else \
+						{} \
 					} \
-					if ( ( post_op->seq_length > 1 ) && ( post_op->seq_vector[1] == BIAS ) ) \
+					else if ( post_op->seq_vector[op_id] == SCALE ) \
 					{ \
-						temp_accum += ( *( ( ACCUM_type* )post_op->bias.bias + j ) ); \
+						temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_downscale_,BLAS_DOWNSCALE_SFX) \
+							(temp_accum, post_op, j); \
 					} \
+					else \
+					{} \
 				} \
 			} \
-			if ( global_dscale_out == 'y' ) \
-			{ \
-				out_temp_accum = GEN_FUNC_NAME(mat_mul_accuracy_check_downscale_,BLAS_DOWNSCALE_SFX) \
-				        (temp_accum, out_temp_accum, post_op, j); \
-			} \
-			else \
-			{ \
-				out_temp_accum = ( C_type )temp_accum; \
-			} \
+			/* Need to convert to downscaled type if required.*/ \
+			mat_mul_get_output_type_val ## ACCUM_type ## C_type \
+			( \
+			  &out_temp_accum, &temp_accum \
+			); \
  \
 			if ( *( c + ( rs_c * i ) + ( cs_c * j ) ) != out_temp_accum ) \
 			{ \
@@ -535,7 +712,7 @@ void mat_mul_accuracy_check_driver_ ## BLAS_SFX \
 									XSTR(BLAS_SFX), m, n, k, lda, ldb, ldc ); \
 					fflush( fout ); \
 				} \
-				printf("failure, m: %ld, n: %ld, k: %ld\n", i, j, k ); \
+				printf("failure, m: %ld, n: %ld, k: %ld\n", i, j, k); \
 				goto cleanup_acc; \
 			} \
 		} \
@@ -550,7 +727,11 @@ GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int32_t,int32_t,float,u8s8s32os32,u8
 GEN_MAT_MUL_ACC_CHK_DRV_FUNC(uint8_t,int8_t,int8_t,int32_t,float,u8s8s32os8,u8s8s32os8)
 GEN_MAT_MUL_ACC_CHK_DRV_FUNC(bfloat16,bfloat16,float,float,float,bf16bf16f32of32,bf16bf16f32obf16)
 GEN_MAT_MUL_ACC_CHK_DRV_FUNC(bfloat16,bfloat16,bfloat16,float,float,bf16bf16f32obf16,bf16bf16f32obf16)
-GEN_MAT_MUL_ACC_CHK_DRV_FUNC(float,float,float,float,float,f32f32f32of32,bf16bf16f32obf16) 
+GEN_MAT_MUL_ACC_CHK_DRV_FUNC(float,float,float,float,float,f32f32f32of32,bf16bf16f32obf16)
+GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int32_t,int32_t,float,s8s8s32os32,s8s8s32os8)
+GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int8_t,int32_t,float,s8s8s32os8,s8s8s32os8)
+GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int16_t,int16_t,float,s8s8s16os16,s8s8s16os8)
+GEN_MAT_MUL_ACC_CHK_DRV_FUNC(int8_t,int8_t,int8_t,int16_t,float,s8s8s16os8,s8s8s16os8)
 
 /* Only supports bias followed by RELU and vice versa for now.*/ \
 #define GEN_MAT_MUL_POST_OPS_CREATOR(C_type,DSCALE_type,BLAS_SFX) \
@@ -569,8 +750,8 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \
 		return NULL; \
 	} \
  \
-	/* Only supporting 3 post ops at max for now.*/ \
-	dim_t max_post_ops_seq_length = 3; \
+	/* Only supporting 5 post ops at max for now.*/ \
+	dim_t max_post_ops_seq_length = 5; \
 	post_ops->seq_vector = ( AOCL_POST_OP_TYPE* ) \
 							malloc \
 							( \
@@ -587,30 +768,79 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \
 	/* Parse post ops list.*/ \
 	dim_t cur_op_index = 0; \
 	/* Ensure the buffers that use NULL check in deinit code is properly set to NULL.*/ \
-	post_ops->eltwise.algo.alpha = NULL; \
+	post_ops->eltwise = NULL; \
 	post_ops->bias.bias = NULL; \
 	post_ops->sum.scale_factor = NULL; \
 	if ( post_ops_str != NULL ) \
 	{ \
 		char* ops_tok = strtok(post_ops_str, ", " ); \
+		bool is_relu = FALSE; \
 		bool is_param_relu = FALSE; \
+		bool is_gelu_tanh = FALSE; \
+		bool is_gelu_erf = FALSE; \
+		bool is_clip = FALSE; \
+		dim_t activator_idx = 0; \
+		dim_t clip_idx = 0; \
+ \
+		/* Ensure only one activator is used as an eltwise post-op.*/ \
+		bool is_activator_set = FALSE; \
+		num_eltwise = 0; \
 		while ( ops_tok ) \
 		{ \
 			if ( strcmp( ops_tok, "bias") == 0 ) \
 			{ \
 				post_ops->seq_vector[cur_op_index] = BIAS; \
+				cur_op_index++; \
 			} \
-			else if ( strcmp( ops_tok, "relu") == 0 ) \
+			else if ( ( strcmp( ops_tok, "relu") == 0 ) && \
+					  ( is_activator_set == FALSE ) ) \
 			{ \
 				post_ops->seq_vector[cur_op_index] = ELTWISE; \
+				is_relu = TRUE; \
+				is_activator_set = TRUE; \
+				num_eltwise += 1; \
+				activator_idx = cur_op_index; \
+				cur_op_index++; \
 			} \
-			else if ( strcmp( ops_tok, "prelu") == 0 ) \
+			else if ( ( strcmp( ops_tok, "prelu") == 0 ) && \
+					  ( is_activator_set == FALSE ) ) \
 			{ \
 				post_ops->seq_vector[cur_op_index] = ELTWISE; \
 				is_param_relu = TRUE; \
+				is_activator_set = TRUE; \
+				num_eltwise += 1; \
+				activator_idx = cur_op_index; \
+				cur_op_index++; \
+			} \
+			else if ( ( strcmp( ops_tok, "gelu_tanh") == 0 ) && \
+					  ( is_activator_set == FALSE ) ) \
+			{ \
+				post_ops->seq_vector[cur_op_index] = ELTWISE; \
+				is_gelu_tanh = TRUE; \
+				is_activator_set = TRUE; \
+				num_eltwise += 1; \
+				activator_idx = cur_op_index; \
+				cur_op_index++; \
+			} \
+			else if ( ( strcmp( ops_tok, "gelu_erf") == 0 ) && \
+					  ( is_activator_set == FALSE ) ) \
+			{ \
+				post_ops->seq_vector[cur_op_index] = ELTWISE; \
+				is_gelu_erf = TRUE; \
+				is_activator_set = TRUE; \
+				num_eltwise += 1; \
+				activator_idx = cur_op_index; \
+				cur_op_index++; \
+			} \
+			else if ( strcmp( ops_tok, "clip") == 0 ) \
+			{ \
+				post_ops->seq_vector[cur_op_index] = ELTWISE; \
+				is_clip = TRUE; \
+				num_eltwise += 1; \
+				clip_idx = cur_op_index; \
+				cur_op_index++; \
 			} \
 			ops_tok = strtok( NULL, ", " ); \
-			cur_op_index++; \
 		} \
  \
 		/* Allocate bias buffer, return early if alloc fails.*/ \
@@ -623,17 +853,80 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \
 		} \
 		GEN_FUNC_NAME(fill_array_post_ops_,C_type)( post_ops->bias.bias, n ); \
  \
-		post_ops->eltwise.is_power_of_2 = FALSE; \
-		post_ops->eltwise.scale_factor = NULL; \
-		post_ops->eltwise.algo.alpha = NULL; \
-		post_ops->eltwise.algo.algo_type = RELU; \
-		if ( is_param_relu == TRUE ) \
+		post_ops->eltwise = malloc( num_eltwise * sizeof( aocl_post_op_eltwise ) ); \
+		if ( post_ops->eltwise == NULL ) \
+		{ \
+			free( post_ops->bias.bias ); \
+			free( post_ops->seq_vector ); \
+			free( post_ops ); \
+			return NULL; \
+		} \
+ \
+		if ( num_eltwise > 0 ) \
+		{ \
+			if ( num_eltwise > 1 ) \
+			{ \
+				if ( activator_idx < clip_idx ) \
+				{ \
+					activator_idx = 0; \
+					clip_idx = 1; \
+				} \
+				else \
+				{ \
+					activator_idx = 1; \
+					clip_idx = 0; \
+				} \
+			} \
+			else \
+			{ \
+			   activator_idx = 0; \
+			   clip_idx = 0; \
+			} \
+		} \
+		/* Only one of relu,prelu,gelu_tanh,gelu_erf allowed as an activator.*/ \
+		if ( is_relu == TRUE ) \
+		{ \
+			( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \
+			( post_ops->eltwise + activator_idx )->scale_factor = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.beta = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.algo_type = RELU; \
+		} \
+		else if ( is_param_relu == TRUE ) \
+		{ \
+			( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \
+			( post_ops->eltwise + activator_idx )->scale_factor = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.beta = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.alpha = malloc( sizeof( C_type ) ); \
+			*( ( C_type* ) ( post_ops->eltwise + activator_idx )->algo.alpha ) = ( C_type )6; \
+			( post_ops->eltwise + activator_idx )->algo.algo_type = PRELU; \
+		} \
+		else if ( is_gelu_tanh == TRUE ) \
+		{ \
+			( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \
+			( post_ops->eltwise + activator_idx )->scale_factor = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.beta = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.algo_type = GELU_TANH; \
+		} \
+		else if ( is_gelu_erf == TRUE ) \
+		{ \
+			( post_ops->eltwise + activator_idx )->is_power_of_2 = FALSE; \
+			( post_ops->eltwise + activator_idx )->scale_factor = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.alpha = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.beta = NULL; \
+			( post_ops->eltwise + activator_idx )->algo.algo_type = GELU_ERF; \
+		} \
+		if ( is_clip == TRUE ) \
 		{ \
-			post_ops->eltwise.algo.alpha = malloc( sizeof( C_type ) ); \
-			*( ( C_type* ) post_ops->eltwise.algo.alpha ) = ( C_type )6; \
-			post_ops->eltwise.algo.algo_type = PRELU; \
+			( post_ops->eltwise + clip_idx )->is_power_of_2 = FALSE; \
+			( post_ops->eltwise + clip_idx )->scale_factor = NULL; \
+			( post_ops->eltwise + clip_idx )->algo.alpha = malloc( sizeof( C_type ) ); \
+			( post_ops->eltwise + clip_idx )->algo.beta = malloc( sizeof( C_type ) ); \
+			*( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.alpha ) = ( C_type ) ( -64 ); \
+			*( ( C_type* ) ( post_ops->eltwise + clip_idx )->algo.beta ) = ( C_type ) ( 3 ); \
+			( post_ops->eltwise + clip_idx )->algo.algo_type = CLIP; \
 		} \
-		post_ops->eltwise.algo.beta = NULL; \
 	} \
  \
 	if ( global_dscale_out == 'y' ) \
@@ -651,6 +944,7 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \
 			post_ops->sum.scale_factor = malloc( n * sizeof( DSCALE_type ) ); \
 			if ( post_ops->sum.scale_factor == NULL ) \
 			{ \
+				free ( post_ops->eltwise ); \
 				free ( post_ops->bias.bias ); \
 				free( post_ops->seq_vector ); \
 				free( post_ops ); \
@@ -672,8 +966,10 @@ aocl_post_op* lpgemm_create_post_ops_struct_ ## BLAS_SFX \
 
 GEN_MAT_MUL_POST_OPS_CREATOR(int16_t,float,u8s8s16os16)
 GEN_MAT_MUL_POST_OPS_CREATOR(int32_t,float,u8s8s32os32)
-GEN_MAT_MUL_POST_OPS_CREATOR(float,float,bf16bf16f32of32) 
+GEN_MAT_MUL_POST_OPS_CREATOR(float,float,bf16bf16f32of32)
 GEN_MAT_MUL_POST_OPS_CREATOR(float,float,f32f32f32of32)
+GEN_MAT_MUL_POST_OPS_CREATOR(int32_t,float,s8s8s32os32)
+GEN_MAT_MUL_POST_OPS_CREATOR(int16_t,float,s8s8s16os16)
 
 void lpgemm_destroy_post_ops_struct( aocl_post_op* post_ops )
 {
@@ -682,9 +978,20 @@ void lpgemm_destroy_post_ops_struct( aocl_post_op* post_ops )
 		return;
 	}
 
-	if ( post_ops->eltwise.algo.alpha != NULL )
+	if ( post_ops->eltwise != NULL )
 	{
-		free( post_ops->eltwise.algo.alpha );
+		for ( dim_t i = 0; i < num_eltwise; ++i )
+		{
+			if ( ( post_ops->eltwise + i )->algo.alpha != NULL )
+			{
+				free( ( post_ops->eltwise + i )->algo.alpha );
+			}
+			if ( ( post_ops->eltwise + i )->algo.beta != NULL )
+			{
+				free( ( post_ops->eltwise + i )->algo.beta );
+			}
+		}
+		free( post_ops->eltwise );
 	}
 	if ( post_ops->sum.scale_factor != NULL )
 	{
@@ -740,6 +1047,15 @@ void mat_mul_bench_main_ ## BLAS_SFX \
  \
 	C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n ); \
 	memset( ( void* ) c_ref, 0, sizeof( C_type ) * m * n ); \
+ \
+	GEN_FUNC_NAME(fill_array_,A_type)( a, ( m * k ) ); \
+	GEN_FUNC_NAME(fill_array_,B_type)( b, ( k * n ) ); \
+ \
+	if ( bench_mode == 'a' ) \
+	{ \
+		GEN_FUNC_NAME(fill_array_,C_type)( c, ( m * n ) ); \
+		GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( m * n ) ); \
+	} \
  \
 	C_type alpha; \
 	C_type beta; \
@@ -753,9 +1069,6 @@ void mat_mul_bench_main_ ## BLAS_SFX \
 		alpha = 2; \
 		beta = 9; \
 	} \
- \
-	GEN_FUNC_NAME(fill_array_,A_type)( a, ( m * k ) ); \
-	GEN_FUNC_NAME(fill_array_,B_type)( b, ( k * n ) ); \
  \
 	aocl_post_op* post_op = NULL; \
 	if ( ( post_ops_str != NULL ) || ( global_dscale_out == 'y' ) ) \
@@ -846,6 +1159,10 @@ GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,u8s8s16os8,u8s8s16os16)
 GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int32_t,u8s8s32os32,u8s8s32os32)
 GEN_MAT_MUL_BENCH_MAIN_FUNC(uint8_t,int8_t,int8_t,u8s8s32os8,u8s8s32os32)
 GEN_MAT_MUL_BENCH_MAIN_FUNC(float,float,float,f32f32f32of32,f32f32f32of32)
+GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int32_t,s8s8s32os32,s8s8s32os32)
+GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int8_t,s8s8s32os8,s8s8s32os32)
+GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int16_t,s8s8s16os16,s8s8s16os16)
+GEN_MAT_MUL_BENCH_MAIN_FUNC(int8_t,int8_t,int8_t,s8s8s16os8,s8s8s16os16)
 
 #define GEN_MAT_MUL_BENCH_MAIN_FUNC_BF16(C_type, BLAS_SFX) \
 void mat_mul_bench_main_ ## BLAS_SFX \
@@ -897,6 +1214,12 @@ void mat_mul_bench_main_ ## BLAS_SFX \
  \
 	C_type* c_ref = ( C_type* ) bli_malloc_user( sizeof( C_type ) * m * n ); \
 	memset( ( void* ) c_ref, 0, sizeof( C_type ) * m * n ); \
+ \
+	if ( bench_mode == 'a' ) \
+	{ \
+		GEN_FUNC_NAME(fill_array_,C_type)( c, ( m * n ) ); \
+		GEN_FUNC_NAME(fill_array_,C_type)( c_ref, ( m * n ) ); \
+	} \
  \
 	float alpha; \
 	float beta; \
@@ -945,7 +1268,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \
 		bfloat16* b_reorder = ( bfloat16* ) bli_malloc_user( b_reorder_buf_siz_req ); \
 			aocl_reorder_bf16bf16f32of32( 'B', b, b_reorder, k, n, stride_b ); \
  \
-		GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \
+ 		GEN_FUNC_NAME(mat_mul_bench_driver_,BLAS_SFX) \
 		( \
 		  stor_order, op_t, n_repeats, m, n, k, \
 		  alpha, \
@@ -957,7 +1280,7 @@ void mat_mul_bench_main_ ## BLAS_SFX \
 		); \
 	} \
  \
-if ( bench_mode == 'a' ) \
+	if ( bench_mode == 'a' ) \
 	{ \
 		printf(" Running accuracy check.\n"); \
 		GEN_FUNC_NAME(mat_mul_accuracy_check_driver_,BLAS_SFX) \
@@ -1009,16 +1332,36 @@ int main( int argc, char** argv )
 	FILE* fin  = NULL;
 	if ( argc < 5 )
 	{
-		printf( "Usage: ./mat_mul -i input.txt -m mode < -n 1000 -o op1,op2.. >" \
-						"\nMode is either a or p. a is used for accuracy test, " \
-						"whereas p is used for performance benchmarking." \
-						"\nn_repeats can be set optionally using -n arg." \
-						"\nPost ops can be executed optionaly by providing a " \
-						"coma separated list of ops after -o arg.\nCurrently " \
-						"bias and relu/prelu is supported and can be specified " \
-			 			"as a single post op or combination of the same. eg: -o bias,relu ; -o prelu." \
-						"\nDownscaled version of an API can be enabled by using -d arg. " \
-						"downscale is used to enable- u8s8s32os8, u8s8s16os8 or bf16bf16f32obf16 \n" );
+		printf
+		(
+		  "Usage: ./bench_lpgemm -i input.txt -m mode < -n 100 -o op1,op2 >\n" \
+		  "--Mode is either a or p.\n" \
+		  "\ta is used for accuracy testing.\n" \
+		  "\tp is used for performance benchmarking.\n" \
+		  "--n_repeats can be set optionally using -n arg.\n" \
+		  "--Post ops can be executed optionaly by providing a coma separated\n" \
+		  "  list of post-ops after -o arg. Following post-ops are supported:\n" \
+		  "    1. bias\n" \
+		  "    2. 4 activators\n" \
+		  "      a. relu\n" \
+		  "      b. prelu\n" \
+		  "      c. gelu_tanh\n" \
+		  "      d. gelu_erf\n" \
+		  "    3.clip\n" \
+		  "  Atleast one post-op needs to be specified if the -o arg is used.\n" \
+		  "  eg: -o gelu_tanh; -o bias,relu ; -o clip,prelu,bias.\n" \
+		  "  It is to be noted only one activator can be used at a time.\n" \
+		  "  If more than one activator is used, only the first activator is\n" \
+		  "  applied and the other activators are ignored.\n" \
+		  "--Downscaled version of an API is enabled by using -d arg.\n" \
+		  "  Downscaled api's are used to enable quantization workflows.\n" \
+		  "  Following downscaled api's are supported:\n" \
+		  "    1. u8s8s32os32 -d = u8s8s32os8.\n" \
+		  "    2. u8s8s16os16 -d = u8s8s16os8.\n" \
+		  "    3. bf16bf16f32obf32 -d = bf16bf16f32obf16.\n" \
+		  "    4. s8s8s32os32 -d = s8s8s32os8.\n" \
+		  "    5. s8s8s16os16 -d = s8s8s16os8.\n" \
+		);
 		exit( 1 );
 	}
 
@@ -1055,7 +1398,9 @@ int main( int argc, char** argv )
 
 	if ( post_ops_str != NULL )
 	{
-		post_ops_str_dest = strdup( post_ops_str );
+		post_ops_str_dest = ( char* )malloc \
+				( ( strlen( post_ops_str) + 1 )* sizeof( char ) );
+		strcpy( post_ops_str_dest, post_ops_str );
 	}
 
 	if ( bench_mode == 'p' )
@@ -1081,9 +1426,9 @@ int main( int argc, char** argv )
 	}
 
 	FILE* fout = NULL;
-	
+
 	fout = fopen( "lpgemm_accuracy_test_failures.txt", "w" );
-	
+
 	char op_type_char;
 	char op_t;
 	char stor_order;
@@ -1110,7 +1455,7 @@ int main( int argc, char** argv )
 #ifdef BLIS_ENABLE_OPENMP
 			omp_set_num_threads( list_omp_cores_for_testing[core_index] );
 #endif
-			printf( "Accuracy test using %ld threads.\n", 
+			printf( "Accuracy test using %ld threads.\n",
 							list_omp_cores_for_testing[core_index] );
 
 			core_index++;
@@ -1160,7 +1505,7 @@ int main( int argc, char** argv )
 				(
 				  fin, fout, stor_order, op_t,
 				  m, n, k, stride_a, stride_b, stride_c,
-				  NULL
+				  post_ops_str_dest
 				);
 			}
 			else if ((op_type_char == 's') || (op_type_char == 'S'))
@@ -1184,7 +1529,7 @@ int main( int argc, char** argv )
 					);
 				}
 			}
-			if ((op_type_char == 'b') || (op_type_char == 'B'))
+			else if ((op_type_char == 'b') || (op_type_char == 'B'))
 			{
 				if ( global_dscale_out == 'n' )
 				{
@@ -1203,7 +1548,49 @@ int main( int argc, char** argv )
 						m, n, k, stride_a, stride_b, stride_c,
 						post_ops_str_dest
 					);
-				}	
+				}
+			}
+			else if ( ( op_type_char == 'u' ) || ( op_type_char == 'U' ) )
+			{
+				if ( global_dscale_out == 'n' )
+				{
+					GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os32)
+					(
+					  fin, fout, stor_order, op_t,
+					  m, n, k, stride_a, stride_b, stride_c,
+					  post_ops_str_dest
+					);
+				}
+				else
+				{
+					GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s32os8)
+					(
+					  fin, fout, stor_order, op_t,
+					  m, n, k, stride_a, stride_b, stride_c,
+					  post_ops_str_dest
+					);
+				}
+			}
+			else if ( ( op_type_char == 'v' ) || ( op_type_char == 'V' ) )
+			{
+				if ( global_dscale_out == 'n' )
+				{
+					GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os16)
+					(
+					  fin, fout, stor_order, op_t,
+					  m, n, k, stride_a, stride_b, stride_c,
+					  post_ops_str_dest
+					);
+				}
+				else
+				{
+					GEN_FUNC_NAME(mat_mul_bench_main_,s8s8s16os8)
+					(
+					  fin, fout, stor_order, op_t,
+					  m, n, k, stride_a, stride_b, stride_c,
+					  post_ops_str_dest
+					);
+				}
 			}
 			if ( post_ops_str != NULL )
 			{
diff --git a/bench/bench_aocl_gemm/bench_lpgemm_utils.c b/bench/bench_aocl_gemm/bench_lpgemm_utils.c
new file mode 100644
index 0000000000..dbbdce6703
--- /dev/null
+++ b/bench/bench_aocl_gemm/bench_lpgemm_utils.c
@@ -0,0 +1,392 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <float.h>
+#include <unistd.h>
+#include <math.h>
+
+#include "blis.h"
+
+// Mode can be one of the following:
+// 	1. p - performance, used for benchmarks.
+// 	2. a - accuracy, used to test accuracy/correctness.
+// Default value is p, can be modified by passing command line arg.
+char bench_mode = 'p';
+
+int32_t global_n_repeat = 0; /* Repeat-count override; values <= 0 leave each bench's own default in place. */
+
+#define _XSTR(str) #str /* Stringifies its argument exactly as written. */
+#define XSTR(str) _XSTR(str) /* Lets str be macro-expanded first, then stringifies the result. */
+
+#define GEN_FUNC_NAME(prototype,ctype) prototype ## ctype /* Pastes a name prefix and a suffix into one identifier. */
+
+/* Generates fill_array_<ctype>: writes the repeating pattern 0,1,...,9 into the first size elements of arr. */
+#define GEN_FILL_ARRAY_FUNC(ctype) \
+void fill_array_ ## ctype ( void* arr, dim_t size ) \
+{ \
+	ctype* dst = ( ctype* ) arr; \
+	dim_t pos = 0; \
+	while ( pos < size ) \
+		{ dst[pos] = ( ctype )( pos % 10 ); ++pos; } \
+}
+
+GEN_FILL_ARRAY_FUNC(float)
+
+/* Prints one summary line: label, n, incx, runtime in seconds and the repeat count. */
+void print_result
+     (
+       const char* msg,
+       int32_t     n_repeats,
+       dim_t       n,
+       dim_t       incx,
+       double      runtime
+     )
+{
+	fprintf( stdout, "%s n: %ld, incx: %ld, runtime: %f s, n_repeats: %d\n", msg, n, incx, runtime, n_repeats );
+}
+
+/* Generates gelu_bench_driver_<GELU_SFX>: times n_repeats calls of aocl_<GELU_SFX> and prints the fastest. */
+#define GEN_GELU_BENCH_DRV_FN(V_type,GELU_SFX) \
+void gelu_bench_driver_ ## GELU_SFX \
+     ( \
+       int32_t n_repeats, \
+       dim_t   n, \
+       V_type* x, \
+       inc_t   incx \
+     ) \
+{ \
+	double best_time = DBL_MAX; \
+	int32_t iter = 0; \
+	while ( iter < n_repeats ) \
+	{ \
+		struct timespec t_begin = {0,0}; \
+		struct timespec t_end = {0,0}; \
+		clock_gettime( CLOCK_MONOTONIC, &t_begin ); \
+ \
+		/* Accuracy mode: refill x with the fill_array pattern before every call. */ \
+		if ( bench_mode == 'a' ) \
+			GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx) ); \
+ \
+		GEN_FUNC_NAME(aocl_,GELU_SFX)( n, x, incx ); \
+ \
+		clock_gettime( CLOCK_MONOTONIC, &t_end ); \
+ \
+		const double begin_s = ( double ) t_begin.tv_sec + ( 1.0e-9 * t_begin.tv_nsec ); \
+		const double end_s = ( double ) t_end.tv_sec + ( 1.0e-9 * t_end.tv_nsec ); \
+		const double elapsed = end_s - begin_s; \
+		if ( elapsed < best_time ) best_time = elapsed; \
+		++iter; \
+	} \
+ \
+	print_result( XSTR(GELU_SFX), n_repeats, n, incx, best_time ); \
+}
+
+GEN_GELU_BENCH_DRV_FN(float,gelu_tanh_f32)
+GEN_GELU_BENCH_DRV_FN(float,gelu_erf_f32)
+
+/* Generates softmax_bench_driver_<SOFTMAX_SFX>: times n_repeats calls of aocl_<SOFTMAX_SFX> and prints the fastest. */
+#define GEN_SOFTMAX_BENCH_DRV_FN(V_type,SOFTMAX_SFX) \
+void softmax_bench_driver_ ## SOFTMAX_SFX \
+     ( \
+       int32_t n_repeats, \
+       dim_t   n, \
+       V_type* x, \
+       inc_t   incx \
+     ) \
+{ \
+	double best_time = DBL_MAX; \
+	int32_t iter = 0; \
+	while ( iter < n_repeats ) \
+	{ \
+		struct timespec t_begin = {0,0}; \
+		struct timespec t_end = {0,0}; \
+		clock_gettime( CLOCK_MONOTONIC, &t_begin ); \
+ \
+		/* Accuracy mode: refill x with the fill_array pattern before every call. */ \
+		if ( bench_mode == 'a' ) \
+			GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx) ); \
+ \
+		GEN_FUNC_NAME(aocl_,SOFTMAX_SFX)( n, x, incx ); \
+ \
+		clock_gettime( CLOCK_MONOTONIC, &t_end ); \
+ \
+		const double begin_s = ( double ) t_begin.tv_sec + ( 1.0e-9 * t_begin.tv_nsec ); \
+		const double end_s = ( double ) t_end.tv_sec + ( 1.0e-9 * t_end.tv_nsec ); \
+		const double elapsed = end_s - begin_s; \
+		if ( elapsed < best_time ) best_time = elapsed; \
+		++iter; \
+	} \
+ \
+	print_result( XSTR(SOFTMAX_SFX), n_repeats, n, incx, best_time ); \
+}
+
+GEN_SOFTMAX_BENCH_DRV_FN(float,softmax_f32)
+
+/* Reference tanh-approximated GeLU: 0.5 * x * (1 + tanh(0.797884 * (x + 0.044715 * x^3))). */
+static inline float gelu_tanh_f32( float temp_accum )
+{
+	/* static: a bare C99 inline definition emits no external symbol, so a */
+	/* call that is not inlined (e.g. at -O0) would otherwise fail to link. */
+	const double x = ( double )temp_accum;
+	const float tanh_term = tanhf( 0.797884 * ( x + ( 0.044715 * ( x * x * x ) ) ) );
+	/* Same mixed float/double arithmetic as before; narrowed to float on return. */
+	return 0.5 * x * ( 1 + tanh_term );
+}
+
+/* Reference erf-based GeLU: 0.5 * x * (1 + erf(x * 0.707107)). */
+static inline float gelu_erf_f32( float temp_accum )
+{
+	/* static: a bare C99 inline definition provides no external symbol to */
+	/* link against when a call is not inlined. */
+	const double x = ( double )temp_accum;
+	return 0.5 * x * ( 1 + erff( x * 0.707107 ) );
+}
+
+/* Generates gelu_acc_check_<GELU_SFX>: compares each strided element of x with GELU_SFX() of */
+/* the matching ref_x element (exact comparison) and reports only the first mismatch. */
+#define GEN_GELU_ACC_CHK_FN(V_type,GELU_SFX) \
+void gelu_acc_check_ ## GELU_SFX \
+     ( \
+       FILE*   fout, \
+       dim_t n, \
+       V_type* x, \
+       V_type* ref_x, \
+       inc_t incx \
+     ) \
+{ \
+	for ( dim_t off = 0; off < ( n * incx ); off += incx ) \
+	{ \
+		const V_type expected = GELU_SFX( ref_x[off] ); \
+		if ( expected == x[off] ) continue; \
+ \
+		/* First mismatch: log it (the file is optional), print it, and stop. */ \
+		if ( fout != NULL ) \
+		{ \
+			fprintf( fout, "%s Failure input n: %ld, incx: %ld, idx: %ld \n", \
+							XSTR(GELU_SFX), n, incx, ( off / incx ) ); \
+			fflush( fout ); \
+		} \
+		printf("%s failure, n: %ld, incx: %ld, idx: %ld, ref: %f, calc: %f\n", \
+					XSTR(GELU_SFX), n, incx, ( off / incx ), expected, x[off]); \
+		return; \
+	} \
+}
+
+GEN_GELU_ACC_CHK_FN(float,gelu_tanh_f32)
+GEN_GELU_ACC_CHK_FN(float,gelu_erf_f32)
+
+/* Generates softmax_acc_check_<SOFTMAX_SFX>: checks x against a reference built from ref_x, first mismatch only. */
+#define GEN_SOFTMAX_ACC_CHK_FN(V_type,SOFTMAX_SFX) \
+void softmax_acc_check_ ## SOFTMAX_SFX \
+     ( \
+       FILE*   fout, \
+       dim_t n, \
+       V_type* x, \
+       V_type* ref_x, \
+       inc_t incx \
+     ) \
+{ \
+	/* Denominator: sum of expf() over the strided reference elements. */ \
+	double denom = 0.0; \
+	for ( dim_t off = 0; off < ( n * incx ); off += incx ) \
+		denom += ( double )expf( ref_x[off] ); \
+ \
+	for ( dim_t off = 0; off < ( n * incx ); off += incx ) \
+	{ \
+		/* NOTE(review): the numerator is the raw input, not expf(input) - confirm */ \
+		/* this matches what aocl_<SOFTMAX_SFX> is expected to produce. */ \
+		const V_type expected = ( V_type )( ( ( double )ref_x[off] ) / denom ); \
+		if ( expected == x[off] ) continue; \
+ \
+		if ( fout != NULL ) \
+		{ \
+			fprintf( fout, "%s Failure input n: %ld, incx: %ld, idx: %ld \n", \
+							XSTR(SOFTMAX_SFX), n, incx, ( off / incx ) ); \
+			fflush( fout ); \
+		} \
+		printf("%s failure, n: %ld, incx: %ld, idx: %ld, ref: %.10f, calc: %.10f\n", \
+					XSTR(SOFTMAX_SFX), n, incx, ( off / incx ), expected, x[off]); \
+		return; \
+	} \
+}
+
+GEN_SOFTMAX_ACC_CHK_FN(float,softmax_f32)
+
+/* Generates gelu_bench_main_<GELU_SFX>: builds the input and reference vectors, runs */
+/* the bench driver (plus the accuracy check in 'a' mode) and frees the vectors. */
+#define GEN_GELU_BENCH_MAIN_FN(V_type,GELU_SFX) \
+void gelu_bench_main_ ## GELU_SFX \
+    ( \
+       FILE*   fout, \
+       dim_t n, \
+       inc_t incx \
+     ) \
+{ \
+	/* 1000 repeats unless global_n_repeat holds a positive override. */ \
+	int32_t n_repeats = ( global_n_repeat > 0 ) ? global_n_repeat : 1000; \
+	V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
+	GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \
+	V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
+	GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \
+ \
+	GEN_FUNC_NAME(gelu_bench_driver_,GELU_SFX)(n_repeats,n,x,incx); \
+	if ( bench_mode == 'a' ) \
+	{ \
+		GEN_FUNC_NAME(gelu_acc_check_,GELU_SFX)(fout,n,x,ref_x,incx); \
+	} \
+ \
+	/* Release both vectors so a run over many input lines does not leak. */ \
+	bli_free_user( ref_x ); \
+	bli_free_user( x ); \
+}
+
+GEN_GELU_BENCH_MAIN_FN(float,gelu_tanh_f32)
+GEN_GELU_BENCH_MAIN_FN(float,gelu_erf_f32)
+
+/* Generates softmax_bench_main_<SOFTMAX_SFX>: builds the input and reference vectors, runs */
+/* the bench driver (plus the accuracy check in 'a' mode) and frees the vectors. */
+#define GEN_SOFTMAX_BENCH_MAIN_FN(V_type,SOFTMAX_SFX) \
+void softmax_bench_main_ ## SOFTMAX_SFX \
+    ( \
+       FILE*   fout, \
+       dim_t n, \
+       inc_t incx \
+     ) \
+{ \
+	/* 1000 repeats unless global_n_repeat holds a positive override. */ \
+	int32_t n_repeats = ( global_n_repeat > 0 ) ? global_n_repeat : 1000; \
+	V_type* x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
+	GEN_FUNC_NAME(fill_array_,V_type)( x, ( n * incx ) ); \
+	V_type* ref_x = ( V_type* ) bli_malloc_user( sizeof( V_type ) * n * incx ); \
+	GEN_FUNC_NAME(fill_array_,V_type)( ref_x, ( n * incx ) ); \
+ \
+	GEN_FUNC_NAME(softmax_bench_driver_,SOFTMAX_SFX)(n_repeats,n,x,incx); \
+	if ( bench_mode == 'a' ) \
+	{ \
+		GEN_FUNC_NAME(softmax_acc_check_,SOFTMAX_SFX)(fout,n,x,ref_x,incx); \
+	} \
+ \
+	/* Release both vectors so a run over many input lines does not leak. */ \
+	bli_free_user( ref_x ); \
+	bli_free_user( x ); \
+}
+
+GEN_SOFTMAX_BENCH_MAIN_FN(float,softmax_f32)
+
+/* Parses -i/-m/-n, then benchmarks every "<op> <n> <incx>" line read from the input file. */
+int main( int argc, char** argv )
+{
+	if ( argc < 5 )
+	{
+		printf( "Usage: ./bench_lpgemm_utils -i input.txt -m mode < -n 1000 >" \
+						"\nMode is either a or p. a is used for accuracy test, " \
+						"whereas p is used for performance benchmarking." \
+						"\nn_repeats can be set optionally using -n arg.\n" );
+		exit( 1 );
+	}
+
+	char* file_name = NULL;
+
+	// Parse CLI arguments.
+	opterr = 0;
+	int opt_val;
+	while ( ( opt_val = getopt( argc, argv, "i:m:n:" ) ) != -1 )
+	{
+		switch ( opt_val )
+		{
+			case 'i':
+					file_name = optarg;
+					break;
+			case 'm':
+					bench_mode = ( ( ( *optarg ) == 'a' ) || ( ( *optarg ) == 'p' ) ) ? ( *optarg ) : 'p';
+					break;
+			case 'n':
+					global_n_repeat = ( atoi( optarg ) > 0 ) ? atoi( optarg ) : 0;
+					break;
+			default:
+					break;
+		}
+	}
+
+	if ( bench_mode == 'p' )
+	{
+		printf( "Running bench in performance benchmarking mode.\n" );
+	}
+	else if ( bench_mode == 'a' )
+	{
+		printf( "Running bench in accuracy/correctness testing mode.\n" );
+	}
+
+	if ( file_name == NULL )
+	{
+		printf( " File name provided is invalid.\n" );
+		exit( 1 );
+	}
+
+	FILE* fin = fopen( file_name, "r" );
+	if (fin == NULL)
+	{
+		printf( "Error opening the file %s\n", file_name ); // argv[1] is the "-i" flag, not the path.
+		exit( 1 );
+	}
+
+	FILE* fout = fopen( "lpgemm_accuracy_test_failures.txt", "w" );
+
+	char l1_op_type[128]; // The %127s conversion below leaves room for the terminating NUL.
+	dim_t n;
+	inc_t incx;
+	while ( fscanf( fin, "%127s %ld %ld\n", l1_op_type, &n, &incx )  == 3 )
+	{
+		if ( strcmp( l1_op_type, "f32_gelu_tanh" ) == 0 )
+		{
+			gelu_bench_main_gelu_tanh_f32( fout, n, incx );
+		}
+		else if ( strcmp( l1_op_type, "f32_gelu_erf" ) == 0 )
+		{
+			gelu_bench_main_gelu_erf_f32( fout, n, incx );
+		}
+		else if ( strcmp( l1_op_type, "f32_softmax" ) == 0 )
+		{
+			softmax_bench_main_softmax_f32( fout, n, incx );
+		}
+	}
+
+	fclose( fin );
+	if ( fout != NULL ) fclose( fout ); // fout is NULL when the log file could not be created.
+	return 0;
+}
diff --git a/bench/bench_aocl_gemm/bench_utils_input.txt b/bench/bench_aocl_gemm/bench_utils_input.txt
new file mode 100644
index 0000000000..af9051b6a4
--- /dev/null
+++ b/bench/bench_aocl_gemm/bench_utils_input.txt
@@ -0,0 +1,33 @@
+f32_softmax 1 1
+f32_softmax 2 1
+f32_softmax 4 1
+f32_softmax 21 1
+f32_softmax 64 1
+f32_gelu_tanh 1 1
+f32_gelu_tanh 2 1
+f32_gelu_tanh 8 1
+f32_gelu_tanh 16 1
+f32_gelu_tanh 21 1
+f32_gelu_tanh 64 1
+f32_gelu_tanh 1029 1
+f32_gelu_erf 1 1
+f32_gelu_erf 2 1
+f32_gelu_erf 8 1
+f32_gelu_erf 16 1
+f32_gelu_erf 21 1
+f32_gelu_erf 64 1
+f32_gelu_erf 1029 1
+f32_gelu_tanh 1 9
+f32_gelu_tanh 2 9
+f32_gelu_tanh 8 9
+f32_gelu_tanh 16 1024
+f32_gelu_tanh 21 1024
+f32_gelu_tanh 64 1024
+f32_gelu_tanh 1029 512
+f32_gelu_erf 1 9
+f32_gelu_erf 2 9
+f32_gelu_erf 8 9
+f32_gelu_erf 16 1024
+f32_gelu_erf 21 1024
+f32_gelu_erf 64 1024
+f32_gelu_erf 1029 512
diff --git a/bench/bench_aocl_gemm/test_small.txt b/bench/bench_aocl_gemm/test_small.txt
new file mode 100644
index 0000000000..13f47bce81
--- /dev/null
+++ b/bench/bench_aocl_gemm/test_small.txt
@@ -0,0 +1,54 @@
+i r r 4 3 204 204 3 3
+i r r 6 5 204 204 5 5
+i r r 6 7 204 204 7 7
+i r r 6 9 204 204 9 9
+i r r 78402 8 190 190 8 8
+i r r 78402 9 190 190 9 9
+i r r 78402 10 190 190 10 10
+i r r 78402 11 190 190 11 11
+i r r 78402 12 190 190 12 12
+i r r 78402 13 190 190 13 13
+i r r 78402 14 190 190 14 14
+i r r 78402 15 190 190 15 15
+i r r 78403 8 190 190 8 8
+i r r 78403 9 190 190 9 9
+i r r 78403 10 190 190 10 10
+i r r 78403 11 190 190 11 11
+i r r 78403 12 190 190 12 12
+i r r 78403 13 190 190 13 13
+i r r 78403 14 190 190 14 14
+i r r 78403 15 190 190 15 15
+i r r 78404 8 190 190 8 8
+i r r 78404 9 190 190 9 9
+i r r 78404 10 190 190 10 10
+i r r 78404 11 190 190 11 11
+i r r 78404 12 190 190 12 12
+i r r 78404 13 190 190 13 13
+i r r 78404 14 190 190 14 14
+i r r 78404 15 190 190 15 15
+i r r 78405 8 190 190 8 8
+i r r 78405 9 190 190 9 9
+i r r 78405 10 190 190 10 10
+i r r 78405 11 190 190 11 11
+i r r 78405 12 190 190 12 12
+i r r 78405 13 190 190 13 13
+i r r 78405 14 190 190 14 14
+i r r 78405 15 190 190 15 15
+i r r 78406 8 190 190 8 8
+i r r 78406 9 190 190 9 9
+i r r 78406 10 190 190 10 10
+i r r 78406 11 190 190 11 11
+i r r 78406 12 190 190 12 12
+i r r 78406 13 190 190 13 13
+i r r 78406 14 190 190 14 14
+i r r 78406 15 190 190 15 15
+i r r 78407 8 190 190 8 8
+i r r 78407 9 190 190 9 9
+i r r 78407 10 190 190 10 10
+i r r 78407 11 190 190 11 11
+i r r 78407 12 190 190 12 12
+i r r 78407 13 190 190 13 13
+i r r 78407 14 190 190 14 14
+i r r 78407 15 190 190 15 15
+###
+i r r 78402 16 190 190 16 16
diff --git a/bench/bench_axpbyv.c b/bench/bench_axpbyv.c
index c962079dd6..db62ead33e 100644
--- a/bench/bench_axpbyv.c
+++ b/bench/bench_axpbyv.c
@@ -262,4 +262,4 @@ int main( int argc, char** argv )
 	}
 
 	return 0;
-}
\ No newline at end of file
+}
diff --git a/bench/bench_gemm.c b/bench/bench_gemm.c
index 908ce0fca5..d9dc523e92 100755
--- a/bench/bench_gemm.c
+++ b/bench/bench_gemm.c
@@ -109,6 +109,10 @@ int main( int argc, char** argv )
         printf("Error opening output file %s\n", argv[2]);
         exit(1);
     }
+	if (argc > 3)
+	{
+		n_repeats = atoi(argv[3]);
+	}
 
     fprintf(fout, "Dt transa transb m n k alphaR alphaI lda ldb betaR betaI ldc gflops\n");
 
diff --git a/bench/bench_swapv.c b/bench/bench_swapv.c
index 34af6b7975..6f2c8fd90e 100644
--- a/bench/bench_swapv.c
+++ b/bench/bench_swapv.c
@@ -248,4 +248,4 @@ int main( int argc, char** argv )
     fclose(fout);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/bench/bench_trsv.c b/bench/bench_trsv.c
index ddf3ea187a..425f61f1d0 100644
--- a/bench/bench_trsv.c
+++ b/bench/bench_trsv.c
@@ -395,4 +395,4 @@ int main( int argc, char** argv )
 
     // bli_finalize();
     return 0;
-}
\ No newline at end of file
+}
diff --git a/bench/inputnrm2.txt b/bench/inputnrm2.txt
index 567d6e4691..517f5eac41 100644
--- a/bench/inputnrm2.txt
+++ b/bench/inputnrm2.txt
@@ -39,4 +39,4 @@ dnrm2:171: D 8192  5
 dnrm2:171: D 16384 11
 dnrm2:171: D 20976 3
 dnrm2:171: D 56841 19
-dnrm2:171: D 65536 6
\ No newline at end of file
+dnrm2:171: D 65536 6
diff --git a/blastest/f2c/CMakeLists.txt b/blastest/f2c/CMakeLists.txt
index 00d8291164..87ec3b6a5b 100644
--- a/blastest/f2c/CMakeLists.txt
+++ b/blastest/f2c/CMakeLists.txt
@@ -56,4 +56,4 @@ target_sources("${F2C_LIB}"
     ${CMAKE_CURRENT_SOURCE_DIR}/wrtfmt.c
     ${CMAKE_CURRENT_SOURCE_DIR}/wsfe.c
     ${CMAKE_CURRENT_SOURCE_DIR}/wsle.c
-    )
\ No newline at end of file
+    )
diff --git a/blastest/f2c/endfile.c b/blastest/f2c/endfile.c
index def9988d12..8be2d826b5 100644
--- a/blastest/f2c/endfile.c
+++ b/blastest/f2c/endfile.c
@@ -21,6 +21,10 @@ other tortious action, arising out of or in connection with the
 use or performance of this software.
 ****************************************************************/
 
+/*
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+*/
+
 #include <f2c_config.h>
 #include "f2c.h"
 #include "fio.h"
@@ -43,7 +47,9 @@ integer f_end(alist *a)
 	if(a->aunit>=MXUNIT || a->aunit<0) err(a->aerr,101,"endfile");
 	b = &f__units[a->aunit];
 	if(b->ufd==NULL) {
-		char nbuf[10];
+		/* Increased buffer size from 10 to 17 to eliminate
+		   warning message from gcc. */
+		char nbuf[17];
 		sprintf(nbuf,"fort.%ld",(long)a->aunit);
 		if (tf = fopen(nbuf, f__w_mode[0]))
 			fclose(tf);
diff --git a/blastest/f2c/f2c_config.h b/blastest/f2c/f2c_config.h
index af39bbe5d0..303cddc517 100644
--- a/blastest/f2c/f2c_config.h
+++ b/blastest/f2c/f2c_config.h
@@ -188,4 +188,4 @@
 
 #ifdef _MSC_VER
 #define NON_UNIX_STDIO 1
-#endif
\ No newline at end of file
+#endif
diff --git a/build/auto_config.py b/build/auto_config.py
index 4221ca637d..1ce3989e4e 100644
--- a/build/auto_config.py
+++ b/build/auto_config.py
@@ -1,73 +1,73 @@
-"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved"""
-
-import subprocess
-import sys
-
-def config_check():
-    # Execute wmic shell command with sub-process
-    result = subprocess.run(['wmic', 'cpu', 'get', 'caption'], stdout=subprocess.PIPE, text=True).stdout
-
-    # Replace the newline character with empty char
-    result=result.replace('\n', '')
-
-    # parse the string into list of string
-    parse_string=result.split(" ")
-
-    # Strip the empty strings from list
-    parse_string=[list for list in parse_string if list.strip()]
-
-    vendor=parse_string[1]
-    family=hex(int(parse_string[3]))
-    model=hex(int(parse_string[5]))
-    stepping=hex(int(parse_string[7]))
-
-    # AMD family numbers
-    # Zen/ Zen+/Zen2 family number
-    zen_family="0x17"
-    # Bulldozer / Piledriver / Steamroller / Excavator family number
-    amd_family="0x15"
-
-    # AMD CPUID model numbers
-    zen_model=["0x30", "0xff"]
-    zen2_model=["0x00", "0xff"]
-    excavator_model=["0x60","0x7f"]
-    steamroller_model=["0x30", "0x3f"]
-    piledriver_model=["0x02", "0x10", "0x1f"]
-    bulldozer_model=["0x00", "0x01"]
-
-    # Check the CPU configuration Intel64/AMD64
-    if vendor.count("Intel64"):
-        return
-    elif vendor.count("AMD64"):
-        # Check the AMD family name
-        if family == zen_family:
-            if (zen_model[0] <= model and model <= zen_model[1]) :
-                family="zen2"
-            elif (zen2_model[0] <= model and model <= zen2_model[1]) :
-                family="zen"
-            else:
-                print("Unknown model number")
-        elif family == amd_family:
-            # check for specific models of excavator family
-            if (excavator_model[0] <= model and model <= excavator_model[1]) :
-                family="excavator"
-            # check for specific models of steamroller family
-            elif (steamroller_model[0] <= model and model <= steamroller_model[1]) :
-                family="steamroller"
-            # check for specific models of piledriver family
-            elif (model == piledriver_model[0] or (piledriver_model[1] <= model and model <= piledriver_model[2])) :
-                family="piledriver"
-            # check for specific models of bulldozer family
-            elif (model == bulldozer_model[0] or model == bulldozer_model[1]) :
-                family="bulldozer"
-            else:
-                print("Unknown model number")
-        else:
-            print("Unknown family")
-    else:
-        print("UNKNOWN CPU")
-    return family
-
-# Function call for config family names
-FAMILY=config_check()
-print(FAMILY)
\ No newline at end of file
+"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved"""
+
+import subprocess
+import sys
+
+def config_check():
+    """Return the config family name for the CPU that 'wmic cpu get caption' reports."""
+    result = subprocess.run(['wmic', 'cpu', 'get', 'caption'], stdout=subprocess.PIPE, text=True).stdout
+
+    # Join the output lines so the caption can be split as one string
+    result=result.replace('\n', '')
+
+    # Split the caption into space-separated tokens
+    parse_string=result.split(" ")
+
+    # Drop the empty tokens left behind by repeated spaces
+    parse_string=[token for token in parse_string if token.strip()]
+
+    vendor=parse_string[1]
+    family=hex(int(parse_string[3]))
+    model=int(parse_string[5])
+    stepping=hex(int(parse_string[7]))
+
+    # AMD family numbers
+    # Zen/ Zen+/Zen2 family number
+    zen_family="0x17"
+    # Bulldozer / Piledriver / Steamroller / Excavator family number
+    amd_family="0x15"
+
+    # AMD CPUID model numbers, kept as integers so the range checks compare numerically
+    zen2_model=[0x30, 0xff]
+    zen_model=[0x00, 0xff]
+    excavator_model=[0x60, 0x7f]
+    steamroller_model=[0x30, 0x3f]
+    piledriver_model=[0x02, 0x10, 0x1f]
+    bulldozer_model=[0x00, 0x01]
+
+    # Intel64 is not mapped to a family (returns None); AMD64 is resolved below
+    if vendor.count("Intel64"):
+        return
+    elif vendor.count("AMD64"):
+        # Map family/model to a config name; unmatched values keep the hex family string
+        if family == zen_family:
+            if zen2_model[0] <= model <= zen2_model[1]:
+                family="zen2"
+            elif zen_model[0] <= model <= zen_model[1]:
+                family="zen"
+            else:
+                print("Unknown model number")
+        elif family == amd_family:
+            # check for specific models of excavator family
+            if excavator_model[0] <= model <= excavator_model[1]:
+                family="excavator"
+            # check for specific models of steamroller family
+            elif steamroller_model[0] <= model <= steamroller_model[1]:
+                family="steamroller"
+            # check for specific models of piledriver family
+            elif model == piledriver_model[0] or piledriver_model[1] <= model <= piledriver_model[2]:
+                family="piledriver"
+            # check for specific models of bulldozer family
+            elif model in bulldozer_model:
+                family="bulldozer"
+            else:
+                print("Unknown model number")
+        else:
+            print("Unknown family")
+    else:
+        print("UNKNOWN CPU")
+    return family
+
+# Function call for config family names
+FAMILY=config_check()
+print(FAMILY)
diff --git a/build/bli_config.h.in b/build/bli_config.h.in
index 6c17fc5e74..ba0c16100b 100644
--- a/build/bli_config.h.in
+++ b/build/bli_config.h.in
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -132,6 +132,16 @@
 #endif
 #endif
 
+// If the CBLAS compatibility layer was enabled while the BLAS layer
+// was not enabled, we must enable the BLAS layer here. Also undefine
+// BLIS_DISABLE_BLAS to ensure consistency.
+#ifdef BLIS_ENABLE_CBLAS
+#ifndef BLIS_ENABLE_BLAS
+#define BLIS_ENABLE_BLAS
+#endif
+#undef BLIS_DISABLE_BLAS
+#endif // BLIS_ENABLE_CBLAS
+
 #ifndef BLIS_ENABLE_MIXED_DT
 #ifndef BLIS_DISABLE_MIXED_DT
 #if @enable_mixed_dt@
@@ -196,8 +206,10 @@
 
 #if @disable_blis_arch_type@
 #define DISABLE_BLIS_ARCH_TYPE
+#define DISABLE_BLIS_MODEL_TYPE
 #endif
 
 #define __blis_arch_type_name "@rename_blis_arch_type@"
+#define __blis_model_type_name "@rename_blis_model_type@"
 
 #endif
diff --git a/build/bli_win_config.h.in b/build/bli_win_config.h.in
index 24e1fc3d59..4645b5cf95 100644
--- a/build/bli_win_config.h.in
+++ b/build/bli_win_config.h.in
@@ -1,54 +1,58 @@
-/*
- * Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All Rights Reserved
- */
-
-#ifndef BLIS_CONFIG_H
-#define BLIS_CONFIG_H
-
-#cmakedefine AOCL_DYNAMIC
-
-#cmakedefine AOCL_BLIS_ZEN
-
-#cmakedefine BLIS_ENABLE_OPENMP
-
-#cmakedefine BLIS_ENABLE_JRIR_SLAB
-
-#cmakedefine BLIS_ENABLE_JRIR_RR
-
-#cmakedefine BLIS_ENABLE_PBA_POOLS
-
-#cmakedefine BLIS_ENABLE_SBA_POOLS
-
-#cmakedefine BLIS_ENABLE_MEM_TRACING
-
-#cmakedefine BLIS_INT_TYPE_SIZE @INT_TYPE_SIZE@
-
-#cmakedefine BLIS_BLAS_INT_TYPE_SIZE @BLAS_INT_TYPE_SIZE@
-
-#cmakedefine BLIS_ENABLE_BLAS
-
-#cmakedefine BLIS_ENABLE_CBLAS
-
-#cmakedefine BLIS_ENABLE_MIXED_DT
-
-#cmakedefine BLIS_ENABLE_MIXED_DT_EXTRA_MEM
-
-#cmakedefine BLIS_ENABLE_SUP_HANDLING
-
-#cmakedefine BLIS_ENABLE_MEMKIND
-
-#cmakedefine BLIS_ENABLE_TRSM_PREINVERSION
-
-#cmakedefine BLIS_ENABLE_PRAGMA_OMP_SIMD
-
-#cmakedefine BLIS_ENABLE_SANDBOX
-
-#cmakedefine BLIS_ENABLE_SHARED
-
-#cmakedefine BLIS_ENABLE_COMPLEX_RETURN_INTEL
-
-#cmakedefine DISABLE_BLIS_ARCH_TYPE
-
-#cmakedefine __blis_arch_type_name "@rename_blis_arch_type@"
-
-#endif
+/*
+ * Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+ */
+
+#ifndef BLIS_CONFIG_H
+#define BLIS_CONFIG_H
+
+#cmakedefine AOCL_DYNAMIC
+
+#cmakedefine AOCL_BLIS_ZEN
+
+#cmakedefine BLIS_ENABLE_OPENMP
+
+#cmakedefine BLIS_ENABLE_JRIR_SLAB
+
+#cmakedefine BLIS_ENABLE_JRIR_RR
+
+#cmakedefine BLIS_ENABLE_PBA_POOLS
+
+#cmakedefine BLIS_ENABLE_SBA_POOLS
+
+#cmakedefine BLIS_ENABLE_MEM_TRACING
+
+#cmakedefine BLIS_INT_TYPE_SIZE @INT_TYPE_SIZE@
+
+#cmakedefine BLIS_BLAS_INT_TYPE_SIZE @BLAS_INT_TYPE_SIZE@
+
+#cmakedefine BLIS_ENABLE_BLAS
+
+#cmakedefine BLIS_ENABLE_CBLAS
+
+#cmakedefine BLIS_ENABLE_MIXED_DT
+
+#cmakedefine BLIS_ENABLE_MIXED_DT_EXTRA_MEM
+
+#cmakedefine BLIS_ENABLE_SUP_HANDLING
+
+#cmakedefine BLIS_ENABLE_MEMKIND
+
+#cmakedefine BLIS_ENABLE_TRSM_PREINVERSION
+
+#cmakedefine BLIS_ENABLE_PRAGMA_OMP_SIMD
+
+#cmakedefine BLIS_ENABLE_SANDBOX
+
+#cmakedefine BLIS_ENABLE_SHARED
+
+#cmakedefine BLIS_ENABLE_COMPLEX_RETURN_INTEL
+
+#cmakedefine DISABLE_BLIS_ARCH_TYPE
+
+#cmakedefine DISABLE_BLIS_MODEL_TYPE
+
+#cmakedefine __blis_arch_type_name "@rename_blis_arch_type@"
+
+#cmakedefine __blis_model_type_name "@rename_blis_model_type@"
+
+#endif
diff --git a/build/blis_ref_kernel_mirror.py b/build/blis_ref_kernel_mirror.py
index 834de1cee9..f49d101ae7 100644
--- a/build/blis_ref_kernel_mirror.py
+++ b/build/blis_ref_kernel_mirror.py
@@ -1,4 +1,18 @@
-"""Copyright (C) 2021, Advanced Micro Devices, Inc. All Rights Reserved"""
+"""Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All Rights Reserved"""
+
+################################################################################
+# This file is used for mirroring the refkernels folder data into the zen,     #
+# zen2, zen3, zen4 and generic folders.                                        #
+# Rename all .c files by adding zen, zen2, zen3, zen4 and generic for the      #
+# corresponding folder .c files and update the corresponding CMakeLists.txt    #
+# file for amdzen (dynamic dispatcher) config option.                          #
+#                                                                              #
+# Usage:                                                                       #
+#       python blis_ref_kernel_mirror.py <project build directory name>        #
+#                                                                              #
+# Author: Chandrashekara K R <chandrkr@amd.com>                                #
+#                                                                              #
+################################################################################
 import os
 import shutil
 import subprocess
@@ -95,10 +109,25 @@ def write_to_file(filename, data):
         fd.write(data + '\n')
 
 
+def update_cmakelists_contents(cmakefiles, replacement_str):
+    for cmakefile in cmakefiles:
+        if os.path.exists(cmakefile):
+            # Rewrite each 'ref.c' occurrence so CMakeLists.txt names the renamed .c files
+            with open(cmakefile, 'r') as fd:
+                file_content = fd.read()
+                file_content = file_content.replace(
+                    'ref.c', replacement_str + '_ref.c')
+            with open(cmakefile, 'w') as fd:
+                fd.write(file_content)
+
+
 def add_macro_to_cfiles(cfiles, macro):
     for cfile in cfiles:
         if os.path.exists(cfile):
             write_to_file(cfile, macro)
+            # Rename *ref.c to *<config>_ref.c (presumably to match the Linux build naming)
+            os.rename(cfile,  cfile.split('ref.c')[0] + macro.split(' ')[
+                -1].split('\n')[0][1:] + '_ref.c')
 
 
 if __name__ == '__main__':
@@ -109,6 +138,7 @@ def add_macro_to_cfiles(cfiles, macro):
     if os.path.exists(dest_path):
         remove_folder(dest_path)
 
+    # Creating all the required folders
     temp = os.path.join(cwd, 'temp')
     create_folder(temp)
     execute_and_check('XCOPY {} {} /E'.format(source_path, temp))
@@ -117,6 +147,7 @@ def add_macro_to_cfiles(cfiles, macro):
     create_folder(os.path.join(dest_path, 'zen3'))
     create_folder(os.path.join(dest_path, 'zen4'))
     create_folder(os.path.join(dest_path, 'generic'))
+    # Mirroring refkernels folder data to zen, zen2, zen3, zen4 and generic folder
     execute_and_check('XCOPY {} {} /E'.format(
         temp, os.path.join(dest_path, 'zen')))
     execute_and_check('XCOPY {} {} /E'.format(
@@ -144,23 +175,53 @@ def add_macro_to_cfiles(cfiles, macro):
     cfiles_in_generic = cfiles_in_generic.split('\r\n')
     add_macro_to_cfiles(cfiles_in_generic,
                         '\n#define BLIS_CNAME_INFIX _generic\n')
+    # List all CMakeLists.txt files in the generic folder and update them.
+    cmake_files_in_generic = execute_and_check(
+        'cd {} && dir / s / b / o: gn CMakeLists.txt'.format(
+            os.path.join(dest_path, 'generic')))
+    cmake_files_in_generic = cmake_files_in_generic.split('\r\n')
+    update_cmakelists_contents(cmake_files_in_generic, 'generic')
     cfiles_in_zen = execute_and_check('cd {} && dir / s / b / o: gn *.c'
                                       .format(os.path.join(dest_path, 'zen')))
     cfiles_in_zen = cfiles_in_zen.split('\r\n')
     add_macro_to_cfiles(cfiles_in_zen,
                         '\n#define BLIS_CNAME_INFIX _zen\n')
+    # List all CMakeLists.txt files in the zen folder and update them.
+    cmake_files_in_zen = execute_and_check(
+        'cd {} && dir / s / b / o: gn CMakeLists.txt'.format(
+            os.path.join(dest_path, 'zen')))
+    cmake_files_in_zen = cmake_files_in_zen.split('\r\n')
+    update_cmakelists_contents(cmake_files_in_zen, 'zen')
     cfiles_in_zen2 = execute_and_check('cd {} && dir / s / b / o: gn *.c'
                                        .format(os.path.join(dest_path, 'zen2')))
     cfiles_in_zen2 = cfiles_in_zen2.split('\r\n')
     add_macro_to_cfiles(cfiles_in_zen2,
                         '\n#define BLIS_CNAME_INFIX _zen2\n')
+    # List all CMakeLists.txt files in the zen2 folder and update them.
+    cmake_files_in_zen2 = execute_and_check(
+        'cd {} && dir / s / b / o: gn CMakeLists.txt'.format(
+            os.path.join(dest_path, 'zen2')))
+    cmake_files_in_zen2 = cmake_files_in_zen2.split('\r\n')
+    update_cmakelists_contents(cmake_files_in_zen2, 'zen2')
     cfiles_in_zen3 = execute_and_check('cd {} && dir / s / b / o: gn *.c'
                                        .format(os.path.join(dest_path, 'zen3')))
     cfiles_in_zen3 = cfiles_in_zen3.split('\r\n')
     add_macro_to_cfiles(cfiles_in_zen3,
                         '\n#define BLIS_CNAME_INFIX _zen3\n')
+    # List all CMakeLists.txt files in the zen3 folder and update them.
+    cmake_files_in_zen3 = execute_and_check(
+        'cd {} && dir / s / b / o: gn CMakeLists.txt'.format(
+            os.path.join(dest_path, 'zen3')))
+    cmake_files_in_zen3 = cmake_files_in_zen3.split('\r\n')
+    update_cmakelists_contents(cmake_files_in_zen3, 'zen3')
     cfiles_in_zen4 = execute_and_check('cd {} && dir / s / b / o: gn *.c'
                                        .format(os.path.join(dest_path, 'zen4')))
     cfiles_in_zen4 = cfiles_in_zen4.split('\r\n')
     add_macro_to_cfiles(cfiles_in_zen4,
                         '\n#define BLIS_CNAME_INFIX _zen4\n')
+    # List all CMakeLists.txt files in the zen4 folder and update them.
+    cmake_files_in_zen4 = execute_and_check(
+        'cd {} && dir / s / b / o: gn CMakeLists.txt'.format(
+            os.path.join(dest_path, 'zen4')))
+    cmake_files_in_zen4 = cmake_files_in_zen4.split('\r\n')
+    update_cmakelists_contents(cmake_files_in_zen4, 'zen4')
diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt
index 7429ff42ee..3a5925a306 100644
--- a/config/CMakeLists.txt
+++ b/config/CMakeLists.txt
@@ -25,4 +25,4 @@ add_subdirectory(haswell)
 else(${TARGET_ARCH} STREQUAL generic)
 message("The configuration is : ${TARGET_ARCH}")
 add_subdirectory(generic)
-endif()
\ No newline at end of file
+endif()
diff --git a/config/amdzen/bli_family_amdzen.h b/config/amdzen/bli_family_amdzen.h
index 0cf46d5a4e..7e4d460d13 100644
--- a/config/amdzen/bli_family_amdzen.h
+++ b/config/amdzen/bli_family_amdzen.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -72,6 +72,8 @@
  */
 BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
 
+BLIS_EXPORT_BLIS void bli_zen4_override_gemmt_blkszs (cntx_t* cntx);
+
 /*
  * Restore the block sizes to default values needed for zen4 context.
  *
diff --git a/config/haswell/CMakeLists.txt b/config/haswell/CMakeLists.txt
index a16f3ef51b..a43bfe2b23 100644
--- a/config/haswell/CMakeLists.txt
+++ b/config/haswell/CMakeLists.txt
@@ -18,4 +18,4 @@ if(FILES)
 
     #Install our source files
     install(FILES ${FILES} DESTINATION ${RELATIVE_PATH})
-endif()
\ No newline at end of file
+endif()
diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c
index 9d4197712e..83ce2cf8b6 100644
--- a/config/zen/bli_cntx_init_zen.c
+++ b/config/zen/bli_cntx_init_zen.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -103,7 +103,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
     // Update the context with optimized level-1v kernels.
     bli_cntx_set_l1v_kers
     (
-      26,
+      29,
       
       // amaxv
       BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -136,6 +136,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
 
       BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
       BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
+      BLIS_SCALV_KER,  BLIS_DCOMPLEX, bli_zscalv_zen_int,
 
       // swapv
       BLIS_SWAPV_KER,  BLIS_FLOAT,  bli_sswapv_zen_int8,
@@ -144,10 +145,14 @@ void bli_cntx_init_zen( cntx_t* cntx )
       // copyv
       BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
       BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+      BLIS_COPYV_KER,  BLIS_DCOMPLEX, bli_zcopyv_zen_int,
 
       //set
       BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
       BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
+      // scal2v
+      BLIS_SCAL2V_KER,  BLIS_DCOMPLEX,  bli_zscal2v_zen_int,
       cntx
     );
 
@@ -251,7 +256,7 @@ void bli_cntx_init_zen( cntx_t* cntx )
     // Update the context with optimized small/unpacked gemm kernels.
     bli_cntx_set_l3_sup_kers
     (
-      28,
+      30,
       //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
       BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
       BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
@@ -276,9 +281,11 @@ void bli_cntx_init_zen( cntx_t* cntx )
       BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
       BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
       BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE,
       BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
       BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE,
       BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
       BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
       cntx
diff --git a/config/zen/make_defs.mk b/config/zen/make_defs.mk
index b4153fcbfb..59fc7b0a67 100644
--- a/config/zen/make_defs.mk
+++ b/config/zen/make_defs.mk
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -33,8 +33,9 @@
 #
 #
 
-# FLAGS specific to zen architecture are added here.
-# FLAGS that are common for all the AMD architectures are present in amd_config.mk
+# FLAGS that are specific to the 'zen' architecture are added here.
+# FLAGS that are common for all the AMD architectures are present in
+# config/zen/amd_config.mk.
 
 # Declare the name of the current configuration and add it to the
 # running list of configurations included by common.mk.
@@ -46,10 +47,27 @@ AMD_CONFIG_FILE := amd_config.mk
 AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
 -include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
 
+#
+# --- Determine the C compiler and related flags ---
+#
+
+# NOTE: The build system will append these variables with various
+# general-purpose/configuration-agnostic flags in common.mk. You
+# may specify additional flags here as needed.
+
+CPPROCFLAGS    :=
+CMISCFLAGS     :=
+CPICFLAGS      :=
+CWARNFLAGS     :=
+
+ifneq ($(DEBUG_TYPE),off)
+  CDBGFLAGS    := -g
+endif
+
 ifeq ($(DEBUG_TYPE),noopt)
-COPTFLAGS      := -O0
+  COPTFLAGS    := -O0
 else
-COPTFLAGS      := -O3
+  COPTFLAGS    := -O3
 endif
 
 #
@@ -61,16 +79,21 @@ endif
 # they make explicit use of the rbp register.
 CKOPTFLAGS     := $(COPTFLAGS) -fomit-frame-pointer
 ifeq ($(CC_VENDOR),gcc)
-CKVECFLAGS += -march=znver1
-endif
+  CKVECFLAGS += -march=znver1
+  GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
+
+  ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse
+  endif
+endif# gcc
 
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
 ifeq ($(CC_VENDOR),gcc)
-CRVECFLAGS     := $(CKVECFLAGS)
+  CRVECFLAGS   := $(CKVECFLAGS)
 else
-CRVECFLAGS     := $(CKVECFLAGS)
+  CRVECFLAGS   := $(CKVECFLAGS)
 endif
 
 # Store all of the variables here to new variables containing the
diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c
index 3ce2fced92..42eae35d95 100644
--- a/config/zen2/bli_cntx_init_zen2.c
+++ b/config/zen2/bli_cntx_init_zen2.c
@@ -3,7 +3,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -115,7 +115,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	// Update the context with optimized level-1v kernels.
 	bli_cntx_set_l1v_kers
 	(
-      26,
+	  29,
 
 	  // amaxv
 	  BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -148,6 +148,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  // scalv
 	  BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
 	  BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
+	  BLIS_SCALV_KER,  BLIS_DCOMPLEX, bli_zscalv_zen_int,
 
 	  //swap
 	  BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
@@ -156,10 +157,14 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  //copy
 	  BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
 	  BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+	  BLIS_COPYV_KER,  BLIS_DCOMPLEX, bli_zcopyv_zen_int,
 
 	  //set
 	  BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
 	  BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
+	  // scal2v
+	  BLIS_SCAL2V_KER,  BLIS_DCOMPLEX,  bli_zscal2v_zen_int,
 	  cntx
 	);
 
@@ -247,7 +252,7 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	// Update the context with optimized small/unpacked gemm kernels.
 	bli_cntx_set_l3_sup_kers
 	(
-      28,
+      30,
 	  //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
 	  BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
 	  BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
@@ -267,15 +272,17 @@ void bli_cntx_init_zen2( cntx_t* cntx )
 	  BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
 	  BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
 	  BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
-      BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
+	  BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
 	  BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
 	  BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
 	  BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
 
 	  BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE,
 	  BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
 	  BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+	  BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+	  BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE,
 	  BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
 	  BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
 	  cntx
diff --git a/config/zen2/make_defs.mk b/config/zen2/make_defs.mk
index 3b87d35b00..180c201b06 100644
--- a/config/zen2/make_defs.mk
+++ b/config/zen2/make_defs.mk
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -42,6 +42,11 @@
 THIS_CONFIG    := zen2
 #CONFIGS_INCL   += $(THIS_CONFIG)
 
+# Include file containing common flags for all AMD architectures
+AMD_CONFIG_FILE := amd_config.mk
+AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
+-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+
 #
 # --- Determine the C compiler and related flags ---
 #
@@ -56,48 +61,68 @@ CPICFLAGS      :=
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
-CDBGFLAGS      := -g
+  CDBGFLAGS    := -g
 endif
 
 ifeq ($(DEBUG_TYPE),noopt)
-COPTFLAGS      := -O0
+  COPTFLAGS    := -O0
 else
-COPTFLAGS      := -O3
+  COPTFLAGS    := -O3
 endif
 
 # Flags specific to optimized kernels.
 # NOTE: The -fomit-frame-pointer option is needed for some kernels because
 # they make explicit use of the rbp register.
 CKOPTFLAGS     := $(COPTFLAGS) -fomit-frame-pointer
+
+# gcc or clang version must be at least 4.0
 ifeq ($(CC_VENDOR),gcc)
-GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
-#gcc or clang version must be atleast 4.0
-# gcc 9.0 or later:
-ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
-CKVECFLAGS     += -march=znver2
-else
-# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
-# as the fallback option.
-CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-endif
-else
+  GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
+
+  ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
+    # gcc 9.0 or later
+    CKVECFLAGS += -march=znver2
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse
+  else
+    # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
+    # as the fallback option.
+    CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+    CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+  endif
+endif # gcc
+
 ifeq ($(CC_VENDOR),clang)
-ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
-CKVECFLAGS += -march=znver2
-else
-#if compiling with clang
-VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
-CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
-#clang 9.0 or later:
-ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
-CKVECFLAGS += -march=znver2
-else
-CKVECFLAGS += -march=znver1
-endif
-endif
-endif
-endif
+  # AOCC clang has various formats for the version line
+
+  # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
+  # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
+  # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
+  # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
+  # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+  # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0)
+
+  # For our purposes we just want to know whether it is version 2.x, 3.x or 4.x
+
+  # But also set these in case we are using upstream LLVM clang
+  VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
+  CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
+
+  ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1)
+    # AOCC version 4x we will enable znver2
+    CKVECFLAGS += -march=znver2
+  else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
+    # AOCC version 3x we will enable znver2
+    CKVECFLAGS += -march=znver2
+  else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
+    # AOCC version 2x we will enable znver2
+    CKVECFLAGS += -march=znver2
+  else ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
+    # LLVM clang 9.0 or later
+    CKVECFLAGS += -march=znver2
+  else
+    CKVECFLAGS += -march=znver1
+  endif
+endif # clang
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c
index 779bb7277c..31a9ff5957 100644
--- a/config/zen3/bli_cntx_init_zen3.c
+++ b/config/zen3/bli_cntx_init_zen3.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -115,7 +115,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
     // Update the context with optimized level-1v kernels.
     bli_cntx_set_l1v_kers
     (
-      26,
+      29,
 
       // amaxv
       BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int,
@@ -148,6 +148,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
       // scalv
       BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
       BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
+      BLIS_SCALV_KER,  BLIS_DCOMPLEX, bli_zscalv_zen_int,
 
       //swap
       BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
@@ -156,10 +157,14 @@ void bli_cntx_init_zen3( cntx_t* cntx )
       //copy
       BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
       BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+      BLIS_COPYV_KER,  BLIS_DCOMPLEX, bli_zcopyv_zen_int,
 
       //set
       BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
       BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
+      // scal2v
+      BLIS_SCAL2V_KER,  BLIS_DCOMPLEX,  bli_zscal2v_zen_int,
       cntx
     );
 
@@ -243,7 +248,7 @@ void bli_cntx_init_zen3( cntx_t* cntx )
     // Update the context with optimized small/unpacked gemm kernels.
     bli_cntx_set_l3_sup_kers
     (
-      28,
+      30,
       //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
       BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
       BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
@@ -268,9 +273,11 @@ void bli_cntx_init_zen3( cntx_t* cntx )
       BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
       BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
       BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE,
       BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
       BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE,
       BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
       BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
       cntx
diff --git a/config/zen3/make_defs.mk b/config/zen3/make_defs.mk
index 8522a1e956..7ec1ee32e9 100644
--- a/config/zen3/make_defs.mk
+++ b/config/zen3/make_defs.mk
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -42,6 +42,11 @@
 THIS_CONFIG    := zen3
 #CONFIGS_INCL   += $(THIS_CONFIG)
 
+# Include file containing common flags for all AMD architectures
+AMD_CONFIG_FILE := amd_config.mk
+AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
+-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+
 #
 # --- Determine the C compiler and related flags ---
 #
@@ -56,69 +61,77 @@ CPICFLAGS      :=
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
-CDBGFLAGS      := -g
+  CDBGFLAGS    := -g
 endif
 
 ifeq ($(DEBUG_TYPE),noopt)
-COPTFLAGS      := -O0
+  COPTFLAGS    := -O0
 else
-COPTFLAGS      := -O3
+  COPTFLAGS    := -O3
 endif
 
 # Flags specific to optimized kernels.
 # NOTE: The -fomit-frame-pointer option is needed for some kernels because
 # they make explicit use of the rbp register.
 CKOPTFLAGS     := $(COPTFLAGS) -fomit-frame-pointer
+
+# gcc or clang version must be at least 4.0
 ifeq ($(CC_VENDOR),gcc)
-GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
-# gcc or clang version must be atleast 4.0
-# gcc 9.0 or later:
-ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
-CKVECFLAGS     += -march=znver3
-else
-ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
-CKVECFLAGS     += -march=znver2
-else
-# If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
-# as the fallback option.
-CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-endif # GCC 9
-endif # GCC 11
-else
+  GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
+
+  ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
+    # gcc 11.0 or later
+    CKVECFLAGS += -march=znver3
+    # Update CKOPTFLAGS for gcc to use O3 optimization without the
+    # -ftree-pre and -ftree-partial-pre flags. These flags result
+    # in suboptimal code generation for intrinsic-based kernels.
+    # -ftree-loop-vectorize results in inefficient code generation
+    # for AMD-optimized l1 kernels based on intrinsics.
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse
+  else ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
+    # gcc 9.0 or later
+    CKVECFLAGS += -march=znver2
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize -fno-gcse
+  else
+    # If gcc is older than 9.1.0 but at least 6.1.0, then we can use -march=znver1
+    # as the fallback option.
+    CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+    CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+  endif
+endif # gcc
+
 ifeq ($(CC_VENDOR),clang)
+  # AOCC clang has various formats for the version line
 
-# AOCC clang has various formats for the version line
+  # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
+  # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
+  # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
+  # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
+  # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+  # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0)
 
-# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
-# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
-# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
-# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
-# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+  # For our purposes we just want to know whether it is version 2.x, 3.x or 4.x
 
-# For our prupose we just want to know if it version 2x or 3x
+  # But also set these in case we are using upstream LLVM clang
+  VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
+  CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
 
-# for version 3x we will enable znver3
-ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
-CKVECFLAGS += -march=znver3
-else
-# for version 2x we will enable znver2
-ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
-CKVECFLAGS += -march=znver2
-else
-#if compiling with clang
-VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
-CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
-#clang 9.0 or later:
-ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
-CKVECFLAGS += -march=znver2
-else
-CKVECFLAGS += -march=znver1
-endif # ge 9
-endif # aocc 2
-endif # aocc 3
+  ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1)
+    # AOCC version 4x we will enable znver3
+    CKVECFLAGS += -march=znver3
+  else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
+    # AOCC version 3x we will enable znver3
+    CKVECFLAGS += -march=znver3
+  else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
+    # AOCC version 2x we will enable znver2
+    CKVECFLAGS += -march=znver2
+  else ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
+    # LLVM clang 9.0 or later
+    CKVECFLAGS += -march=znver2
+  else
+    CKVECFLAGS += -march=znver1
+  endif
 endif # clang
-endif # gcc
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
diff --git a/config/zen4/bli_cntx_init_zen4.c b/config/zen4/bli_cntx_init_zen4.c
index ac9875abf6..8dda84ccce 100644
--- a/config/zen4/bli_cntx_init_zen4.c
+++ b/config/zen4/bli_cntx_init_zen4.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,16 +39,29 @@
  * Converted it to macro as this list is used at multiple places in this file.
  */
 
-#define BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs) \
+#define BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs) \
     /*                                           s      d      c      z */  \
-    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    32,    16,     3,     3 );  \
-    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,    14,     8,     4 );  \
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   512,   240,   144,    18 );  \
-    bli_blksz_init     ( &blkszs[ BLIS_KC ],   480,   512,   256,   566,    \
-                                               480,   320,   256,   566 );  \
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  6144,  4004,  4080,   256 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    32,    32,     3,    12 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,     6,     8,     4 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   512,   128,   144,    60 );  \
+    bli_blksz_init     ( &blkszs[ BLIS_KC ],   480,   512,   256,   512,    \
+                                               480,   320,   256,   160 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  6144,  4002,  4080,  2004 );  \
                                                                             \
-    bli_blksz_init_easy( &blkszs[ BLIS_AF ],     8,     8,    -1,    -1 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_AF ],     5,     5,    -1,    -1 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );  \
+
+
+#define BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs) \
+    /*                                           s      d      c      z */  \
+    bli_blksz_init_easy( &blkszs[ BLIS_MR ],    32,    32,     3,    12 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    12,     6,     8,     4 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   512,    64,   144,    60 );  \
+    bli_blksz_init     ( &blkszs[ BLIS_KC ],   480,   512,   256,   512,    \
+                                               480,   320,   256,   160 );  \
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  6144,  3600,  4080,  2004 );  \
+                                                                            \
+    bli_blksz_init_easy( &blkszs[ BLIS_AF ],     5,     5,    -1,    -1 );  \
     bli_blksz_init_easy( &blkszs[ BLIS_DF ],     8,     8,    -1,    -1 );  \
 
 
@@ -68,19 +81,21 @@ void bli_cntx_init_zen4( cntx_t* cntx )
       10,
       // gemm
       BLIS_GEMM_UKR,       BLIS_FLOAT ,   bli_sgemm_skx_asm_32x12_l2,   FALSE,
-      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_skx_asm_16x14,      FALSE,
-      BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,        TRUE,
-      BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4,        TRUE,
+      BLIS_GEMM_UKR,       BLIS_DOUBLE,   bli_dgemm_zen4_asm_32x6,      FALSE,
+      BLIS_GEMM_UKR,       BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8,    TRUE,
+      /*bli_zgemm_zen4_asm_12x4 is a column preferred kernel*/
+      BLIS_GEMM_UKR,       BLIS_DCOMPLEX, bli_zgemm_zen4_asm_12x4,      FALSE,
 
-      BLIS_GEMM_AVX2_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,       TRUE,
-      BLIS_GEMM_AVX2_UKR,       BLIS_DOUBLE,   bli_dgemm_haswell_asm_6x8,        TRUE,
+      // Different GEMM kernels are used for TRSM on the zen4 architecture
+      BLIS_GEMM_FOR_TRSM_UKR,       BLIS_FLOAT,    bli_sgemm_haswell_asm_6x16,  TRUE,
+      BLIS_GEMM_FOR_TRSM_UKR,       BLIS_DOUBLE,   bli_dgemm_zen4_asm_8x24,     TRUE,
 
       // gemmtrsm_l
       BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT,    bli_sgemmtrsm_l_haswell_asm_6x16, TRUE,
-      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_zen_asm_16x14,  TRUE,
+      BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_l_zen4_asm_8x24,    TRUE,
       // gemmtrsm_u
       BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT,    bli_sgemmtrsm_u_haswell_asm_6x16, TRUE,
-      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_zen_asm_16x14,  TRUE,
+      BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE,   bli_dgemmtrsm_u_zen4_asm_8x24,    TRUE,
 
       cntx
     );
@@ -88,7 +103,9 @@ void bli_cntx_init_zen4( cntx_t* cntx )
     // Update the context with architecture specific threshold functions
     bli_cntx_set_l3_thresh_funcs
     (
-      2,
+      3,
+      // GEMM
+      BLIS_GEMM, bli_cntx_gemmsup_thresh_is_met_zen4,
       // GEMMT
       BLIS_GEMMT, bli_cntx_gemmtsup_thresh_is_met_zen,
       // SYRK
@@ -99,15 +116,18 @@ void bli_cntx_init_zen4( cntx_t* cntx )
     // packm kernels
     bli_cntx_set_packm_kers
     (
-      8,
+      11,
       BLIS_PACKM_6XK_KER,  BLIS_FLOAT,    bli_spackm_haswell_asm_6xk,
       BLIS_PACKM_16XK_KER, BLIS_FLOAT,    bli_spackm_haswell_asm_16xk,
       BLIS_PACKM_6XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_6xk,
-      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_haswell_asm_8xk,
+      BLIS_PACKM_8XK_KER,  BLIS_DOUBLE,   bli_dpackm_zen4_asm_8xk,
+      BLIS_PACKM_24XK_KER, BLIS_DOUBLE,   bli_dpackm_zen4_asm_24xk,
+      BLIS_PACKM_32XK_KER, BLIS_DOUBLE,   bli_dpackm_zen4_asm_32xk,
       BLIS_PACKM_3XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk,
       BLIS_PACKM_8XK_KER,  BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk,
       BLIS_PACKM_3XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk,
-      BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk,
+      BLIS_PACKM_12XK_KER, BLIS_DCOMPLEX, bli_zpackm_zen4_asm_12xk,
+      BLIS_PACKM_4XK_KER,  BLIS_DCOMPLEX, bli_zpackm_zen4_asm_4xk,
       cntx
     );
 
@@ -133,7 +153,7 @@ void bli_cntx_init_zen4( cntx_t* cntx )
     // Update the context with optimized level-1v kernels.
     bli_cntx_set_l1v_kers
     (
-      24,
+      28,
 
       // amaxv
       BLIS_AMAXV_KER,  BLIS_FLOAT,  bli_samaxv_zen_int_avx512,
@@ -146,24 +166,26 @@ void bli_cntx_init_zen4( cntx_t* cntx )
       BLIS_AXPBYV_KER, BLIS_DCOMPLEX, bli_zaxpbyv_zen_int,
 
       // axpyv
-      BLIS_AXPYV_KER,  BLIS_FLOAT,  bli_saxpyv_zen_int10,
-      BLIS_AXPYV_KER,  BLIS_DOUBLE, bli_daxpyv_zen_int10,
+      BLIS_AXPYV_KER,  BLIS_FLOAT,    bli_saxpyv_zen_int_avx512,
+      BLIS_AXPYV_KER,  BLIS_DOUBLE,   bli_daxpyv_zen_int_avx512,
       BLIS_AXPYV_KER,  BLIS_SCOMPLEX, bli_caxpyv_zen_int5,
       BLIS_AXPYV_KER,  BLIS_DCOMPLEX, bli_zaxpyv_zen_int5,
 
       // dotv
-      BLIS_DOTV_KER,   BLIS_FLOAT,  bli_sdotv_zen_int10,
-      BLIS_DOTV_KER,   BLIS_DOUBLE, bli_ddotv_zen_int10,
+      BLIS_DOTV_KER,   BLIS_FLOAT,    bli_sdotv_zen_int_avx512,
+      BLIS_DOTV_KER,   BLIS_DOUBLE,   bli_ddotv_zen_int_avx512,
       BLIS_DOTV_KER,   BLIS_SCOMPLEX, bli_cdotv_zen_int5,
       BLIS_DOTV_KER,   BLIS_DCOMPLEX, bli_zdotv_zen_int5,
 
       // dotxv
       BLIS_DOTXV_KER,  BLIS_FLOAT,  bli_sdotxv_zen_int,
       BLIS_DOTXV_KER,  BLIS_DOUBLE, bli_ddotxv_zen_int,
+      BLIS_DOTXV_KER,  BLIS_DCOMPLEX, bli_zdotxv_zen_int,
 
       // scalv
-      BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int10,
-      BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int10,
+      BLIS_SCALV_KER,  BLIS_FLOAT,  bli_sscalv_zen_int_avx512,
+      BLIS_SCALV_KER,  BLIS_DOUBLE, bli_dscalv_zen_int_avx512,
+      BLIS_SCALV_KER,  BLIS_DCOMPLEX, bli_zscalv_zen_int,
 
       //swap
       BLIS_SWAPV_KER, BLIS_FLOAT,   bli_sswapv_zen_int8,
@@ -172,10 +194,14 @@ void bli_cntx_init_zen4( cntx_t* cntx )
       //copy
       BLIS_COPYV_KER,  BLIS_FLOAT,  bli_scopyv_zen_int,
       BLIS_COPYV_KER,  BLIS_DOUBLE, bli_dcopyv_zen_int,
+      BLIS_COPYV_KER,  BLIS_DCOMPLEX, bli_zcopyv_zen_int,
 
       //set
       BLIS_SETV_KER,  BLIS_FLOAT,  bli_ssetv_zen_int,
       BLIS_SETV_KER,  BLIS_DOUBLE, bli_dsetv_zen_int,
+
+      // scal2v
+      BLIS_SCAL2V_KER,  BLIS_DCOMPLEX,  bli_zscal2v_zen_int,
       cntx
     );
 
@@ -183,8 +209,15 @@ void bli_cntx_init_zen4( cntx_t* cntx )
     //
     // These are reference block sizes and may be overridden based on
     // number of threads used at runtime.
- 
-    BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
+
+    if ( bli_init_model_query_id() == BLIS_MODEL_BERGAMO )
+    {
+        BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs);
+    }
+    else // BLIS_MODEL_DEFAULT choice, also currently used for BLIS_MODEL_GENOA and BLIS_MODEL_GENOA_X
+    {
+        BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs);
+    }
 
     // Update the context with the current architecture's register and cache
     // blocksizes (and multiples) for native execution.
@@ -205,8 +238,8 @@ void bli_cntx_init_zen4( cntx_t* cntx )
     // -------------------------------------------------------------------------
 
     // Initialize sup thresholds with architecture-appropriate values. s d c z
-    bli_blksz_init_easy( &thresh[ BLIS_MT ],   512,  256,   380,   110 );
-    bli_blksz_init_easy( &thresh[ BLIS_NT ],   200,  256,   256,   128 );
+    bli_blksz_init_easy( &thresh[ BLIS_MT ],   682,  1000,   380,   110 );
+    bli_blksz_init_easy( &thresh[ BLIS_NT ],   512,  1000,   256,   128 );
     bli_blksz_init_easy( &thresh[ BLIS_KT ],   240,  220,   220,   110 );
 
     // Initialize the context with the sup thresholds.
@@ -231,48 +264,49 @@ void bli_cntx_init_zen4( cntx_t* cntx )
     // Update the context with optimized small/unpacked gemm kernels.
     bli_cntx_set_l3_sup_kers
     (
-      28,
-      //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref,
-      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
-      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
-      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
-      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
-      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
-      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
-      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
-      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
-      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+      30,
+      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_zen4_asm_24x8m, FALSE,
+      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
+      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64m_avx512, TRUE,
+      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
+      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
+      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64m_avx512, TRUE,
+      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x64n_avx512, TRUE,
+      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
+      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x64n_avx512, TRUE,
       BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
       BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
       BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE,
       BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
       BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
       BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE,
-      BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-      BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
-      BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-      BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
-      BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+      BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
+      BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_cv_zen4_asm_12x4m, FALSE,
       cntx
     );
 
     // Initialize level-3 sup blocksize objects with architecture-specific
     // values.
     //                                           s      d      c      z
-    bli_blksz_init     ( &blkszs[ BLIS_MR ],    6,     6,     3,      3,
-                                                9,     9,     3,      3    );
-    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,    8,     8,      4    );
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ],    144,   72,    72,     36   );
-    bli_blksz_init_easy( &blkszs[ BLIS_KC ],    512,   256,   128,    64   );
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ],    8160,  4080,  2040,   1020 );
+    bli_blksz_init     ( &blkszs[ BLIS_MR ],    6,     24,    3,      12,
+                                                6,     9,     3,      12   );
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    64,    8,     8,      4    );
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ],    192,   144,   72,     48   );
+    bli_blksz_init_easy( &blkszs[ BLIS_KC ],    512,   480,   128,    64   );
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ],    8064,  4080,  2040,   1020 );
 
     // Update the context with the current architecture's register and cache
     // blocksizes for small/unpacked level-3 problems.
@@ -292,21 +326,21 @@ void bli_cntx_init_zen4( cntx_t* cntx )
  * Override the block sizes in the context to the block sizes used
  * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
  * GEMM kernels are AVX512 based and uses different block sizes.
- * 
+ *
  * This function should be called in TRSM path before performing
- * any packing operations. 
- * 
- * Also the context must be restored to default values by calling 
+ * any packing operations.
+ *
+ * Also the context must be restored to default values by calling
  * bli_zen4_restore_default_blkszs() before exiting TRSM Path
  */
 void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
 {
     blksz_t blkszs[ BLIS_NUM_BLKSZS ];
-    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,     16,     3,     3 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     14,     8,     4 );
-    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    240,   144,    72 );
+    bli_blksz_init_easy( &blkszs[ BLIS_MR ],     6,      8,     3,     3 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,     24,     8,     4 );
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ],   144,    120,   144,    72 );
     bli_blksz_init_easy( &blkszs[ BLIS_KC ],   256,    512,   256,   256 );
-    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,   4004,  4080,  4080 );
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ],  4080,   4008,  4080,  4080 );
 
 
     // Update the context with the current architecture's register and cache
@@ -324,21 +358,88 @@ void bli_zen4_override_trsm_blkszs (cntx_t* cntx)
     );
 }
 
+
+// Override the sup blocksizes and sup kernel function pointers in cntx
+// for gemmt/syrk. Since the output of syrk/gemmt is a triangular matrix,
+// a near-to-square shaped kernel performs better than a skewed/rectangular
+// shaped kernel, hence the blocksizes and kernel function pointers are
+// overridden with the avx2 specific ones below.
+void bli_zen4_override_gemmt_blkszs (cntx_t* cntx)
+{
+    blksz_t blkszs[ BLIS_NUM_BLKSZS ];
+
+    bli_blksz_init     ( &blkszs[ BLIS_MR ],    6,     6,     3,      3,
+                                                9,     9,     3,      3    );
+    bli_blksz_init_easy( &blkszs[ BLIS_NR ],    16,    8,     8,      4    );
+    bli_blksz_init_easy( &blkszs[ BLIS_MC ],    144,   72,    72,     36   );
+    bli_blksz_init_easy( &blkszs[ BLIS_KC ],    512,   256,   128,    64   );
+    bli_blksz_init_easy( &blkszs[ BLIS_NC ],    8160,  4080,  2040,   1020 );
+    // Update the context with the sup (small/unpacked) level-3 blocksizes.
+    // NOTE(review): NC is initialized above but is not passed below - confirm.
+    bli_cntx_set_l3_sup_blkszs
+    (
+      4, // number of (blocksize id, blksz_t*) pairs that follow
+      // level-3
+      BLIS_KC, &blkszs[ BLIS_KC ],
+      BLIS_MC, &blkszs[ BLIS_MC ],
+      BLIS_NR, &blkszs[ BLIS_NR ],
+      BLIS_MR, &blkszs[ BLIS_MR ],
+      cntx
+    );
+
+    bli_cntx_set_l3_sup_kers
+    (
+      24, // number of kernel entries that follow: 8 each for float, double, dcomplex
+      BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+      BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE,
+      BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+      BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+      BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE,
+      BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE,
+      BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+      BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE,
+      BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+      BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE,
+      BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+      BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+      BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE,
+      BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE,
+      BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+      BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE,
+      BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_RRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4m, TRUE,
+      BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+      BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE,
+      BLIS_CRC, BLIS_DCOMPLEX, bli_zgemmsup_rd_zen_asm_3x4n, TRUE,
+      BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+      BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE,
+      cntx
+    );
+
 /*
  * Restore the block sizes to default values needed for zen4 context.
  *
  * This function should be called to restore the block sizes to there
  * default values if they where overriden by calling
- * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the 
+ * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
  * TRSM path.
- * 
+ *
  */
 void bli_zen4_restore_default_blkszs (cntx_t* cntx)
 {
     blksz_t blkszs[ BLIS_NUM_BLKSZS ];
 
-    BLI_CNTX_DEFAULT_BLKSZ_LIST(blkszs);
-    
+    if ( bli_init_model_query_id() == BLIS_MODEL_BERGAMO )
+    {
+        BLI_CNTX_DEFAULT_BLKSZ_LIST_BERGAMO(blkszs);
+    }
+    else // BLIS_MODEL_DEFAULT choice, also currently used for BLIS_MODEL_GENOA and BLIS_MODEL_GENOA_X
+    {
+        BLI_CNTX_DEFAULT_BLKSZ_LIST_GENOA(blkszs);
+    }
+
     // Update the context with the current architecture's register and cache
     // blocksizes (and multiples) for native execution.
     bli_cntx_set_blkszs
diff --git a/config/zen4/bli_family_zen4.h b/config/zen4/bli_family_zen4.h
index b21d1582f7..a1666ea9d3 100644
--- a/config/zen4/bli_family_zen4.h
+++ b/config/zen4/bli_family_zen4.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -64,23 +64,25 @@
  * Override the block sizes in the context to the block sizes used
  * by AVX2 GEMM+TRSM kernels, this is needed in Zen4 context as default
  * GEMM kernels are AVX512 based and uses different block sizes.
- * 
+ *
  * This function should be called in TRSM path before performing
- * any packing operations. 
- * 
- * Also the context must be restored to default values by calling 
+ * any packing operations.
+ *
+ * Also the context must be restored to default values by calling
  * bli_zen4_restore_default_blkszs() before exiting TRSM Path
  */
 BLIS_EXPORT_BLIS void bli_zen4_override_trsm_blkszs (cntx_t* cntx);
 
+BLIS_EXPORT_BLIS void bli_zen4_override_gemmt_blkszs (cntx_t* cntx);
+
 /*
  * Restore the block sizes to default values needed for zen4 context.
  *
  * This function should be called to restore the block sizes to there
  * default values if they where overriden by calling
- * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the 
+ * bli_zen4_override_trsm_blkszs() to enable AVX2 GEMM kernels in the
  * TRSM path.
- * 
+ *
  */
 BLIS_EXPORT_BLIS void bli_zen4_restore_default_blkszs (cntx_t* cntx);
 
diff --git a/config/zen4/make_defs.mk b/config/zen4/make_defs.mk
index 062e680910..5a058e2fbc 100644
--- a/config/zen4/make_defs.mk
+++ b/config/zen4/make_defs.mk
@@ -4,7 +4,7 @@
 #  An object-based framework for developing high-performance BLAS-like
 #  libraries.
 #
-#  Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -41,6 +41,11 @@
 THIS_CONFIG    := zen4
 #CONFIGS_INCL   += $(THIS_CONFIG)
 
+# Include file containing common flags for all AMD architectures
+AMD_CONFIG_FILE := amd_config.mk
+AMD_CONFIG_PATH := $(BASE_SHARE_PATH)/config/zen
+-include $(AMD_CONFIG_PATH)/$(AMD_CONFIG_FILE)
+
 #
 # --- Determine the C compiler and related flags ---
 #
@@ -55,105 +60,105 @@ CPICFLAGS      :=
 CWARNFLAGS     :=
 
 ifneq ($(DEBUG_TYPE),off)
-CDBGFLAGS      := -g
+  CDBGFLAGS    := -g
 endif
 
 ifeq ($(DEBUG_TYPE),noopt)
-COPTFLAGS      := -O0
+  COPTFLAGS    := -O0
 else
-COPTFLAGS      := -O3
+  COPTFLAGS    := -O3
 endif
 
 # Flags specific to optimized kernels.
 # NOTE: The -fomit-frame-pointer option is needed for some kernels because
 # they make explicit use of the rbp register.
 CKOPTFLAGS     := $(COPTFLAGS) -fomit-frame-pointer
-ifeq ($(CC_VENDOR),gcc)
-GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
-
 
-# gcc 11.0 or later:
-ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
-# Update CKOPTFLAGS for gcc 11+ to use O3 optimization without
-# -ftree-partial-pre flag. This flag results in suboptimal code
-# generation for instrinsics based kernels.
-ifneq ($(DEBUG_TYPE),noopt)
-CKOPTFLAGS     := -O2 -fgcse-after-reload -fipa-cp-clone -floop-interchange -floop-unroll-and-jam -fpeel-loops -fpredictive-commoning -fsplit-loops -fsplit-paths -ftree-loop-distribution -funswitch-loops -fvect-cost-model=dynamic -fversion-loops-for-strides -fomit-frame-pointer
-endif
+# gcc or clang version must be at least 4.0
+ifeq ($(CC_VENDOR),gcc)
+  GCC_VERSION := $(strip $(shell $(CC) -dumpversion | cut -d. -f1))
+
+  ifeq ($(shell test $(GCC_VERSION) -ge 13; echo $$?),0)
+    # gcc 13.0 or later
+    CKVECFLAGS += -march=znver4
+    CRVECFLAGS += -march=znver4
+    # Update CKOPTFLAGS for gcc to use O3 optimization without the
+    # -ftree-pre and -ftree-partial-pre flags. These flags result
+    # in suboptimal code generation for intrinsic-based kernels.
+    # The -ftree-loop-vectorize flag results in inefficient code gen
+    # for AMD-optimized L1 kernels based on intrinsics.
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
+  else ifeq ($(shell test $(GCC_VERSION) -ge 11; echo $$?),0)
+    # gcc 11.0 or later
+    CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16
+    CRVECFLAGS += -march=znver3
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
+  else ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
+    # gcc 9.0 or later
+    CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni
+    CRVECFLAGS += -march=znver2
+    CKOPTFLAGS += -fno-tree-partial-pre -fno-tree-pre -fno-tree-loop-vectorize
+  else ifeq ($(shell test $(GCC_VERSION) -ge 8; echo $$?),0)
+    # gcc 8.0 or later
+    CKVECFLAGS += -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni
+    CRVECFLAGS += -march=znver1
+  else
+    # If gcc is older than 8.0.0 but at least 6.1.0, then we can use -march=znver1
+    # as the fallback option.
+    CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+    CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
+  endif
+endif # gcc
 
-CKVECFLAGS     +=  -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -mfpmath=sse
-CRVECFLAGS     +=  -march=znver3
-else
-# gcc 9.0 or later:
-ifeq ($(shell test $(GCC_VERSION) -ge 9; echo $$?),0)
-CKVECFLAGS     +=  -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
-CRVECFLAGS     +=  -march=znver2
-else
-ifeq ($(shell test $(GCC_VERSION) -ge 8; echo $$?),0)
-CKVECFLAGS     +=  -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
-CRVECFLAGS     +=  -march=znver1
-else
-# If gcc is older than 8.0.0 but at least 6.1.0, then we can use -march=znver1
-# as the fallback option.
-CKVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-CRVECFLAGS += -march=znver1 -mno-avx256-split-unaligned-store
-endif # GCC 8
-endif # GCC 9
-endif # GCC 11
-else
 ifeq ($(CC_VENDOR),clang)
-
-# AOCC clang has various formats for the version line
-
-# AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
-# AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
-# AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
-# AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
-# AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
-# AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0)
-
-# For our prupose we just want to know if it version 2x or 3x or 4x
-
-# for version 4x we will enable znver4
-ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1)
-CKVECFLAGS += -march=znver4 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512bf16 -mfpmath=sse
-CRVECFLAGS += -march=znver4
-else
-# for version 3x we will enable znver3
-ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
-CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -mfpmath=sse
-CRVECFLAGS += -march=znver3
-else
-# for version 2x we will enable znver2
-ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
-CKVECFLAGS += -march=znver2  -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mfpmath=sse
-CRVECFLAGS += -march=znver2
-else
-#if compiling with clang
-VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
-CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
-#clang 9.0 or later:
-ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
-CKVECFLAGS += -march=znver2
-CRVECFLAGS += -march=znver2
-else
-CKVECFLAGS += -march=znver1
-CRVECFLAGS += -march=znver1
-endif # ge 9
-endif # aocc 2
-endif # aocc 3
-endif # aocc 4
+  # AOCC clang has various formats for the version line
+
+  # AOCC.LLVM.2.0.0.B191.2019_07_19 clang version 8.0.0 (CLANG: Jenkins AOCC_2_0_0-Build#191) (based on LLVM AOCC.LLVM.2.0.0.B191.2019_07_19)
+  # AOCC.LLVM.2.1.0.B1030.2019_11_12 clang version 9.0.0 (CLANG: Build#1030) (based on LLVM AOCC.LLVM.2.1.0.B1030.2019_11_12)
+  # AMD clang version 10.0.0 (CLANG: AOCC_2.2.0-Build#93 2020_06_25) (based on LLVM Mirror.Version.10.0.0)
+  # AMD clang version 11.0.0 (CLANG: AOCC_2.3.0-Build#85 2020_11_10) (based on LLVM Mirror.Version.11.0.0)
+  # AMD clang version 12.0.0 (CLANG: AOCC_3.0.0-Build#2 2020_11_05) (based on LLVM Mirror.Version.12.0.0)
+  # AMD clang version 14.0.0 (CLANG: AOCC_4.0.0-Build#98 2022_06_15) (based on LLVM Mirror.Version.14.0.0)
+
+  # For our purpose we just want to know if it is version 2x, 3x or 4x
+
+  # But also set these in case we are using upstream LLVM clang
+  VENDOR_STRING := $(strip $(shell ${CC_VENDOR} --version | egrep -o '[0-9]+\.[0-9]+\.?[0-9]*'))
+  CC_MAJOR := $(shell (echo ${VENDOR_STRING} | cut -d. -f1))
+
+  ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_4')),1)
+    # AOCC version 4x we will enable znver4
+    CKVECFLAGS += -march=znver4 -falign-loops=64
+    CRVECFLAGS += -march=znver4
+  else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC_3')),1)
+    # AOCC version 3x we will enable znver3
+    CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64
+    CRVECFLAGS += -march=znver3
+  else ifeq ($(strip $(shell $(CC) -v |&head -1 |grep -c 'AOCC.LLVM.2\|AOCC_2')),1)
+    # AOCC version 2x we will enable znver2
+    CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni
+    CRVECFLAGS += -march=znver2
+  else ifeq ($(shell test $(CC_MAJOR) -ge 16; echo $$?),0)
+    # LLVM clang 16.0 or later
+    CKVECFLAGS += -march=znver4 -falign-loops=64
+    CRVECFLAGS += -march=znver4
+  else ifeq ($(shell test $(CC_MAJOR) -ge 13; echo $$?),0)
+    # LLVM clang 13.0 or later
+    CKVECFLAGS += -march=znver3 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64
+    CRVECFLAGS += -march=znver3
+  else ifeq ($(shell test $(CC_MAJOR) -ge 9; echo $$?),0)
+    # LLVM clang 9.0 or later
+    CKVECFLAGS += -march=znver2 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -mavx512bf16 -falign-loops=64
+    CRVECFLAGS += -march=znver2
+  else
+    CKVECFLAGS += -march=znver1 -mavx512f -mavx512dq -mavx512bw -mavx512vl -mavx512vnni -falign-loops=64
+    CRVECFLAGS += -march=znver1
+  endif
 endif # clang
-endif # gcc
 
 # Flags specific to reference kernels.
 CROPTFLAGS     := $(CKOPTFLAGS)
-
-# Flags specific to reference kernels.
-# Note: We use AVX2 for reference kernels because, as Jeff Hammond says,
-# reference kernel code "is not going to achieve high enough SIMD utilization
-# to overcome the AVX-512 frequency drop". (Issue #187)
-CRVECFLAGS     += -mno-avx512f -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -funsafe-math-optimizations -ffp-contract=fast
+CRVECFLAGS     := $(CKVECFLAGS)
 
 # Store all of the variables here to new variables containing the
 # configuration name.
diff --git a/configure b/configure
index 73dc8cc358..a165c1ad51 100755
--- a/configure
+++ b/configure
@@ -5,7 +5,7 @@
 #  libraries.
 #
 #  Copyright (C) 2014, The University of Texas at Austin
-#  Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+#  Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions are
@@ -215,7 +215,7 @@ print_usage()
 	echo " "
 	echo "                 Set the size (in bits) of internal BLIS integers and"
 	echo "                 integer types used in native BLIS interfaces. The"
-	echo "                 default inteter type size is architecture dependent."
+	echo "                 default integer type size is architecture dependent."
 	echo "                 (Hint: You can always find this value printed at the"
 	echo "                 beginning of the testsuite output.)"
 	echo " "
@@ -355,17 +355,22 @@ print_usage()
 	echo " "
 	echo "   --enable-blis-arch-type, --disable-blis-arch-type"
 	echo " "
-	echo "                 Disable (Enabled by default) support for BLIS_ARCH_TYPE"
-	echo "                 environment variable, which allows user to select"
-	echo "                 architecture-specific code path at runtime."
+	echo "                 Disable (Enabled by default) support for BLIS_ARCH_TYPE and BLIS_MODEL_TYPE"
+	echo "                 environment variables, which allows user to select"
+	echo "                 architecture specific code path and optimizations at runtime."
 	echo "                 If disabled, in builds with multiple code paths, BLIS"
-	echo "                 will still select path automatically."
+	echo "                 will still select path and optimizations automatically."
 	echo " "
 	echo "   --rename-blis-arch-type=STRING"
 	echo " "
-	echo "                 Change environment variable used to select architecture-specific"
+	echo "                 Change environment variable used to select architecture specific"
 	echo "                 code path from BLIS_ARCH_TYPE to STRING"
 	echo " "
+	echo "   --rename-blis-model-type=STRING"
+	echo " "
+	echo "                 Change environment variable used to select architecture model specific"
+	echo "                 optimizations from BLIS_MODEL_TYPE to STRING"
+	echo " "
 	echo "   -q, --quiet   Suppress informational output. By default, configure"
 	echo "                 is verbose. (NOTE: -q is not yet implemented)"
 	echo " "
@@ -1163,6 +1168,7 @@ auto_detect()
 	cmd="${cc} ${config_defines} \
 	      -DBLIS_CONFIGURETIME_CPUID \
 	      -D__blis_arch_type_name=${double_quote_open}${rename_blis_arch_type}${double_quote_close} \
+	      -D__blis_model_type_name=${double_quote_open}${rename_blis_model_type}${double_quote_close} \
 	      ${c_hdr_paths} \
 	      -std=c99 -D_GNU_SOURCE \
 	      ${cflags} \
@@ -2043,6 +2049,7 @@ main()
 	complex_return='default'
 	disable_blis_arch_type='no'
 	rename_blis_arch_type='BLIS_ARCH_TYPE'
+	rename_blis_model_type='BLIS_MODEL_TYPE'
 
 	# The addon flag and names.
 	addon_flag=''
@@ -2281,6 +2288,9 @@ main()
 						rename-blis-arch-type=*)
 							rename_blis_arch_type=${OPTARG#*=}
 							;;
+						rename-blis-model-type=*)
+							rename_blis_model_type=${OPTARG#*=}
+							;;
 						*)
 							print_usage
 							;;
@@ -3076,13 +3086,6 @@ main()
 		echo "${script_name}: compiler appears to not support #pragma omp simd."
 		enable_pragma_omp_simd_01=0
 	fi
-	if [ "x${enable_blas}" = "xyes" ]; then
-		echo "${script_name}: the BLAS compatibility layer is enabled."
-		enable_blas_01=1
-	else
-		echo "${script_name}: the BLAS compatibility layer is disabled."
-		enable_blas_01=0
-	fi
 	if [ "x${enable_cblas}" = "xyes" ]; then
 		echo "${script_name}: the CBLAS compatibility layer is enabled."
 		enable_cblas_01=1
@@ -3092,6 +3095,13 @@ main()
 		echo "${script_name}: the CBLAS compatibility layer is disabled."
 		enable_cblas_01=0
 	fi
+	if [ "x${enable_blas}" = "xyes" ]; then
+		echo "${script_name}: the BLAS compatibility layer is enabled."
+		enable_blas_01=1
+	else
+		echo "${script_name}: the BLAS compatibility layer is disabled."
+		enable_blas_01=0
+	fi
 	if [ "x${enable_mixed_dt}" = "xyes" ]; then
 		echo "${script_name}: mixed datatype support is enabled."
 
@@ -3257,7 +3267,7 @@ main()
 	fi
 
 	if [ "x${disable_blis_arch_type}" = "xyes"  ]; then
-		echo "${script_name}: user selection of code path using BLIS_ARCH_TYPE env var is disabled."
+		echo "${script_name}: user selection of code path using BLIS_ARCH_TYPE and BLIS_MODEL_TYPE env vars is disabled."
 		disable_blis_arch_type_01='1'
 	else
 		disable_blis_arch_type_01='0'
@@ -3267,6 +3277,10 @@ main()
 	if [ "x${rename_blis_arch_type}" != "xBLIS_ARCH_TYPE" ]; then
 		echo "${script_name}: configuring with BLIS_ARCH_TYPE env var renamed to '${rename_blis_arch_type}'."
 	fi
+	# Check if the user requested a custom env var name to replace BLIS_MODEL_TYPE.
+	if [ "x${rename_blis_model_type}" != "xBLIS_MODEL_TYPE" ]; then
+		echo "${script_name}: configuring with BLIS_MODEL_TYPE env var renamed to '${rename_blis_model_type}'."
+	fi
 
 	echo "${script_name}: configuring complex return type as \"${complex_return}\"."
 
@@ -3482,6 +3496,7 @@ main()
 		| sed   -e "s/@complex_return_intel@/${complex_return_intel01}/g" \
 		| sed   -e "s/@disable_blis_arch_type@/${disable_blis_arch_type_01}/g" \
 		| sed   -e "s/@rename_blis_arch_type@/${rename_blis_arch_type}/g" \
+		| sed   -e "s/@rename_blis_model_type@/${rename_blis_model_type}/g" \
 		> "${bli_config_h_out_path}"
 
 	# -- Instantiate bli_addon.h file from template ----------------------------
diff --git a/docs/BLISTypedAPI.md b/docs/BLISTypedAPI.md
index 7d6e92edac..e495aa00a8 100644
--- a/docs/BLISTypedAPI.md
+++ b/docs/BLISTypedAPI.md
@@ -1891,7 +1891,7 @@ Possible microkernel types (ie: the return values for `bli_info_get_*_ukr_impl_s
 
 ### Operation implementation type query
 
-The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implemenation query](BLISTypedAPI.md#microkernel-implementation-type-query).
+The following routines allow the caller to obtain a string that identifies the implementation (`ind_t`) that is currently active (ie: implemented and enabled) for each level-3 operation. Possible implementation types are listed in the section above covering [microkernel implementation query](BLISTypedAPI.md#microkernel-implementation-type-query).
 ```c
 char* bli_info_get_gemm_impl_string( num_t dt );
 char* bli_info_get_hemm_impl_string( num_t dt );
diff --git a/docs/Doxyfile b/docs/Doxyfile
new file mode 100644
index 0000000000..36ae286238
--- /dev/null
+++ b/docs/Doxyfile
@@ -0,0 +1,2842 @@
+# Doxyfile 1.9.6
+
+# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+#
+# Note:
+#
+# Use doxygen to compare the used configuration file with the template
+# configuration file:
+# doxygen -x [configFile]
+# Use doxygen to compare the used configuration file with the template
+# configuration file without replacing the environment variables or CMake type
+# replacement variables:
+# doxygen -x_noenv [configFile]
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = AOCL-BLIS
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           = ./styling/AMD_Logo.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = ./
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096
+# sub-directories (in 2 levels) under the output directory of each output format
+# and will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to
+# control the number of sub-directories.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# Controls the number of sub-directories that will be created when
+# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every
+# level increment doubles the number of directories, resulting in 4096
+# directories at level 8 which is the default and also the maximum value. The
+# sub-directories are organized in 2 levels, the first level always has a fixed
+# number of 16 directories.
+# Minimum value: 0, maximum value: 8, default value: 8.
+# This tag requires that the tag CREATE_SUBDIRS is set to YES.
+
+CREATE_SUBDIRS_LEVEL   = 8
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian,
+# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English
+# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek,
+# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with
+# English messages), Korean, Korean-en (Korean with English messages), Latvian,
+# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese,
+# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish,
+# Swedish, Turkish, Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity): The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:^^"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note: See also the list of default file extension mappings.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous
+# namespaces are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# will also hide undocumented C++ concepts if enabled. This option has no effect
+# if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# Possible values are: SYSTEM, NO and YES.
+# The default value is: SYSTEM.
+
+CASE_SENSE_NAMES       = SYSTEM
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE        = YES
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about
+# undocumented enumeration values. If set to NO, doxygen will accept
+# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: NO.
+
+WARN_IF_UNDOC_ENUM_VAL = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# See also: WARN_LINE_FORMAT
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# In the $text part of the WARN_FORMAT command it is possible that a reference
+# to a more specific place is given. To make it easier to jump to this place
+# (outside of doxygen) the user can define a custom "cut" / "paste" string.
+# Example:
+# WARN_LINE_FORMAT = "'vi $file +$line'"
+# See also: WARN_FORMAT
+# The default value is: at line $line of file $file.
+
+WARN_LINE_FORMAT       = "at line $line of file $file"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When - is specified
+# as the file name, the warning and error messages are written to standard
+# output (stdout).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = ../
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# See also: INPUT_FILE_ENCODING
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# The INPUT_FILE_ENCODING tag can be used to specify the character encoding of
+# the source files that doxygen parses on a per file pattern basis. Doxygen will
+# compare the file name with each pattern and apply the encoding (instead of the
+# default INPUT_ENCODING) if there is a match. The character encodings are a
+# list of the form: pattern=encoding (like *.php=ISO-8859-1). See INPUT_ENCODING
+# for further information on supported encodings.
+
+INPUT_FILE_ENCODING    =
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.l \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f18 \
+                         *.f \
+                         *.for \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.ice
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = ../addon \
+                         ../aocl_dtl \
+                         ../bench \
+                         ../blastest \
+                         ../build \
+                         ../config \
+                         ../examples \
+                         ../include \
+			 ../gtestsuite \
+                         ../kernels \
+                         ../lib \
+                         ../mpi_test \
+                         ../ref_kernels \
+                         ../sandbox \
+                         ../test \
+                         ../testsuite \
+                         ../travis \
+                         ../vendor \
+                         ../windows \
+                         ../frame/0 \
+                         ../frame/1 \
+                         ../frame/1d \
+                         ../frame/1f \
+                         ../frame/1m \
+                         ../frame/2 \
+                         ../frame/3 \
+                         ../frame/base \
+                         ../frame/include \
+                         ../frame/ind \
+                         ../frame/thread \
+                         ../frame/util \
+                         ../bli_addon.h \
+                         ../bli_config.h \
+                         ../configure \
+                         ../CONTRIBUTING.md \
+                         ../INSTALL \
+                         ../LICENSE \
+                         ../Makefile \
+                         ../README.md \
+                         ../RELEASING \
+                         ../docs/Addons.md \
+                         ../docs/BLISObjectAPI.md \
+                         ../docs/BLISTypedAPI.md \
+                         ../docs/BuildSystem.md \
+                         ../docs/CodingConventions.md \
+                         ../docs/ConfigurationHowTo.md \
+                         ../docs/Doxyfile \
+                         ../docs/FAQ.md \
+                         ../docs/HardwareSupport.md \
+                         ../docs/KernelsHowTo.md \
+                         ../docs/MixedDatatypes.md \
+                         ../docs/Multithreading.md \
+                         ../docs/Performance.md \
+                         ../docs/PerformanceSmall.md \
+                         ../docs/ReleaseNotes.md \
+                         ../docs/Sandboxes.md \
+                         ../docs/Testsuite.md
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against symbol names, not against file
+# paths; to exclude files or directories (for example all test directories) use
+# the EXCLUDE or EXCLUDE_PATTERNS tags instead.
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that doxygen will use the data processed and written to standard output
+# for further processing, therefore nothing else, like debug statements or used
+# commands (so in case of a Windows batch file always use @echo OFF), should be
+# written to standard output.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+# The Fortran standard specifies that for fixed formatted Fortran code all
+# characters from position 72 are to be considered as comment. A common
+# extension is to allow longer lines before the automatic comment starts. The
+# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can
+# be processed before the automatic comment starts.
+# Minimum value: 7, maximum value: 10000, default value: 72.
+
+FORTRAN_COMMENT_AFTER  = 72
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen's built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see:
+# http://clang.llvm.org/) for more accurate parsing at the cost of reduced
+# performance. This can be particularly helpful with template rich C++ code for
+# which doxygen's built-in parser lacks the necessary type information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = NO
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS
+# tag is set to YES then doxygen will add the directory of each input to the
+# include path.
+# The default value is: YES.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_ADD_INC_PATHS    = YES
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          =
+
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the directory containing a file called compile_commands.json. This
+# file is the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the
+# options used when the source files were built. This is equivalent to
+# specifying the -p option to a clang tool, such as clang-check. These options
+# will then be passed to the parser. Any options specified with CLANG_OPTIONS
+# will be added as well.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes)
+# that should be ignored while generating the index headers. The IGNORE_PREFIX
+# tag works for classes, function and member names. The entity will be placed in
+# the alphabetical list under the first letter of the entity name that remains
+# after removing the prefix.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output.
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g. the
+# setting GENERATE_TREEVIEW). It is highly recommended to start with a default
+# header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            = ./styling/header.html
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            = ./styling/footer.html
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# Note: Since the styling of scrollbars can currently not be overruled in
+# Webkit/Chromium, the styling will be left out of the default doxygen.css if
+# one or more extra stylesheets have been specified. So if scrollbar
+# customization is desired it has to be added explicitly. For an example see the
+# documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = ./styling/doxygen-awesome.css
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       = ./styling/AMD_Logo.png \
+                         ./styling/doxygen-fragment-copy-button.js \
+                         ./styling/doxygen-interactive-toc.js
+
+# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output
+# should be rendered with a dark or light theme.
+# Possible values are: LIGHT always generate light mode output, DARK always
+# generate dark mode output, AUTO_LIGHT automatically set the mode according to
+# the user preference, use light mode if no preference is set (the default),
+# AUTO_DARK automatically set the mode according to the user preference, use
+# dark mode if no preference is set and TOGGLE allow the user to switch between
+# light and dark mode via a button.
+# The default value is: AUTO_LIGHT.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE        = AUTO_LIGHT
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See
+# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL         =
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# among other things the download links, offline (the HTML help workshop was
+# already many years in maintenance mode). You can download the HTML help
+# workshop installation executable from the web archives (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/download/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see Qt Help Project /
+# Virtual Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR           = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS       = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = YES
+
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION        = MathJax_2
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility. This is the name for MathJax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for MathJax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        =
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using JavaScript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id to
+# a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex; when enabling
+# USE_PDFLATEX the default is pdflatex, and when in the latter case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default may have been set differently; this depends on the implementation of
+# the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         =
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
+#
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to Sqlite3 output
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of
+# RECURSIVE has no effect here.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = BLIS_ENABLE_CBLAS \
+                         OF(x)=x \
+			 BLIS_EXPORT_BLAS
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of
+# subgraphs. When you want a differently looking font in the dot files that
+# doxygen generates you can specify fontname, fontcolor and fontsize attributes.
+# For details please see <a href=https://graphviz.org/doc/info/attrs.html>Node,
+# Edge and Graph Attributes specification</a> You need to make sure dot is able
+# to find the font, which can be done by putting it in a standard location or by
+# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font. Default graphviz fontsize is 14.
+# The default value is: fontname=Helvetica,fontsize=10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_COMMON_ATTR        = "fontname=Helvetica,fontsize=10"
+
+# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can
+# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. <a
+# href=https://graphviz.org/doc/info/arrows.html>Complete documentation about
+# arrows shapes.</a>
+# The default value is: labelfontname=Helvetica,labelfontsize=10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_EDGE_ATTR          = "labelfontname=Helvetica,labelfontsize=10"
+
+# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes
+# around nodes set 'shape=plain' or 'shape=plaintext' <a
+# href=https://www.graphviz.org/doc/info/shapes.html>Shapes specification</a>
+# The default value is: shape=box,height=0.2,width=0.4.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NODE_ATTR          = "shape=box,height=0.2,width=0.4"
+
+# You can set the path where dot can find font specified with fontname in
+# DOT_COMMON_ATTR and other dot attributes.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies. See also the chapter Grouping
+# in the manual.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH    = 1
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
+# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
+# The default value is: YES.
+
+DOT_CLEANUP            = YES
diff --git a/docs/Main_Page.md b/docs/Main_Page.md
new file mode 100644
index 0000000000..39c2e12c85
--- /dev/null
+++ b/docs/Main_Page.md
@@ -0,0 +1,138 @@
+@mainpage
+# Welcome to AOCL-BLIS
+
+---
+
+## Table of Contents
+    * [Introduction](#Introduction)
+    * [Build and Installation](#Build)
+    * [Examples](#Example)
+    * [Contact Us](#Contact)
+
+
+<div id="Introduction" name="Introduction"></div>
+
+## Introduction
+
+<b>AOCL BLIS</b> is a portable software framework for instantiating high-performance BLAS-like dense linear algebra libraries. The framework was designed to isolate essential kernels of computation that, when optimized, immediately enable optimized implementations of most of its commonly used and computationally intensive operations. BLIS is written in ISO C99 and available under a new/modified/3-clause BSD license. While BLIS exports a new BLAS-like API, it also includes a BLAS compatibility layer which gives application developers access to BLIS implementations via traditional BLAS routine calls. An object-based API unique to BLIS is also available.
+
+How to Download BLIS
+--------------------
+
+There are a few ways to download BLIS. We list the four most common ways below.
+We **highly recommend** using either Option 1 or 2. Otherwise, we recommend
+Option 3 (over Option 4) so your compiler can perform optimizations specific
+to your hardware.
+
+1. **Download a source repository with `git clone`.**
+Generally speaking, we prefer using `git clone` to clone a `git` repository.
+Having a repository allows the user to periodically pull in the latest changes
+and quickly rebuild BLIS whenever they wish. Also, implicit in cloning a
+repository is that the repository defaults to using the `master` branch, which
+contains the latest "stable" commits since the most recent release. (This is
+in contrast to Option 3 in which the user is opting for code that may be
+slightly out of date.)
+
+   In order to clone a `git` repository of BLIS, please obtain a repository
+URL by clicking on the green button above the file/directory listing near the
+top of this page (as rendered by GitHub). Generally speaking, it will amount
+to executing the following command in your terminal shell:
+   ```
+   git clone https://github.com/amd/blis.git
+   ```
+
+2. **Download a source repository via a zip file.**
+If you are uncomfortable with using `git` but would still like the latest
+stable commits, we recommend that you download BLIS as a zip file.
+
+   In order to download a zip file of the BLIS source distribution, please
+click on the green button above the file listing near the top of this page.
+This should reveal a link for downloading the zip file.
+
+3. **Download a source release via a tarball/zip file.**
+Alternatively, if you would like to stick to the code that is included in
+official releases, you may download either a tarball or zip file of any of
+BLIS's previous [tagged releases](https://github.com/amd/blis/releases).
+We consider this option to be less than ideal for most people since it will
+likely mean you miss out on the latest bugfix or feature commits (in contrast
+to Options 1 or 2), and you also will not be able to update your code with a
+simple `git pull` command (in contrast to Option 1).
+
+4. **Download a binary package specific to your OS.**
+While we don't recommend this as the first choice for most users, we provide
+links to community members who generously maintain BLIS packages for various
+Linux distributions such as Debian Unstable and EPEL/Fedora. Please see the
+[External Packages](https://github.com/flame/blis#external-packages) section of the upstream BLIS README for more information.
+
+Getting Started {#Build}
+---------------
+
+*NOTE: This section assumes you've either cloned a BLIS source code repository
+via `git`, downloaded the latest source code via a zip file, or downloaded the
+source code for a tagged version release---Options 1, 2, or 3, respectively,
+as discussed in [the previous section](#how-to-download-blis).*
+
+If you just want to build a sequential (not parallelized) version of BLIS
+in a hurry and come back and explore other topics later, you can configure
+and build BLIS as follows:
+```
+$ ./configure auto
+$ make [-j]
+```
+You can then verify your build by running BLAS- and BLIS-specific test
+drivers via `make check`:
+```
+$ make check [-j]
+```
+And if you would like to install BLIS to the directory specified to `configure`
+via the `--prefix` option, run the `install` target:
+```
+$ make install
+```
+Please read the output of `./configure --help` for a full list of configure-time
+options.
+If/when you have time, we *strongly* encourage you to read the detailed
+walkthrough of the build system found in our [Build System](docs/BuildSystem.md)
+guide.
+
+Example Code {#Example}
+------------
+
+The BLIS source distribution provides example code in the `examples` directory.
+Example code focuses on using BLIS APIs (not BLAS or CBLAS), and resides in
+two subdirectories: [examples/oapi](examples/oapi) (which demonstrates the
+[object API](docs/BLISObjectAPI.md)) and [examples/tapi](examples/tapi) (which
+demonstrates the [typed API](docs/BLISTypedAPI.md)).
+
+Either directory contains several files, each containing various pieces of
+code that exercise core functionality of the BLIS API in question (object or
+typed). These example files should be thought of collectively like a tutorial,
+and therefore it is recommended to start from the beginning (the file that
+starts in `00`).
+
+You can build all of the examples by simply running `make` from either example
+subdirectory (`examples/oapi` or `examples/tapi`). (You can also run
+`make clean`.) The local `Makefile` assumes that you've already configured and
+built (but not necessarily installed) BLIS two directories up, in `../..`. If
+you have already installed BLIS to some permanent directory, you may refer to
+that installation by setting the environment variable `BLIS_INSTALL_PATH` prior
+to running make:
+```
+export BLIS_INSTALL_PATH=/usr/local; make
+```
+or by setting the same variable as part of the make command:
+```
+make BLIS_INSTALL_PATH=/usr/local
+```
+**Once the executable files have been built, we recommend reading the code and
+the corresponding executable output side by side. This will help you see the
+effects of each section of code.**
+
+This tutorial is not exhaustive or complete; several object API functions were
+omitted (mostly for brevity's sake) and thus more examples could be written.
+
+<div id = "Contact"></div>
+
+## Contact Us
+
+AOCL BLIS is developed and maintained by AMD. You can contact us by email at <b>[aoclsupport@amd.com](mailto:aoclsupport@amd.com)</b>.
diff --git a/docs/styling/AMD_Logo.png b/docs/styling/AMD_Logo.png
new file mode 100644
index 0000000000..099dd266cf
Binary files /dev/null and b/docs/styling/AMD_Logo.png differ
diff --git a/docs/styling/doxygen-awesome.css b/docs/styling/doxygen-awesome.css
new file mode 100644
index 0000000000..21ca497d34
--- /dev/null
+++ b/docs/styling/doxygen-awesome.css
@@ -0,0 +1,1504 @@
+/**
+
+Doxygen Awesome
+https://github.com/jothepro/doxygen-awesome-css
+
+MIT License
+
+Copyright (c) 2021 jothepro
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+html {
+    /* primary theme color. This will affect the entire website's color scheme: links, arrows, labels, ... */
+    --primary-color: #1779c4;
+    --primary-dark-color: #00559f;
+    --primary-light-color: #7aabd6;
+    --primary-lighter-color: #cae1f1;
+    --primary-lightest-color: #e9f1f8;
+
+    /* page base colors */
+    --page-background-color: white;
+    --page-foreground-color: #2c3e50;
+    --page-secondary-foreground-color: #67727e;
+
+    /* color for all separators on the website: hr, borders, ... */
+    --separator-color: #dedede;
+
+    /* border radius for all rounded components. Will affect many components, like dropdowns, memitems, codeblocks, ... */
+    --border-radius-large: 8px;
+    --border-radius-small: 4px;
+    --border-radius-medium: 6px;
+
+    /* default spacings. Most components reference these values for spacing, to provide uniform spacing on the page. */
+    --spacing-small: 5px;
+    --spacing-medium: 10px;
+    --spacing-large: 16px;
+
+    /* default box shadow used for raising an element above the normal content. Used in dropdowns, Searchresult, ... */
+    --box-shadow: 0 2px 10px 0 rgba(0,0,0,.1);
+
+    --odd-color: rgba(0,0,0,.03);
+
+    /* font-families. will affect all text on the website
+     * font-family: the normal font for text, headlines, menus
+     * font-family-monospace: used for preformatted text in memtitle, code, fragments
+     */
+    --font-family: -apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen,Ubuntu,Cantarell,Fira Sans,Droid Sans,Helvetica Neue,sans-serif;
+    --font-family-monospace: source-code-pro,Menlo,Monaco,Consolas,Courier New,monospace;
+
+    /* font sizes */
+    --page-font-size: 15.6px;
+    --navigation-font-size: 14.4px;
+    --code-font-size: 14.4px; /* affects code, fragment */
+    --title-font-size: 22px;
+
+    /* content text properties. These only affect the page content, not the navigation or any other ui elements */
+    --content-line-height: 27px;
+    /* The content is centered and constrained in its width. To make the content fill the whole page, set the variable to auto. */
+    --content-maxwidth: 1000px;
+
+    /* colors for various content boxes: @warning, @note, @deprecated @bug */
+    --warning-color: #fca49b;
+    --warning-color-dark: #b61825;
+    --warning-color-darker: #75070f;
+    --note-color: rgba(255,229,100,.3);
+    --note-color-dark: #c39900;
+    --note-color-darker: #8d7400;
+    --deprecated-color: rgb(214, 216, 224);
+    --deprecated-color-dark: #5b6269;
+    --deprecated-color-darker: #43454a;
+    --bug-color: rgb(246, 208, 178);
+    --bug-color-dark: #a53a00;
+    --bug-color-darker: #5b1d00;
+    --invariant-color: #b7f8d0;
+    --invariant-color-dark: #00ba44;
+    --invariant-color-darker: #008622;
+
+    /* blockquote colors */
+    --blockquote-background: #f5f5f5;
+    --blockquote-foreground: #727272;
+
+    /* table colors */
+    --tablehead-background: #f1f1f1;
+    --tablehead-foreground: var(--page-foreground-color);
+
+    /* menu-display: block | none
+     * Visibility of the top navigation on screens >= 768px. On smaller screens the menu is always visible.
+     * `GENERATE_TREEVIEW` MUST be enabled!
+     */
+    --menu-display: block;
+
+    --menu-focus-foreground: var(--page-background-color);
+    --menu-focus-background: var(--primary-color);
+    --menu-selected-background: rgba(0,0,0,.05);
+
+
+    --header-background: var(--page-background-color);
+    --header-foreground: var(--page-foreground-color);
+
+    /* searchbar colors */
+    --searchbar-background: var(--side-nav-background);
+    --searchbar-foreground: var(--page-foreground-color);
+
+    /* searchbar size
+     * (`searchbar-width` is only applied on screens >= 768px.
+     * on smaller screens the searchbar will always fill the entire screen width) */
+    --searchbar-height: 33px;
+    --searchbar-width: 170px;
+
+    /* code block colors */
+    --code-background: #e0e0e0cf;
+    --code-foreground: var(--page-foreground-color);
+
+    /* fragment colors */
+    --fragment-background: #282c34;
+    --fragment-foreground: #ffffff;
+    --fragment-keyword: #cc99cd;
+    --fragment-keywordtype: #ab99cd;
+    --fragment-keywordflow: #e08000;
+    --fragment-token: #7ec699;
+    --fragment-comment: #999999;
+    --fragment-link: #98c0e3;
+    --fragment-preprocessor: #65cabe;
+    --fragment-linenumber-color: #cccccc;
+    --fragment-linenumber-background: #35393c;
+    --fragment-linenumber-border: #1f1f1f;
+    --fragment-lineheight: 20px;
+
+    /* sidebar navigation (treeview) colors */
+    --side-nav-background: #fbfbfb;
+    --side-nav-foreground: var(--page-foreground-color);
+    --side-nav-arrow-opacity: 0;
+    --side-nav-arrow-hover-opacity: 0.9;
+
+    /* height of an item in any tree / collapsible table */
+    --tree-item-height: 30px;
+
+    /* --darkmode-toggle-button-icon: '☀️' */
+}
+
+@media screen and (max-width: 767px) {
+    html {
+        --page-font-size: 16px;
+        --navigation-font-size: 16px;
+        --code-font-size: 15px; /* affects code, fragment */
+        --title-font-size: 22px;
+    }
+}
+
+@media (prefers-color-scheme: dark) {
+    html:not(.light-mode) {
+        color-scheme: dark;
+
+        --primary-color: #1982d2;
+        --primary-dark-color: #5ca8e2;
+        --primary-light-color: #4779ac;
+        --primary-lighter-color: #191e21;
+        --primary-lightest-color: #191a1c;
+
+        --box-shadow: 0 2px 10px 0 rgba(0,0,0,.35);
+
+        --odd-color: rgba(0,0,0,.1);
+
+        --menu-selected-background: rgba(0,0,0,.4);
+
+        --page-background-color: #1C1D1F;
+        --page-foreground-color: #d2dbde;
+        --page-secondary-foreground-color: #859399;
+        --separator-color: #000000;
+        --side-nav-background: #252628;
+
+        --code-background: #2a2c2f;
+
+        --tablehead-background: #2a2c2f;
+
+        --blockquote-background: #1f2022;
+        --blockquote-foreground: #77848a;
+
+        --warning-color: #b61825;
+        --warning-color-dark: #510a02;
+        --warning-color-darker: #f5b1aa;
+        --note-color: rgb(255, 183, 0);
+        --note-color-dark: #9f7300;
+        --note-color-darker: #645b39;
+        --deprecated-color: rgb(88, 90, 96);
+        --deprecated-color-dark: #262e37;
+        --deprecated-color-darker: #a0a5b0;
+        --bug-color: rgb(248, 113, 0);
+        --bug-color-dark: #812a00;
+        --bug-color-darker: #ffd3be;
+
+    }
+}
+
+/* dark mode variables are defined twice, to support both the dark-mode without and with doxygen-awesome-darkmode-toggle.js */
+html.dark-mode {
+    color-scheme: dark;
+
+    --primary-color: #1982d2;
+    --primary-dark-color: #5ca8e2;
+    --primary-light-color: #4779ac;
+    --primary-lighter-color: #191e21;
+    --primary-lightest-color: #191a1c;
+
+    --box-shadow: 0 2px 10px 0 rgba(0,0,0,.35);
+
+    --odd-color: rgba(0,0,0,.1);
+
+    --menu-selected-background: rgba(0,0,0,.4);
+
+    --page-background-color: #1C1D1F;
+    --page-foreground-color: #d2dbde;
+    --page-secondary-foreground-color: #859399;
+    --separator-color: #000000;
+    --side-nav-background: #252628;
+
+    --code-background: #2a2c2f;
+
+    --tablehead-background: #2a2c2f;
+
+    --blockquote-background: #1f2022;
+    --blockquote-foreground: #77848a;
+
+    --warning-color: #b61825;
+    --warning-color-dark: #510a02;
+    --warning-color-darker: #f5b1aa;
+    --note-color: rgb(255, 183, 0);
+    --note-color-dark: #9f7300;
+    --note-color-darker: #645b39;
+    --deprecated-color: rgb(88, 90, 96);
+    --deprecated-color-dark: #262e37;
+    --deprecated-color-darker: #a0a5b0;
+    --bug-color: rgb(248, 113, 0);
+    --bug-color-dark: #812a00;
+    --bug-color-darker: #ffd3be;
+
+}
+
+body {
+    color: var(--page-foreground-color);
+    background-color: var(--page-background-color);
+    font-size: var(--page-font-size);
+}
+
+body, table, div, p, dl, #nav-tree .label, .title, .sm-dox a, .sm-dox a:hover, .sm-dox a:focus, #projectname, .SelectItem, #MSearchField, .navpath li.navelem a, .navpath li.navelem a:hover {
+    font-family: var(--font-family);
+}
+
+h1, h2, h3, h4, h5 {
+    margin-top: .9em;
+    font-weight: 600;
+    line-height: initial;
+}
+
+p, div, table, dl {
+    font-size: var(--page-font-size);
+}
+
+a:link, a:visited, a:hover, a:focus, a:active {
+    color: var(--primary-color) !important;
+    font-weight: 500;
+}
+
+/*
+ Title and top navigation
+ */
+
+#top {
+    background: var(--header-background);
+    border-bottom: 1px solid var(--separator-color);
+}
+
+@media screen and (min-width: 768px) {
+    #top {
+        display: flex;
+        flex-wrap: wrap;
+        justify-content: space-between;
+        align-items: center;
+    }
+}
+
+#main-nav {
+    flex-grow: 5;
+    padding: var(--spacing-small) var(--spacing-medium);
+}
+
+#titlearea {
+    width: auto;
+    padding: var(--spacing-medium) var(--spacing-large);
+    background: none;
+    color: var(--header-foreground);
+    border-bottom: none;
+}
+
+@media screen and (max-width: 767px) {
+    #titlearea {
+        padding-bottom: var(--spacing-small);
+    }
+}
+
+#titlearea table tbody tr {
+    height: auto !important;
+}
+
+#projectname {
+    font-size: var(--title-font-size);
+    font-weight: 600;
+}
+
+#projectnumber {
+    font-family: inherit;
+    font-size: 60%;
+}
+
+#projectbrief {
+    font-family: inherit;
+    font-size: 80%;
+}
+
+#projectlogo {
+    vertical-align: middle;
+}
+
+#projectlogo img {
+    max-height: calc(var(--title-font-size) * 2);
+    margin-right: var(--spacing-small);
+}
+
+.sm-dox, .tabs, .tabs2, .tabs3 {
+    background: none;
+    padding: 0;
+}
+
+.tabs, .tabs2, .tabs3 {
+    border-bottom: 1px solid var(--separator-color);
+    margin-bottom: -1px;
+}
+
+@media screen and (max-width: 767px) {
+    .sm-dox a span.sub-arrow {
+        background: var(--code-background);
+    }
+}
+
+@media screen and (min-width: 768px) {
+    .sm-dox li, .tablist li {
+        display: var(--menu-display);
+    }
+
+    .sm-dox a span.sub-arrow {
+        border-color: var(--header-foreground) transparent transparent transparent;
+    }
+
+    .sm-dox a:hover span.sub-arrow {
+        border-color: var(--menu-focus-foreground) transparent transparent transparent;
+    }
+
+    .sm-dox ul a span.sub-arrow {
+        border-color: transparent transparent transparent var(--page-foreground-color);
+    }
+
+    .sm-dox ul a:hover span.sub-arrow {
+        border-color: transparent transparent transparent var(--menu-focus-foreground);
+    }
+}
+
+.sm-dox ul {
+    background: var(--page-background-color);
+    box-shadow: var(--box-shadow);
+    border: 1px solid var(--separator-color);
+    border-radius: var(--border-radius-medium) !important;
+    padding: var(--spacing-small);
+    animation: ease-out 150ms slideInMenu;
+}
+
+@keyframes slideInMenu {
+    from {
+        opacity: 0;
+        transform: translate(0px, -2px);
+    }
+
+    to {
+        opacity: 1;
+        transform: translate(0px, 0px);
+    }
+}
+
+.sm-dox ul a {
+    color: var(--page-foreground-color) !important;
+    background: var(--page-background-color);
+    font-size: var(--navigation-font-size);
+}
+
+.sm-dox>li>ul:after {
+    border-bottom-color: var(--page-background-color) !important;
+}
+
+.sm-dox>li>ul:before {
+    border-bottom-color: var(--separator-color) !important;
+}
+
+.sm-dox ul a:hover, .sm-dox ul a:active, .sm-dox ul a:focus {
+    font-size: var(--navigation-font-size) !important;
+    color: var(--menu-focus-foreground) !important;
+    text-shadow: none;
+    background-color: var(--menu-focus-background);
+    border-radius: var(--border-radius-small) !important;
+}
+
+.sm-dox a, .sm-dox a:focus, .tablist li, .tablist li a, .tablist li.current a {
+    text-shadow: none;
+    background: transparent;
+    background-image: none !important;
+    color: var(--header-foreground) !important;
+    font-weight: normal;
+    font-size: var(--navigation-font-size);
+}
+
+.sm-dox a:focus {
+    outline: auto;
+}
+
+.sm-dox a:hover, .sm-dox a:active, .tablist li a:hover {
+    text-shadow: none;
+    font-weight: normal;
+    background: var(--menu-focus-background);
+    color: var(--menu-focus-foreground) !important;
+    border-radius: var(--border-radius-small) !important;
+    font-size: var(--navigation-font-size);
+}
+
+.tablist li.current {
+    border-radius: var(--border-radius-small);
+    background: var(--menu-selected-background);
+}
+
+.tablist li {
+    margin: var(--spacing-small) 0 var(--spacing-small) var(--spacing-small);
+}
+
+.tablist a {
+    padding: 0 var(--spacing-large);
+}
+
+
+/*
+ Search box
+ */
+
+#MSearchBox {
+    height: var(--searchbar-height);
+    background: var(--searchbar-background);
+    border-radius: var(--searchbar-height);
+    border: 1px solid var(--separator-color);
+    overflow: hidden;
+    width: var(--searchbar-width);
+    position: relative;
+    box-shadow: none;
+    display: block;
+    margin-top: 0;
+}
+
+.left #MSearchSelect {
+    left: 0;
+}
+
+.tabs .left #MSearchSelect {
+    padding-left: 0;
+}
+
+.tabs #MSearchBox {
+    position: absolute;
+    right: var(--spacing-medium);
+}
+
+@media screen and (max-width: 767px) {
+    .tabs #MSearchBox {
+        position: relative;
+        right: 0;
+        margin-left: var(--spacing-medium);
+        margin-top: 0;
+    }
+}
+
+#MSearchSelectWindow, #MSearchResultsWindow {
+    z-index: 9999;
+}
+
+#MSearchBox.MSearchBoxActive {
+    border-color: var(--primary-color);
+    box-shadow: inset 0 0 0 1px var(--primary-color);
+}
+
+#main-menu > li:last-child {
+    margin-right: 0;
+}
+
+@media screen and (max-width: 767px) {
+    #main-menu > li:last-child {
+        height: 50px;
+    }
+}
+
+#MSearchField {
+    font-size: var(--navigation-font-size);
+    height: calc(var(--searchbar-height) - 2px);
+    background: transparent;
+    width: calc(var(--searchbar-width) - 64px);
+}
+
+.MSearchBoxActive #MSearchField {
+    color: var(--searchbar-foreground);
+}
+
+#MSearchSelect {
+    top: calc(calc(var(--searchbar-height) / 2) - 11px);
+}
+
+.left #MSearchSelect {
+    padding-left: 8px;
+}
+
+#MSearchBox span.left, #MSearchBox span.right {
+    background: none;
+}
+
+#MSearchBox span.right {
+    padding-top: calc(calc(var(--searchbar-height) / 2) - 12px);
+    position: absolute;
+    right: var(--spacing-small);
+}
+
+.tabs #MSearchBox span.right {
+    top: calc(calc(var(--searchbar-height) / 2) - 12px);
+}
+
+@keyframes slideInSearchResults {
+    from {
+        opacity: 0;
+        transform: translate(0, 15px);
+    }
+
+    to {
+        opacity: 1;
+        transform: translate(0, 20px);
+    }
+}
+
+#MSearchResultsWindow {
+    left: auto !important;
+    right: var(--spacing-medium);
+    border-radius: var(--border-radius-large);
+    border: 1px solid var(--separator-color);
+    transform: translate(0, 20px);
+    box-shadow: var(--box-shadow);
+    animation: ease-out 280ms slideInSearchResults;
+    background: var(--page-background-color);
+}
+
+iframe#MSearchResults {
+    margin: 4px;
+}
+
+iframe {
+    color-scheme: normal;
+}
+
+@media (prefers-color-scheme: dark) {
+    html:not(.light-mode) iframe#MSearchResults {
+        filter: invert() hue-rotate(180deg);
+    }
+}
+
+html.dark-mode iframe#MSearchResults {
+    filter: invert() hue-rotate(180deg);
+}
+
+#MSearchSelectWindow {
+    border: 1px solid var(--separator-color);
+    border-radius: var(--border-radius-medium);
+    box-shadow: var(--box-shadow);
+    background: var(--page-background-color);
+}
+
+#MSearchSelectWindow a.SelectItem {
+    font-size: var(--navigation-font-size);
+    line-height: var(--content-line-height);
+    margin: 0 var(--spacing-small);
+    border-radius: var(--border-radius-small);
+    color: var(--page-foreground-color) !important;
+    font-weight: normal;
+}
+
+#MSearchSelectWindow a.SelectItem:hover {
+    background: var(--menu-focus-background);
+    color: var(--menu-focus-foreground) !important;
+}
+
+@media screen and (max-width: 767px) {
+    #MSearchBox {
+        margin-top: var(--spacing-medium);
+        margin-bottom: var(--spacing-medium);
+        width: calc(100vw - 30px);
+    }
+
+    #main-menu > li:last-child {
+        float: none !important;
+    }
+
+    #MSearchField {
+        width: calc(100vw - 110px);
+    }
+
+    @keyframes slideInSearchResultsMobile {
+        from {
+            opacity: 0;
+            transform: translate(0, 15px);
+        }
+
+        to {
+            opacity: 1;
+            transform: translate(0, 20px);
+        }
+    }
+
+    #MSearchResultsWindow {
+        left: var(--spacing-medium) !important;
+        right: var(--spacing-medium);
+        overflow: auto;
+        transform: translate(0, 20px);
+        animation: ease-out 280ms slideInSearchResultsMobile;
+    }
+
+    /*
+     * Overwrites for fixing the searchbox on mobile in doxygen 1.9.2
+     */
+    label.main-menu-btn ~ #searchBoxPos1 {
+        top: 3px !important;
+        right: 6px !important;
+        left: 45px;
+        display: flex;
+    }
+
+    label.main-menu-btn ~ #searchBoxPos1 > #MSearchBox {
+        margin-top: 0;
+        margin-bottom: 0;
+        flex-grow: 2;
+        float: left;
+    }
+}
+
+/*
+ Tree view
+ */
+
+#side-nav {
+    padding: 0 !important;
+    background: var(--side-nav-background);
+}
+
+@media screen and (max-width: 767px) {
+    #side-nav {
+        display: none;
+    }
+
+    #doc-content {
+        margin-left: 0 !important;
+        height: auto !important;
+        padding-bottom: calc(2 * var(--spacing-large));
+    }
+}
+
+#nav-tree {
+    background: transparent;
+}
+
+#nav-tree .label {
+    font-size: var(--navigation-font-size);
+}
+
+#nav-tree .item {
+    height: var(--tree-item-height);
+    line-height: var(--tree-item-height);
+}
+
+#nav-sync {
+    top: 12px !important;
+    right: 12px;
+}
+
+#nav-tree .selected {
+    text-shadow: none;
+    background-image: none;
+    background-color: transparent;
+    box-shadow: inset 4px 0 0 0 var(--primary-color);
+}
+
+#nav-tree a {
+    color: var(--side-nav-foreground) !important;
+    font-weight: normal;
+}
+
+#nav-tree a:focus {
+    outline-style: auto;
+}
+
+#nav-tree .arrow {
+    opacity: var(--side-nav-arrow-opacity);
+}
+
+.arrow {
+    color: inherit;
+    cursor: pointer;
+    font-size: 45%;
+    vertical-align: middle;
+    margin-right: 2px;
+    font-family: serif;
+    height: auto;
+    text-align: right;
+}
+
+#nav-tree div.item:hover .arrow, #nav-tree a:focus .arrow {
+    opacity: var(--side-nav-arrow-hover-opacity);
+}
+
+#nav-tree .selected a {
+    color: var(--primary-color) !important;
+    /* selected tree entry: primary color, weight 600 (a prior `bolder` here was always overridden) */
+    font-weight: 600;
+}
+
+.ui-resizable-e {
+    background: var(--separator-color);
+    width: 1px;
+}
+
+/*
+ Contents
+ */
+
+div.header {
+    border-bottom: 1px solid var(--separator-color);
+    background-color: var(--page-background-color);
+    background-image: none;
+}
+
+div.contents, div.header .title, div.header .summary {
+    max-width: var(--content-maxwidth);
+}
+
+div.contents, div.header .title  {
+    line-height: initial;
+    margin: calc(var(--spacing-medium) + .2em) auto var(--spacing-medium) auto;
+}
+
+div.header .summary {
+    margin: var(--spacing-medium) auto 0 auto;
+}
+
+div.headertitle {
+    padding: 0;
+}
+
+div.header .title {
+    font-weight: 600;
+    font-size: 210%;
+    padding: var(--spacing-medium) var(--spacing-large);
+    word-break: break-word;
+}
+
+div.header .summary {
+    width: auto;
+    display: block;
+    float: none;
+    padding: 0 var(--spacing-large);
+}
+
+td.memSeparator {
+    border-color: var(--separator-color);
+    padding-bottom: 10px;
+}
+
+.mdescLeft, .mdescRight, .memItemLeft, .memItemRight, .memTemplItemLeft, .memTemplItemRight, .memTemplParams {
+    background: var(--code-background);
+    padding: 20px 20px;
+}
+
+.mdescRight {
+    color: var(--page-secondary-foreground-color);
+}
+
+span.mlabel {
+    background: var(--primary-color);
+    border: none;
+    padding: 4px 9px;
+    border-radius: 12px;
+    margin-right: var(--spacing-medium);
+}
+
+span.mlabel:last-of-type {
+    margin-right: 2px;
+}
+
+div.contents {
+    padding: 0 var(--spacing-large);
+}
+
+div.contents p, div.contents li {
+    line-height: var(--content-line-height);
+}
+
+div.contents div.dyncontent {
+    margin: var(--spacing-medium) 0;
+}
+
+@media (prefers-color-scheme: dark) {
+    html:not(.light-mode) div.contents div.dyncontent img,
+    html:not(.light-mode) div.contents center img,
+    html:not(.light-mode) div.contents table img,
+    html:not(.light-mode) div.contents div.dyncontent iframe,
+    html:not(.light-mode) div.contents center iframe,
+    html:not(.light-mode) div.contents table iframe {
+        filter: hue-rotate(180deg) invert();
+    }
+}
+
+html.dark-mode div.contents div.dyncontent img,
+html.dark-mode div.contents center img,
+html.dark-mode div.contents table img,
+html.dark-mode div.contents div.dyncontent iframe,
+html.dark-mode div.contents center iframe,
+html.dark-mode div.contents table iframe {
+    filter: hue-rotate(180deg) invert();
+}
+
+h2.groupheader {
+    border-bottom: 1px solid var(--separator-color);
+    color: var(--page-foreground-color);
+}
+
+blockquote {
+    padding: var(--spacing-small) var(--spacing-medium);
+    background: var(--blockquote-background);
+    color: var(--blockquote-foreground);
+    border-left: 2px solid var(--blockquote-foreground);
+    margin: 0;
+}
+
+blockquote p {
+    margin: var(--spacing-small) 0 var(--spacing-medium) 0;
+}
+.paramname {
+    font-weight: 600;
+    color: var(--primary-dark-color);
+}
+
+.glow {
+    text-shadow: 0 0 15px var(--primary-light-color) !important;
+}
+
+.alphachar a {
+    color: var(--page-foreground-color);
+}
+
+/*
+ Table of Contents
+ */
+
+div.toc {
+    background-color: var(--side-nav-background);
+    border: 1px solid var(--separator-color);
+    border-radius: var(--border-radius-medium);
+    box-shadow: var(--box-shadow);
+    padding: 0 var(--spacing-large);
+    margin: 0 0 var(--spacing-medium) var(--spacing-medium);
+}
+
+div.toc h3 {
+    color: var(--side-nav-foreground);
+    font-size: var(--navigation-font-size);
+    margin: var(--spacing-large) 0;
+}
+
+div.toc li {
+    font-size: var(--navigation-font-size);
+    padding: 0;
+    background: none;
+}
+
+div.toc li:before {
+    content: '↓';
+    font-weight: 800;
+    font-family: var(--font-family);
+    margin-right: var(--spacing-small);
+    color: var(--side-nav-foreground);
+    opacity: .4;
+}
+
+div.toc ul li.level1 {
+    margin: 0;
+}
+
+div.toc ul li.level2, div.toc ul li.level3 {
+    margin-top: 0;
+}
+
+
+@media screen and (max-width: 767px) {
+    div.toc {
+        float: none;
+        width: auto;
+        margin: 0 0 var(--spacing-medium) 0;
+    }
+}
+
+/*
+ Code & Fragments
+ */
+
+code, div.fragment, pre.fragment {
+    border-radius: var(--border-radius-small);
+    border: none;
+    overflow: hidden;
+}
+
+code {
+    display: inline;
+    background: var(--code-background);
+    color: var(--code-foreground);
+    padding: 2px 6px;
+    word-break: break-word;
+}
+
+div.fragment, pre.fragment {
+    margin: var(--spacing-medium) 0;
+    padding: 14px 16px;
+    background: var(--fragment-background);
+    color: var(--fragment-foreground);
+    overflow-x: auto;
+}
+
+@media screen and (max-width: 767px) {
+    div.fragment, pre.fragment {
+        border-top-right-radius: 0;
+        border-bottom-right-radius: 0;
+    }
+
+    .contents > div.fragment, .textblock > div.fragment, .textblock > pre.fragment {
+        margin: var(--spacing-medium) calc(0px - var(--spacing-large));
+        border-radius: 0;
+    }
+
+    .textblock li > .fragment {
+        margin: var(--spacing-medium) calc(0px - var(--spacing-large));
+    }
+
+    .memdoc li > .fragment {
+        margin: var(--spacing-medium) calc(0px - var(--spacing-medium));
+    }
+
+    .memdoc > div.fragment, .memdoc > pre.fragment, dl dd > div.fragment, dl dd pre.fragment {
+        margin: var(--spacing-medium) calc(0px - var(--spacing-medium));
+        border-radius: 0;
+    }
+}
+
+code, code a, pre.fragment, div.fragment, div.fragment .line, div.fragment span, div.fragment .line a, div.fragment .line span {
+    font-family: var(--font-family-monospace);
+    font-size: var(--code-font-size) !important;
+}
+
+div.line:after {
+    margin-right: var(--spacing-medium);
+}
+
+div.fragment .line, pre.fragment {
+    white-space: pre;
+    word-wrap: initial;
+    line-height: var(--fragment-lineheight);
+}
+
+div.fragment span.keyword {
+    color: var(--fragment-keyword);
+}
+
+div.fragment span.keywordtype {
+    color: var(--fragment-keywordtype);
+}
+
+div.fragment span.keywordflow {
+    color: var(--fragment-keywordflow);
+}
+
+div.fragment span.stringliteral {
+    color: var(--fragment-token); /* string literals use the --fragment-token color */
+}
+
+div.fragment span.comment {
+    color: var(--fragment-comment);
+}
+
+div.fragment a.code {
+    color: var(--fragment-link) !important;
+}
+
+div.fragment span.preprocessor {
+    color: var(--fragment-preprocessor);
+}
+
+div.fragment span.lineno {
+    display: inline-block;
+    width: 27px;
+    border-right: none;
+    background: var(--fragment-linenumber-background);
+    color: var(--fragment-linenumber-color);
+}
+
+div.fragment span.lineno a {
+    background: none;
+    color: var(--fragment-link) !important;
+}
+
+div.fragment .line:first-child .lineno {
+    box-shadow: -999999px 0px 0 999999px var(--fragment-linenumber-background), -999998px 0px 0 999999px var(--fragment-linenumber-border);
+}
+
+/*
+ dl warning, attention, note, deprecated, bug, ...
+ */
+
+dl.warning, dl.attention, dl.note, dl.deprecated, dl.bug, dl.invariant, dl.pre {
+    padding: var(--spacing-medium);
+    margin: var(--spacing-medium) 0;
+    color: var(--page-background-color);
+    overflow: hidden;
+    margin-left: 0;
+    border-radius: var(--border-radius-small);
+}
+
+dl.section dd {
+    margin-bottom: 2px;
+}
+
+dl.warning, dl.attention {
+    background: var(--warning-color);
+    border-left: 8px solid var(--warning-color-dark);
+    color: var(--warning-color-darker);
+}
+
+dl.warning dt, dl.attention dt {
+    color: var(--warning-color-dark);
+}
+
+dl.note {
+    background: var(--note-color);
+    border-left: 8px solid var(--note-color-dark);
+    color: var(--note-color-darker);
+}
+
+dl.note dt {
+    color: var(--note-color-dark);
+}
+
+dl.bug {
+    background: var(--bug-color);
+    border-left: 8px solid var(--bug-color-dark);
+    color: var(--bug-color-darker);
+}
+
+dl.bug dt a {
+    color: var(--bug-color-dark) !important;
+}
+
+dl.deprecated {
+    background: var(--deprecated-color);
+    border-left: 8px solid var(--deprecated-color-dark);
+    color: var(--deprecated-color-darker);
+}
+
+dl.deprecated dt a {
+    color: var(--deprecated-color-dark) !important;
+}
+
+dl.section dd, dl.bug dd, dl.deprecated dd {
+    margin-inline-start: 0px;
+}
+
+dl.invariant, dl.pre {
+    background: var(--invariant-color);
+    border-left: 8px solid var(--invariant-color-dark);
+    color: var(--invariant-color-darker);
+}
+
+/*
+ memitem
+ */
+
+div.memdoc, div.memproto, h2.memtitle {
+    box-shadow: none;
+    background-image: none;
+    border: none;
+}
+
+div.memdoc {
+    padding: 0 var(--spacing-medium);
+    background: var(--page-background-color);
+}
+
+h2.memtitle, div.memitem {
+    border: 1px solid var(--separator-color);
+}
+
+div.memproto, h2.memtitle {
+    background: var(--code-background);
+    text-shadow: none;
+}
+
+h2.memtitle {
+    font-weight: 500;
+    font-family: monospace, fixed;
+    border-bottom: none;
+    border-top-left-radius: var(--border-radius-medium);
+    border-top-right-radius: var(--border-radius-medium);
+    word-break: break-all;
+}
+
+a:target + h2.memtitle, a:target + h2.memtitle + div.memitem {
+    border-color: var(--primary-light-color);
+}
+
+a:target + h2.memtitle {
+    box-shadow: -3px -3px 3px 0 var(--primary-lightest-color), 3px -3px 3px 0 var(--primary-lightest-color);
+}
+
+a:target + h2.memtitle + div.memitem {
+    box-shadow: 0 0 10px 0 var(--primary-lighter-color);
+}
+
+div.memitem {
+    border-top-right-radius: var(--border-radius-medium);
+    border-bottom-right-radius: var(--border-radius-medium);
+    border-bottom-left-radius: var(--border-radius-medium);
+    overflow: hidden;
+    display: block !important;
+}
+
+div.memdoc {
+    border-radius: 0;
+}
+
+div.memproto {
+    border-radius: 0 var(--border-radius-small) 0 0;
+    overflow: auto;
+    border-bottom: 1px solid var(--separator-color);
+    padding: var(--spacing-medium);
+    margin-bottom: -1px;
+}
+
+div.memtitle {
+    border-top-right-radius: var(--border-radius-medium);
+    border-top-left-radius: var(--border-radius-medium);
+}
+
+div.memproto table.memname {
+    font-family: monospace, fixed;
+    color: var(--page-foreground-color);
+}
+
+table.mlabels, table.mlabels > tbody {
+    display: block;
+}
+
+td.mlabels-left {
+    width: auto;
+}
+
+table.mlabels > tbody > tr:first-child {
+    display: flex;
+    justify-content: space-between;
+    flex-wrap: wrap;
+}
+
+.memname, .memitem span.mlabels {
+    margin: 0; /* remove all margins from member names and their label spans */
+}
+
+/*
+ reflist
+ */
+
+dl.reflist {
+    box-shadow: var(--box-shadow);
+    border-radius: var(--border-radius-medium);
+    border: 1px solid var(--separator-color);
+    overflow: hidden;
+    padding: 0;
+}
+
+
+dl.reflist dt, dl.reflist dd {
+    box-shadow: none;
+    text-shadow: none;
+    background-image: none;
+    border: none;
+    padding: 12px;
+}
+
+
+dl.reflist dt { /* term row of a reference list */
+    font-weight: 500;
+    border-radius: 0;
+    background: var(--code-background);
+    border-bottom: 1px solid var(--separator-color);
+    color: var(--page-foreground-color);
+}
+
+
+dl.reflist dd {
+    background: none;
+}
+
+/*
+ Table
+ */
+
+table.markdownTable, table.fieldtable {
+    width: 100%;
+    border: 1px solid var(--separator-color);
+    margin: var(--spacing-medium) 0;
+}
+
+table.fieldtable {
+    box-shadow: none;
+    border-radius: var(--border-radius-small);
+}
+
+th.markdownTableHeadLeft, th.markdownTableHeadRight, th.markdownTableHeadCenter, th.markdownTableHeadNone {
+    background: var(--tablehead-background);
+    color: var(--tablehead-foreground);
+    font-weight: 600;
+    font-size: var(--page-font-size);
+}
+
+table.markdownTable td, table.markdownTable th, table.fieldtable dt {
+    border: 1px solid var(--separator-color);
+    padding: var(--spacing-small) var(--spacing-medium);
+}
+
+table.fieldtable th {
+    font-size: var(--page-font-size);
+    font-weight: 600;
+    background-image: none;
+    background-color: var(--tablehead-background);
+    color: var(--tablehead-foreground);
+    border-bottom: 1px solid var(--separator-color);
+}
+
+.fieldtable td.fieldtype, .fieldtable td.fieldname {
+    border-bottom: 1px solid var(--separator-color);
+    border-right: 1px solid var(--separator-color);
+}
+
+.fieldtable td.fielddoc {
+    border-bottom: 1px solid var(--separator-color);
+}
+
+.memberdecls td.glow, .fieldtable tr.glow {
+    background-color: var(--primary-light-color);
+    box-shadow: 0 0 15px var(--primary-lighter-color);
+}
+
+table.memberdecls {
+    display: block;
+    overflow-x: auto;
+    overflow-y: hidden;
+}
+
+
+/*
+ Horizontal Rule
+ */
+
+hr {
+    margin-top: var(--spacing-large);
+    margin-bottom: var(--spacing-large);
+    border-top:1px solid var(--separator-color);
+}
+
+.contents hr {
+    box-shadow: var(--content-maxwidth) 0 0 0 var(--separator-color), calc(0px - var(--content-maxwidth)) 0 0 0 var(--separator-color);
+}
+
+.contents img {
+    max-width: 100%;
+}
+
+/*
+ Directories
+ */
+div.directory {
+    border-top: 1px solid var(--separator-color);
+    border-bottom: 1px solid var(--separator-color);
+    width: auto;
+}
+
+table.directory {
+    font-family: var(--font-family);
+    font-size: var(--page-font-size);
+    font-weight: normal;
+}
+
+.directory td.entry {
+    padding: var(--spacing-small);
+    display: flex;
+    align-items: center;
+}
+
+.directory tr.even {
+    background-color: var(--odd-color);
+}
+
+.icona {
+    width: auto;
+    height: auto;
+    margin: 0 var(--spacing-small);
+}
+
+.icon {
+    background: var(--primary-color);
+    width: 18px;
+    height: 18px;
+    line-height: 18px;
+}
+
+.iconfopen, .icondoc, .iconfclosed {
+    background-position: center;
+    margin-bottom: 0;
+}
+
+.icondoc {
+    filter: saturate(0.2);
+}
+
+@media screen and (max-width: 767px) {
+    div.directory {
+        margin-left: calc(0px - var(--spacing-medium));
+        margin-right: calc(0px - var(--spacing-medium));
+    }
+}
+
+@media (prefers-color-scheme: dark) {
+    html:not(.light-mode) .iconfopen, html:not(.light-mode) .iconfclosed {
+        filter: hue-rotate(180deg) invert();
+    }
+}
+
+html.dark-mode .iconfopen, html.dark-mode .iconfclosed {
+    filter: hue-rotate(180deg) invert();
+}
+
+/*
+ Class list
+ */
+
+.classindex dl.odd {
+    background: var(--odd-color);
+    border-radius: var(--border-radius-small);
+}
+
+@media screen and (max-width: 767px) {
+    .classindex {
+        margin: 0 calc(0px - var(--spacing-small));
+    }
+}
+
+/*
+  Footer and nav-path
+ */
+
+#nav-path {
+    margin-bottom: -1px;
+    width: 100%;
+}
+
+#nav-path ul {
+    background-image: none;
+    background: var(--page-background-color);
+    border: none;
+    border-top: 1px solid var(--separator-color);
+    border-bottom: 1px solid var(--separator-color);
+    font-size: var(--navigation-font-size);
+}
+
+img.footer {
+    width: 60px;
+}
+
+.navpath li.footer {
+    color: var(--page-secondary-foreground-color);
+}
+
+address.footer {
+    margin-bottom: var(--spacing-large);
+}
+
+#nav-path li.navelem {
+    background-image: none;
+    display: flex;
+    align-items: center;
+}
+
+.navpath li.navelem a {
+    text-shadow: none;
+    display: inline-block;
+    color: var(--primary-color) !important;
+}
+
+.navpath li.navelem b {
+    color: var(--primary-dark-color);
+    font-weight: 500;
+}
+
+li.navelem {
+    padding: 0;
+    margin-left: -8px;
+}
+
+li.navelem:first-child {
+    margin-left: var(--spacing-large);
+}
+
+li.navelem:first-child:before {
+    display: none;
+}
+
+#nav-path li.navelem:after {
+    content: '';
+    border: 5px solid var(--page-background-color);
+    border-bottom-color: transparent;
+    border-right-color: transparent;
+    border-top-color: transparent;
+    transform: scaleY(4.2);
+    z-index: 10;
+    margin-left: 6px;
+}
+
+#nav-path li.navelem:before {
+    content: '';
+    border: 5px solid var(--separator-color);
+    border-bottom-color: transparent;
+    border-right-color: transparent;
+    border-top-color: transparent;
+    transform: scaleY(3.2);
+    margin-right: var(--spacing-small);
+}
+
+.navpath li.navelem a:hover {
+    color: var(--primary-color);
+}
+
+/*
+  Optional Dark mode toggle button
+*/
+
+doxygen-awesome-dark-mode-toggle {
+    display: inline-block;
+    margin: 0 0 0 var(--spacing-small);
+    padding: 0;
+    width: var(--searchbar-height);
+    height: var(--searchbar-height);
+    background: none;
+    border: none;
+    font-size: 23px;
+    border-radius: var(--border-radius-medium);
+    vertical-align: middle;
+    text-align: center;
+    line-height: var(--searchbar-height);
+}
+
+doxygen-awesome-dark-mode-toggle:hover {
+    background: var(--separator-color);
+}
+
+doxygen-awesome-dark-mode-toggle:after {
+    content: var(--darkmode-toggle-button-icon)
+}
diff --git a/docs/styling/footer.html b/docs/styling/footer.html
new file mode 100644
index 0000000000..d68520e1e9
--- /dev/null
+++ b/docs/styling/footer.html
@@ -0,0 +1,43 @@
+<!--
+ Copyright (C) 2023, Advanced Micro Devices. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+ 3. Neither the name of the copyright holder nor the names of its contributors
+    may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE. -->
+
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml">
+    <style>
+        .footer {
+          position: relative;
+          left: 0;
+          bottom: 0;
+          width: 100%;
+          background-color: rgba(22, 22, 22, 0);
+          text-align: center;
+          padding: 50px 0px 25px 0px;
+        }
+        </style>
+ <body>
+    <div class = "footer"> &nbsp; Copyright (C) 2023, Advanced Micro Devices. All rights reserved. </div>
+ </body>
+ </html>
diff --git a/docs/styling/header.html b/docs/styling/header.html
new file mode 100644
index 0000000000..0c289cbee7
--- /dev/null
+++ b/docs/styling/header.html
@@ -0,0 +1,87 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen $doxygenversion"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+
+<!-- BEGIN opengraph metadata -->
+<meta property="og:title" content="Doxygen Awesome" />
+<meta property="og:image" content="https://repository-images.githubusercontent.com/348492097/4f16df80-88fb-11eb-9d31-4015ff22c452" />
+<meta property="og:description" content="Custom CSS theme for doxygen html-documentation with lots of customization parameters." />
+<meta property="og:url" content="https://jothepro.github.io/doxygen-awesome-css/" />
+<!-- END opengraph metadata -->
+
+<!-- BEGIN twitter metadata -->
+<meta name="twitter:image:src" content="https://repository-images.githubusercontent.com/348492097/4f16df80-88fb-11eb-9d31-4015ff22c452" />
+<meta name="twitter:title" content="Doxygen Awesome" />
+<meta name="twitter:description" content="Custom CSS theme for doxygen html-documentation with lots of customization parameters." />
+<!-- END twitter metadata -->
+
+<!--BEGIN PROJECT_NAME--><title>$projectname: $title</title><!--END PROJECT_NAME-->
+<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
+<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
+<link rel="icon" type="image/svg+xml" href="logo.drawio.svg"/>
+<script type="text/javascript" src="$relpath^jquery.js"></script>
+<script type="text/javascript" src="$relpath^dynsections.js"></script>
+<script type="text/javascript" src="$relpath^doxygen-darkmode-toggle.js"></script>
+<script type="text/javascript" src="$relpath^doxygen-fragment-copy-button.js"></script>
+<!-- <script type="text/javascript" src="$relpath^doxygen-awesome-paragraph-link.js"></script> -->
+<script type="text/javascript" src="$relpath^doxygen-interactive-toc.js"></script>
+<!-- <script type="text/javascript" src="$relpath^toggle-alternative-theme.js"></script> -->
+<script type="text/javascript">
+    DoxygenAwesomeFragmentCopyButton.init()
+    DoxygenAwesomeDarkModeToggle.init()
+    if (typeof DoxygenAwesomeParagraphLink !== 'undefined') { DoxygenAwesomeParagraphLink.init() } // its script include is commented out above; guard avoids a ReferenceError that would abort the remaining init calls
+    DoxygenAwesomeInteractiveToc.init()
+</script>
+$treeview
+$search
+$mathjax
+<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
+$extrastylesheet
+</head>
+<body>
+
+<!-- https://tholman.com/github-corners/ -->
+<a href="https://github.com/jothepro/doxygen-awesome-css" class="github-corner" title="View source on GitHub" target="_blank">
+    <path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a><style>.github-corner:hover .octo-arm{animation:octocat-wave 560ms ease-in-out}@keyframes octocat-wave{0%,100%{transform:rotate(0)}20%,60%{transform:rotate(-25deg)}40%,80%{transform:rotate(10deg)}}@media (max-width:500px){.github-corner:hover .octo-arm{animation:none}.github-corner .octo-arm{animation:octocat-wave 560ms ease-in-out}}</style>
+
+
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+
+<!--BEGIN TITLEAREA-->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <!--BEGIN PROJECT_LOGO-->
+  <td id="projectlogo"><img alt="Logo" src="$relpath^$projectlogo"/></td>
+  <!--END PROJECT_LOGO-->
+  <!--BEGIN PROJECT_NAME-->
+  <td id="projectalign" style="padding-left: 0.5em;">
+   <div id="projectname">$projectname
+   <!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
+   </div>
+   <!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
+  </td>
+  <!--END PROJECT_NAME-->
+  <!--BEGIN !PROJECT_NAME-->
+   <!--BEGIN PROJECT_BRIEF-->
+    <td style="padding-left: 0.5em;">
+    <div id="projectbrief">$projectbrief</div>
+    </td>
+   <!--END PROJECT_BRIEF-->
+  <!--END !PROJECT_NAME-->
+  <!--BEGIN DISABLE_INDEX-->
+   <!--BEGIN SEARCHENGINE-->
+   <td>$searchbox</td>
+   <!--END SEARCHENGINE-->
+  <!--END DISABLE_INDEX-->
+ </tr>
+ </tbody>
+</table>
+</div>
+<!--END TITLEAREA-->
+<!-- end header part -->
diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c
index 2b3c4bb4ab..bfcc38fd5c 100644
--- a/frame/1m/bli_l1m_tapi.c
+++ b/frame/1m/bli_l1m_tapi.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -336,6 +337,70 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 INSERT_GENTFUNC_BASIC0( scal2m )
 
 
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname ) \
+\
+void PASTEMAC2(ch,opname,EX_SUF) \
+     ( \
+       conj_t  conjalpha, \
+       doff_t  diagoffx, \
+       diag_t  diagx, \
+       uplo_t  uplox, \
+       dim_t   m, \
+       dim_t   n, \
+       ctype*  alpha, \
+       ctype*  x, inc_t rs_x, inc_t cs_x  \
+       BLIS_TAPI_EX_PARAMS  \
+     ) \
+{ \
+	bli_init_once(); \
+\
+	BLIS_TAPI_EX_DECLS \
+\
+	if ( bli_zero_dim2( m, n ) ) return; \
+\
+	/* Obtain a valid context from the gks if necessary. */ \
+	if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \
+\
+	/* Invoke setm function if alpha is zero. */ \
+	if ( PASTEMAC(ch,eq0)(*alpha)) \
+	{ \
+		PASTEMAC2(ch,setm,_unb_var1) \
+		( \
+		  conjalpha, \
+		  diagoffx, \
+		  diagx, \
+		  uplox, \
+		  m, \
+		  n, \
+		  alpha, \
+		  x, rs_x, cs_x, \
+		  cntx, \
+		  rntm  \
+		); \
+	} \
+	else \
+	{ \
+		/* Invoke the helper variant, which loops over the appropriate kernel
+		to implement the current operation. */ \
+		PASTEMAC2(ch,opname,_unb_var1) \
+		( \
+		conjalpha, \
+		diagoffx, \
+		diagx, \
+		uplox, \
+		m, \
+		n, \
+		alpha, \
+		x, rs_x, cs_x, \
+		cntx, \
+		rntm  \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC0( scalm )
+
 #undef  GENTFUNC
 #define GENTFUNC( ctype, ch, opname ) \
 \
@@ -378,7 +443,6 @@ void PASTEMAC2(ch,opname,EX_SUF) \
 	); \
 }
 
-INSERT_GENTFUNC_BASIC0( scalm )
 INSERT_GENTFUNC_BASIC0( setm )
 
 
diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c
index a3b2d66e63..9b63c2c786 100644
--- a/frame/1m/packm/bli_packm_struc_cxk.c
+++ b/frame/1m/packm/bli_packm_struc_cxk.c
@@ -497,7 +497,7 @@ void PASTEMAC(ch,varname) \
 			   corresponding elements in c11 were not already zero. */ \
 			if ( bli_is_hermitian( strucc ) ) \
 			{ \
-				ctype* restrict pi11 = p11; \
+				ctype* restrict __attribute__ ((unused)) pi11 = p11; \
 \
 				for ( i = 0; i < p11_m; ++i ) \
 				{ \
diff --git a/frame/1m/packm/bli_packm_struc_cxk_md.c b/frame/1m/packm/bli_packm_struc_cxk_md.c
index 52a1f9817f..d9aed99c0f 100644
--- a/frame/1m/packm/bli_packm_struc_cxk_md.c
+++ b/frame/1m/packm/bli_packm_struc_cxk_md.c
@@ -330,9 +330,9 @@ void PASTEMAC2(cha,chp,opname) \
 	PASTEMAC(chp,ctyper)* restrict kappa_r  = ( PASTEMAC(chp,ctyper)* )kappa; \
 	PASTEMAC(chp,ctyper)* restrict kappa_i  = ( PASTEMAC(chp,ctyper)* )kappa + 1; \
 	PASTEMAC(cha,ctyper)* restrict alpha1_r = ( PASTEMAC(cha,ctyper)* )a; \
-	PASTEMAC(cha,ctyper)* restrict alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \
+	PASTEMAC(cha,ctyper)* restrict __attribute__ ((unused)) alpha1_i = ( PASTEMAC(cha,ctyper)* )a + 1; \
 	PASTEMAC(chp,ctyper)* restrict pi1_r    = ( PASTEMAC(chp,ctyper)* )p; \
-	PASTEMAC(chp,ctyper)* restrict pi1_i    = ( PASTEMAC(chp,ctyper)* )p + ldp; \
+	PASTEMAC(chp,ctyper)* restrict __attribute__ ((unused)) pi1_i    = ( PASTEMAC(chp,ctyper)* )p + ldp; \
 \
 	( void )kappa_i; \
 \
@@ -454,9 +454,9 @@ void PASTEMAC2(cha,chp,opname) \
 	const inc_t       lda1      = lda; \
 	const inc_t       ldp1      = ldp; \
 \
-	ctype_a* restrict alpha1_ri = ( ctype_a* )a; \
-	ctype_p* restrict pi1_ri    = ( ctype_p* )p; \
-	ctype_p* restrict pi1_ir    = ( ctype_p* )p + ldp1/2; \
+	ctype_a* restrict __attribute__ ((unused)) alpha1_ri = ( ctype_a* )a; \
+	ctype_p* restrict __attribute__ ((unused)) pi1_ri    = ( ctype_p* )p; \
+	ctype_p* restrict __attribute__ ((unused)) pi1_ir    = ( ctype_p* )p + ldp1/2; \
 \
 	( void )inca1; \
 \
diff --git a/frame/2/bli_l2_ker_prot.h b/frame/2/bli_l2_ker_prot.h
index 5182b5d670..d9b4f99d48 100644
--- a/frame/2/bli_l2_ker_prot.h
+++ b/frame/2/bli_l2_ker_prot.h
@@ -66,4 +66,4 @@ void PASTEMAC(ch,opname) \
        ctype*    restrict x, inc_t incx, \
        ctype*    restrict c, inc_t rs_c, inc_t cs_c, \
        cntx_t*   restrict cntx \
-     );
\ No newline at end of file
+     );
diff --git a/frame/2/gemv/bli_gemv_unf_var1_amd.c b/frame/2/gemv/bli_gemv_unf_var1_amd.c
index a9534bd9a0..05f7fbb875 100644
--- a/frame/2/gemv/bli_gemv_unf_var1_amd.c
+++ b/frame/2/gemv/bli_gemv_unf_var1_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -143,9 +143,9 @@ void bli_dgemv_unf_var1
 
     conja = bli_extract_conj(transa);
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         if ( cntx == NULL ) cntx = bli_gks_query_cntx();
         const num_t dt = PASTEMAC(d,type);
@@ -460,9 +460,9 @@ void bli_sgemv_unf_var1
 
     conja = bli_extract_conj( transa );
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         if ( cntx == NULL ) cntx = bli_gks_query_cntx();
         const num_t dt = PASTEMAC(s,type);
@@ -501,13 +501,6 @@ void bli_sgemv_unf_var1
       return;
     }
 
-// If both multithreading and OpenMP are enabled, GEMV will multithread
-#if defined(BLIS_ENABLE_MULTITHREADING) && defined(BLIS_ENABLE_OPENMP)
-    bool is_omp_mt_enabled = TRUE;
-#else
-    bool is_omp_mt_enabled = FALSE;
-#endif
-
     dim_t nt_max;
 
     rntm_t rnmt_obj;
@@ -517,9 +510,23 @@ void bli_sgemv_unf_var1
     // Query the total number of threads from the rntm_t object.
     nt_max = bli_rntm_num_threads( &rnmt_obj );
 
-    if ( ( nt_max > 1 ) & ( is_omp_mt_enabled == TRUE ) )
+    if (nt_max<=0)
     {
+        // nt is less than one if BLIS manual setting of parallelism
+        // has been used. Parallelism here will be product of values.
+        dim_t jc, pc, ic, jr, ir;
+        jc = bli_rntm_jc_ways( &rnmt_obj );
+        pc = bli_rntm_pc_ways( &rnmt_obj );
+        ic = bli_rntm_ic_ways( &rnmt_obj );
+        jr = bli_rntm_jr_ways( &rnmt_obj );
+        ir = bli_rntm_ir_ways( &rnmt_obj );
+        nt_max = jc*pc*ic*jr*ir;
+    }
+
+// If OpenMP is enabled, GEMV will multithread
 #ifdef BLIS_ENABLE_OPENMP
+    if ( nt_max > 1 )
+    {
         b_fuse = 4;
 
         //Setting the thread count to the maximum number of threads provided
@@ -545,10 +552,10 @@ void bli_sgemv_unf_var1
           cntx,
           nt
         );
-#endif// BLIS_ENABLE_OPENMP
     }
     else
     {
+#endif// BLIS_ENABLE_OPENMP
         b_fuse = 8;
 
         for ( i = 0; i < n_iter; i += f )
@@ -575,7 +582,9 @@ void bli_sgemv_unf_var1
               cntx
             );
         }
+#ifdef BLIS_ENABLE_OPENMP
     }
+#endif// BLIS_ENABLE_OPENMP
 }
 
 INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )
diff --git a/frame/2/gemv/bli_gemv_unf_var2_amd.c b/frame/2/gemv/bli_gemv_unf_var2_amd.c
index 831d906ca4..c4317dd4d1 100644
--- a/frame/2/gemv/bli_gemv_unf_var2_amd.c
+++ b/frame/2/gemv/bli_gemv_unf_var2_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -177,9 +177,9 @@ void bli_dgemv_unf_var2
 
     conja = bli_extract_conj( transa );
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         const num_t dt = PASTEMAC(d,type);
         double*  x1;
@@ -247,7 +247,7 @@ void bli_dgemv_unf_var2
 
     /* If beta is zero, use setv. Otherwise, scale by beta. */
         /* y = beta * y; */
-    /* beta=0 case is hadled by scalv internally */
+    /* beta=0 case is handled by scalv internally */
 
     bli_dscalv_zen_int10
     (
@@ -448,9 +448,9 @@ void bli_sgemv_unf_var2
 
     conja = bli_extract_conj( transa );
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         const num_t dt = PASTEMAC(s,type);
         /* If beta is zero, use setv. Otherwise, scale by beta. */
@@ -516,7 +516,7 @@ void bli_sgemv_unf_var2
 
     /* If beta is zero, use setv. Otherwise, scale by beta. */
         /* y = beta * y; */
-    /* beta=0 case is hadled by scalv internally */
+    /* beta=0 case is handled by scalv internally */
     bli_sscalv_zen_int10
     (
       BLIS_NO_CONJUGATE,
@@ -576,175 +576,225 @@ void bli_zgemv_unf_var2
      )
 {
 
-    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
-    dcomplex*  A1;
-    dcomplex*  x1;
-    dcomplex*  y1;
-    dim_t   i;
-    dim_t   b_fuse, f;
-    dim_t   n_elem, n_iter;
-    inc_t   rs_at, cs_at;
-    conj_t  conja;
-
-    // For AMD these APIS are invoked skipping intermediate framework layers
-    // Hence we need to ensure that cntx is set here.
-    bli_init_once();
-    if(cntx == NULL) cntx = bli_gks_query_cntx();
-
-    bli_set_dims_incs_with_trans( transa,
-                                  m, n, rs_a, cs_a,
-                                  &n_elem, &n_iter, &rs_at, &cs_at );
-
-    conja = bli_extract_conj( transa );
+  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
+
+  dcomplex *A1;
+  dcomplex *x1;
+  dcomplex *y1;
+
+  dim_t i, b_fuse, f;
+  dim_t n_elem, n_iter;
+  inc_t rs_at, cs_at;
+  conj_t conja;
+
+  // Memory pool declarations for packing vector Y.
+  mem_t mem_bufY;
+  rntm_t rntm;
+  dcomplex *y_buf = y;
+  inc_t buf_incy = incy;
+
+  bli_set_dims_incs_with_trans(transa,
+                                m, n, rs_a, cs_a,
+                                &n_elem, &n_iter, &rs_at, &cs_at);
+
+  conja = bli_extract_conj(transa);
+
+  // Query the architecture ID
+  arch_t id = bli_arch_query_id();
+
+  /*
+    Function pointer declaration for the functions
+    that will be used by this API
+  */
+  zaxpyf_ker_ft   axpyf_kr_ptr; // ZAXPYF
+  zscal2v_ker_ft  scal2v_kr_ptr; // ZSCAL2V
+  zscalv_ker_ft   scalv_kr_ptr; // ZSCALV
+  zcopyv_ker_ft   copyv_kr_ptr; // ZCOPYV
+
+  /*
+    Boolean to check if the y has been packed
+    and memory needs to be freed in the end
+  */
+  bool is_y_temp_buf_created = FALSE;
+
+  switch (id)
+  {
+    case BLIS_ARCH_ZEN4:
+    case BLIS_ARCH_ZEN:
+    case BLIS_ARCH_ZEN2:
+    case BLIS_ARCH_ZEN3:
+
+      /*
+        Assign the AVX2 based kernel function pointers for
+        ZAXPYF, ZSCAL2V, ZSCALV, ZCOPYV and corresponding fusing
+        factor of ZAXPYF kernel
+      */
+
+      axpyf_kr_ptr = bli_zaxpyf_zen_int_4;
+      b_fuse = 4;
+
+      scal2v_kr_ptr = bli_zscal2v_zen_int;
+
+      scalv_kr_ptr = bli_zscalv_zen_int;
+
+      copyv_kr_ptr = bli_zcopyv_zen_int;
+
+      break;
+    default:
+      // For non-Zen architectures, query the context if it is NULL
+      if(cntx == NULL) cntx = bli_gks_query_cntx();
+
+      /*
+        Query the context for the kernel function pointers for
+        ZAXPYF, ZSCAL2V, ZSCALV, ZCOPYV and corresponding fusing
+        factor of ZAXPYF kernel
+      */
+      axpyf_kr_ptr = bli_cntx_get_l1f_ker_dt(BLIS_DCOMPLEX, BLIS_AXPYF_KER, cntx);
+      b_fuse = bli_cntx_get_blksz_def_dt(BLIS_DCOMPLEX, BLIS_AF, cntx);
+
+      scal2v_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCAL2V_KER, cntx);
+
+      scalv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
+
+      copyv_kr_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_COPYV_KER, cntx);
+  }
+
+  /*
+    If alpha is equal to zero, y = beta * y + alpha * A * x
+    reduces to y = beta * y, so packing y would be wasted work:
+    y is only scaled in place with SCALV and returned.
+  */
+  if (incy > 1 && (!bli_zeq0(*alpha)))
+  {
+    /*
+      Initialize the mem pool buffer to NULL and its size to 0.
+      The "buf" and "size" fields are assigned once memory
+      is allocated from the pool in bli_membrk_acquire_m().
+      This ensures bli_mem_is_alloc() sees either an
+      allocated buffer, if one was created, or NULL.
+    */
+    mem_bufY.pblk.buf = NULL;
+    mem_bufY.pblk.block_size = 0;
+    mem_bufY.buf_type = 0;
+    mem_bufY.size = 0;
+    mem_bufY.pool = NULL;
+
+    /*
+      Getting a buffer from the pool via rntm requires access to the memory
+      broker. The following calls initialize rntm for that purpose.
+    */
+
+    bli_rntm_init_from_global(&rntm);
+    bli_rntm_set_num_threads_only(1, &rntm);
+    bli_membrk_rntm_set_membrk(&rntm);
+
+    // Calculate the size required for n_elem dcomplex elements in vector Y.
+    size_t buffer_size = n_elem * sizeof(dcomplex);
+
+#ifdef BLIS_ENABLE_MEM_TRACING
+    printf("bli_zgemv_unf_var2(): get mem pool block\n");
+#endif
+
+    /*
+      Acquire a buffer of n_elem * sizeof(dcomplex) bytes from the memory
+      broker and save the associated mem_t entry to mem_bufY.
+    */
+    bli_membrk_acquire_m(&rntm,
+                          buffer_size,
+                          BLIS_BUFFER_FOR_B_PANEL,
+                          &mem_bufY);
+
+    /* Continue packing Y if buffer memory is allocated */
+    if ((bli_mem_is_alloc(&mem_bufY)))
+    {
+      y_buf = bli_mem_buffer(&mem_bufY);
+      buf_incy = 1;
 
-    /* If beta is zero, use setv. Otherwise, scale by beta. */
-      /* y = beta * y; */
+      // Invoke the ZSCAL2V function using the function pointer
+      scal2v_kr_ptr
+      (
+        BLIS_NO_CONJUGATE,
+        n_elem,
+        beta,
+        y, incy,
+        y_buf, buf_incy,
+        cntx
+      );
 
-    /* beta=0 case is hadled by scalv internally */
-    /*    bli_zscalv_zen_int10
+      /*
+        Set y is packed as the memory allocation was
+        successful and contents have been copied
+      */
+      is_y_temp_buf_created = TRUE;
+    }
+  }
+  if (!is_y_temp_buf_created)
+  {
+    // y was not packed (packing skipped, or the pool buffer was not allocated): scale y in place by beta via the ZSCALV function pointer
+    scalv_kr_ptr
     (
       BLIS_NO_CONJUGATE,
       n_elem,
       beta,
-      y,
-      incy,
+      y_buf, buf_incy,
       cntx
-    );*/
-
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
-    {
-        const num_t dt = PASTEMAC(z,type);
-        /* If beta is zero, use setv. Otherwise, scale by beta. */
-        if ( PASTEMAC(z,eq0)( *beta ) )
-        {
-            dcomplex*  zero = PASTEMAC(z,0);
-            /* y = 0; */
-            PASTEMAC2(z,setv,BLIS_TAPI_EX_SUF)
-            (
-              BLIS_NO_CONJUGATE,
-              n_elem,
-              zero,
-              y, incy,
-              cntx,
-              NULL
-            );
-        }
-        else
-        {
-            /* y = beta * y; */
-            PASTEMAC2(z,scalv,BLIS_TAPI_EX_SUF)
-            (
-              BLIS_NO_CONJUGATE,
-              n_elem,
-              beta,
-              y, incy,
-              cntx,
-              NULL
-            );
-        }
-
-        PASTECH(z,axpyf_ker_ft) kfp_af;
+    );
+  }
 
-        /* Query the context for the kernel function pointer and fusing factor. */
-        kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx );
-        b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx );
+  // If alpha is zero(0), we only need to scalv y and return
+  if (bli_zeq0(*alpha))
+  {
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
 
-        for ( i = 0; i < n_iter; i += f )
-        {
-            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
+    // Return early for alpha is zero(0)
+    return;
+  }
 
-            A1 = a + (0  )*rs_at + (i  )*cs_at;
-            x1 = x + (i  )*incx;
-            y1 = y + (0  )*incy;
+  for (i = 0; i < n_iter; i += f)
+  {
+    f = bli_determine_blocksize_dim_f(i, n_iter, b_fuse);
+    A1 = a + (0) * rs_at + (i)*cs_at;
+    x1 = x + (i)*incx;
+    y1 = y_buf + (0) * buf_incy;
 
-            /* y = y + alpha * A1 * x1; */
-            kfp_af
-            (
-              conja,
-              conjx,
-              n_elem,
-              f,
-              alpha,
-              A1, rs_at, cs_at,
-              x1, incx,
-              y1, incy,
-              cntx
-            );
-        }
-        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
-        return;
-    }
-
-    bli_zscalv_ex
+    // Invoke the ZAXPYF function using the function pointer
+    axpyf_kr_ptr
+    (
+      conja,
+      conjx,
+      n_elem,
+      f,
+      alpha,
+      A1, rs_at, cs_at,
+      x1, incx,
+      y1, buf_incy,
+      cntx
+    );
+  }
+
+  // Check if temp y buffer was used for compute
+  if (is_y_temp_buf_created)
+  {
+    // Store the result from unit strided y_buf to non-unit strided Y
+    // Invoke the ZCOPYV function using the function pointer
+    copyv_kr_ptr
     (
       BLIS_NO_CONJUGATE,
       n_elem,
-      beta,
+      y_buf, buf_incy,
       y, incy,
-      cntx,
-      NULL
+      cntx
     );
 
-    if( bli_zeq0( *alpha ) )
-    {
-        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
-        return;
-    }
+#ifdef BLIS_ENABLE_MEM_TRACING
+    printf("bli_zgemv_unf_var2(): releasing mem pool block\n");
+#endif
 
-    // for non-unit incx, incy and rs_at and conjugate will be added in the next patch
-    if( (incx == 1 && incy == 1 && rs_at == 1 ) &&
-         !bli_is_conj(conja) && !bli_is_conj(conjx) && !bli_is_trans(transa))
-    {
-        // This gemv code deals with the followint conditions only
-        // 1. incx, incy, and row stride equal to one
-        // 2. Non conjugate A matrix and X vector
-        // 3. No Transpose for A Martix
-        // Rest is taken care by the else part (axpyf implementation)
-        bli_zgemv_zen_int_4x4
-        (
-            conja,
-            conjx,
-            m,
-            n,
-            alpha,
-            a, rs_at, cs_at,
-            x, incx,
-            beta,
-            y, incy,
-            cntx
-        );
-    }
-    else
-    {
-        /* fusing factor */
-        b_fuse = 4;
-
-        for ( i = 0; i < n_iter; i += f )
-        {
-            f  = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
-            A1 = a + (0  )*rs_at + (i  )*cs_at;
-            x1 = x + (i  )*incx;
-            y1 = y + (0  )*incy;
-
-            /* y = y + alpha * A1 * x1; */
-            bli_zaxpyf_zen_int_4
-            (
-                conja,
-                conjx,
-                n_elem,
-                f,
-                alpha,
-                A1, rs_at, cs_at,
-                x1, incx,
-                y1, incy,
-                cntx
-            );
-        }
-    }
+    // Return the buffer to pool
+    bli_membrk_release(&rntm, &mem_bufY);
+  }
 
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
+  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
 }
 
 void bli_cgemv_unf_var2
@@ -785,7 +835,7 @@ void bli_cgemv_unf_var2
 
     /* If beta is zero, use setv. Otherwise, scale by beta. */
         /* y = beta * y; */
-    /* beta=0 case is hadled by scalv internally */
+    /* beta=0 case is handled by scalv internally */
     /*bli_cscalv_zen_int10
     (
       BLIS_NO_CONJUGATE,
@@ -796,9 +846,9 @@ void bli_cgemv_unf_var2
       cntx
     );*/
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         const num_t dt = PASTEMAC(c,type);
         /* If beta is zero, use setv. Otherwise, scale by beta. */
diff --git a/frame/2/hemv/CMakeLists.txt b/frame/2/hemv/CMakeLists.txt
index 10e324b52d..c1de90e047 100644
--- a/frame/2/hemv/CMakeLists.txt
+++ b/frame/2/hemv/CMakeLists.txt
@@ -28,4 +28,4 @@ else()
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var1.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_hemv_unf_var3.c
      )
-endif()
\ No newline at end of file
+endif()
diff --git a/frame/2/hemv/bli_hemv_unf_var1_amd.c b/frame/2/hemv/bli_hemv_unf_var1_amd.c
index 6532323d11..7c7f67ab89 100644
--- a/frame/2/hemv/bli_hemv_unf_var1_amd.c
+++ b/frame/2/hemv/bli_hemv_unf_var1_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2021-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -73,7 +73,7 @@ void PASTEMAC(ch,varname) \
 	ctype   conjx_chi11; \
 	ctype   alpha_chi11; \
 	ctype   alpha11_temp; \
-	dim_t   i, k, j; \
+	siz_t   i, k, j; \
 	dim_t   b_fuse, f; \
 	dim_t   n_behind; \
 	dim_t   f_ahead, f_behind; \
@@ -316,9 +316,9 @@ void bli_dhemv_unf_var1
 	 * factor. */
 	/* Assign kernel function pointer and fusing factor. */
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
 		kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
 		b_fuse = 8;
diff --git a/frame/2/hemv/bli_hemv_unf_var3_amd.c b/frame/2/hemv/bli_hemv_unf_var3_amd.c
index 34d40cf5cc..6ce5ae972a 100644
--- a/frame/2/hemv/bli_hemv_unf_var3_amd.c
+++ b/frame/2/hemv/bli_hemv_unf_var3_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -73,7 +73,7 @@ void PASTEMAC(ch,varname) \
 	ctype   conjx_chi11; \
 	ctype   alpha_chi11; \
 	ctype   alpha11_temp; \
-	dim_t   i, k, j; \
+	siz_t   i, k, j; \
 	dim_t   b_fuse, f; \
 	dim_t   n_ahead; \
 	dim_t   f_ahead, f_behind; \
@@ -312,9 +312,9 @@ void bli_dhemv_unf_var3
 
 	PASTECH(d,dotxaxpyf_ker_ft) kfp_dotxaxpyf_ker;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
 		kfp_dotxaxpyf_ker = bli_ddotxaxpyf_zen_int_8;
 		b_fuse = 8;
diff --git a/frame/2/her/CMakeLists.txt b/frame/2/her/CMakeLists.txt
index b97ee3874b..0e0f636681 100644
--- a/frame/2/her/CMakeLists.txt
+++ b/frame/2/her/CMakeLists.txt
@@ -22,4 +22,4 @@ else()
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_unb_var1.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_unb_var2.c
      )
-endif()
\ No newline at end of file
+endif()
diff --git a/frame/2/her/bli_her_unb_var1_amd.c b/frame/2/her/bli_her_unb_var1_amd.c
index 1dcb6d0eeb..eda21ee416 100644
--- a/frame/2/her/bli_her_unb_var1_amd.c
+++ b/frame/2/her/bli_her_unb_var1_amd.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -164,7 +164,7 @@ void PASTEMAC(ch,varname) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
 	/* Redirect to intrinsic implementation of HER for dcomplex */ \
-	if ( bli_cpuid_is_avx_supported() == TRUE && \
+	if ( bli_cpuid_is_avx2fma3_supported() == TRUE && \
              ( rs_c == 1 || cs_c == 1 ) && \
              ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ) && \
              bli_is_conj(conjh) && incx == 1 ) \
@@ -280,4 +280,4 @@ void PASTEMAC(ch,varname) \
 		} \
 	} \
 }
-GENTFUNC( dcomplex, z, her_unb_var1 )
\ No newline at end of file
+GENTFUNC( dcomplex, z, her_unb_var1 )
diff --git a/frame/2/her/bli_her_unb_var2_amd.c b/frame/2/her/bli_her_unb_var2_amd.c
index f16ef42a76..8ab95136e9 100644
--- a/frame/2/her/bli_her_unb_var2_amd.c
+++ b/frame/2/her/bli_her_unb_var2_amd.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -164,7 +164,7 @@ void PASTEMAC(ch,varname) \
 { \
 	const num_t dt = PASTEMAC(ch,type); \
 	/* Redirect to intrinsic implementation of HER for unit increment */ \
-	if ( bli_cpuid_is_avx_supported() == TRUE && \
+	if ( bli_cpuid_is_avx2fma3_supported() == TRUE && \
              ( rs_c == 1 || cs_c == 1 ) && \
              ( bli_is_upper( uplo ) || bli_is_lower( uplo ) ) && \
              bli_is_conj(conjh) && incx == 1 ) \
@@ -280,4 +280,4 @@ void PASTEMAC(ch,varname) \
 		} \
 	} \
 }
-GENTFUNC( dcomplex, z, her_unb_var2 )
\ No newline at end of file
+GENTFUNC( dcomplex, z, her_unb_var2 )
diff --git a/frame/2/her2/CMakeLists.txt b/frame/2/her2/CMakeLists.txt
index cfdeb2480d..817e55cb10 100644
--- a/frame/2/her2/CMakeLists.txt
+++ b/frame/2/her2/CMakeLists.txt
@@ -26,4 +26,4 @@ else()
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unf_var1.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_unf_var4.c
      )
-endif()
\ No newline at end of file
+endif()
diff --git a/frame/2/her2/bli_her2_unf_var1_amd.c b/frame/2/her2/bli_her2_unf_var1_amd.c
index 31667cc3e4..b1ccf069b0 100644
--- a/frame/2/her2/bli_her2_unf_var1_amd.c
+++ b/frame/2/her2/bli_her2_unf_var1_amd.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -252,7 +252,7 @@ void bli_dher2_unf_var1
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 	kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
 
-	if ( (bli_cpuid_is_avx_supported() == TRUE)
+	if ( (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	   && (incx == 1)
 	   && (incy == 1)
 	   && (rs_ct == 1))
diff --git a/frame/2/her2/bli_her2_unf_var4_amd.c b/frame/2/her2/bli_her2_unf_var4_amd.c
index 6e999be7d1..7d6f93ead6 100644
--- a/frame/2/her2/bli_her2_unf_var4_amd.c
+++ b/frame/2/her2/bli_her2_unf_var4_amd.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -249,7 +249,7 @@ void bli_dher2_unf_var4
 	if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 	kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx );
 
-	if ( (bli_cpuid_is_avx_supported() == TRUE)
+	if ( (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	   && (incx == 1)
 	   && (incy == 1)
 	   && (rs_ct == 1))
diff --git a/frame/2/trsv/bli_trsv_unf_var1_amd.c b/frame/2/trsv/bli_trsv_unf_var1_amd.c
index 4f026f2c6a..cf17eece24 100644
--- a/frame/2/trsv/bli_trsv_unf_var1_amd.c
+++ b/frame/2/trsv/bli_trsv_unf_var1_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -295,9 +295,9 @@ void bli_dtrsv_unf_var1
 
     PASTECH(d,dotxf_ker_ft) kfp_df;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_df = bli_ddotxf_zen_int_8;
 	    b_fuse = 8;
     }
@@ -496,9 +496,9 @@ void bli_strsv_unf_var1
 
     PASTECH(s,dotxf_ker_ft) kfp_df;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_df = bli_sdotxf_zen_int_8;
 	    b_fuse = 8;
     }
diff --git a/frame/2/trsv/bli_trsv_unf_var2_amd.c b/frame/2/trsv/bli_trsv_unf_var2_amd.c
index 51bbcabab7..aa56ed1523 100644
--- a/frame/2/trsv/bli_trsv_unf_var2_amd.c
+++ b/frame/2/trsv/bli_trsv_unf_var2_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -297,9 +297,9 @@ void bli_dtrsv_unf_var2
 
     PASTECH(d,axpyf_ker_ft) kfp_af;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_daxpyf_zen_int_16x4;
 	    b_fuse = 4;
     }
@@ -496,9 +496,9 @@ void bli_strsv_unf_var2
 
     PASTECH(s, axpyf_ker_ft) kfp_af;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_saxpyf_zen_int_5;
 	    b_fuse = 5;
     }
@@ -695,9 +695,9 @@ void bli_ztrsv_unf_var2
 
     PASTECH(z, axpyf_ker_ft) kfp_af;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_zaxpyf_zen_int_5;
 	    b_fuse = 5;
     }
@@ -893,9 +893,9 @@ void bli_ctrsv_unf_var2
 
     PASTECH(c, axpyf_ker_ft) kfp_af;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    kfp_af = bli_caxpyf_zen_int_5;
 	    b_fuse = 5;
     }
diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c
index f8349f8810..2a04ef14d8 100644
--- a/frame/3/bli_l3_oapi.c
+++ b/frame/3/bli_l3_oapi.c
@@ -189,7 +189,7 @@ void PASTEMAC(opname,EX_SUF) \
 	     bli_obj_is_complex( a ) && \
 	     bli_obj_is_complex( b ) ) \
 	{ \
-        /* GEMMT Todo: Currently we support only native implemenation
+        /* GEMMT Todo: Currently we support only native implementation
          for complex datatypes.*/ \
 		PASTEMAC(opname,nat)( alpha, a, b, beta, c, cntx, rntm ); \
 	} \
diff --git a/frame/3/bli_l3_packm.c b/frame/3/bli_l3_packm.c
index 1134bdc1fd..5d6914ec51 100644
--- a/frame/3/bli_l3_packm.c
+++ b/frame/3/bli_l3_packm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -164,7 +164,7 @@ void bli_l3_packm
 	// with the mem_t entry acquired from the memory broker (now cached in
 	// the control tree node).
 	void* buf = bli_mem_buffer( cntl_mem_p );
-    bli_obj_set_buffer( buf, x_pack );
+	bli_obj_set_buffer( buf, x_pack );
 
 
 	// Pack the contents of object x to object x_pack.
diff --git a/frame/3/bli_l3_smart_threading.c b/frame/3/bli_l3_smart_threading.c
index e4b9b43e24..942f50df7e 100644
--- a/frame/3/bli_l3_smart_threading.c
+++ b/frame/3/bli_l3_smart_threading.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -134,6 +134,50 @@ err_t bli_check_and_transform_native_to_SUP
        rntm_t* rntm
      );
 
+static err_t bli_gemm_ic_jc_optimum_sup_zen4
+     (
+       num_t dt,
+       siz_t elem_size,
+       const bool is_rrr_rrc_rcr_crr,
+       const dim_t m,
+       const dim_t n,
+       const dim_t k,
+       const dim_t max_available_nt,
+       cntx_t* cntx,
+       rntm_t* rntm
+     );
+
+static void bli_gemm_cache_heur_adjust_ic_jc_sup_zen4
+     (
+       const dim_t m,
+       const dim_t n,
+       const dim_t k,
+       dim_t nt,
+       dim_t* ic,
+       dim_t* jc,
+       const dim_t MR,
+       const dim_t NR,
+       const dim_t MC,
+       const dim_t KC
+     );
+
+err_t bli_check_and_transform_native_to_SUP_zen4
+     (
+       num_t dt,
+       siz_t elem_size,
+       const bool is_rrr_rrc_rcr_crr,
+       const dim_t m,
+       const dim_t n,
+       const dim_t k,
+       dim_t ic,
+       dim_t jc,
+       const dim_t NR,
+       const dim_t MC,
+       const dim_t KC,
+       cntx_t* cntx,
+       rntm_t* rntm
+     );
+
 err_t bli_gemm_smart_threading_sup
      (
        num_t dt,
@@ -149,7 +193,7 @@ err_t bli_gemm_smart_threading_sup
 {
 	err_t ret_val = BLIS_FAILURE;
 
-	// Sanity check, max available threads should be atleast 4 for the
+	// Sanity check, max available threads should be at least 4 for the
 	// smart threading/factorization to be meaningful. For nt < 4 the
 	// default ic,jc factorization holds good.
 	if ( ( m <= 1 ) || ( n <= 1 ) ||  ( k <= 1 ) || ( max_available_nt < 4 ) )
@@ -206,6 +250,14 @@ static err_t bli_gemm_ic_jc_optimum_sup_arch_dispatcher
 				    max_available_nt, cntx, rntm
 				  );
 	}
+	else if ( id == BLIS_ARCH_ZEN4 )
+	{
+		ret_val = bli_gemm_ic_jc_optimum_sup_zen4
+				  (
+				    dt, elem_size, is_rrr_rrc_rcr_crr,  m, n, k,
+				    max_available_nt, cntx, rntm
+				  );
+	}
 	else
 	{
 		// Other architectures not supported for now.
@@ -554,4 +606,343 @@ err_t bli_check_and_transform_native_to_SUP
 }
 // close zen3 region.
 
+// begin zen4 region
+#define NUM_CORES_PER_CCD_ZEN4 12
+
+// Determines the optimal number of threads (nt) and corresponding work split
+//  (ic,jc factorization of nt) for gemm on zen4 machines.
+static err_t bli_gemm_ic_jc_optimum_sup_zen4
+     (
+       num_t dt,
+       siz_t elem_size,
+       const bool is_rrr_rrc_rcr_crr,
+       const dim_t m,
+       const dim_t n,
+       const dim_t k,
+       const dim_t max_available_nt,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	err_t ret_val = BLIS_SUCCESS;
+
+	const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx );
+	const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx );
+	const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx );
+	const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx );
+	const dim_t KC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx );
+
+	dim_t ic = -1;
+	dim_t jc = -1;
+
+	// Partitioning nt into m & n such that m/n ~= work1/work2.
+	bli_thread_partition_2x2( max_available_nt, m, n, &ic, &jc );
+
+	dim_t jc_per_ccd = ( NUM_CORES_PER_CCD_ZEN4 + ic - 1 ) / ic ;
+	dim_t b_mat_data_per_ccd = jc_per_ccd * ( n / jc );
+
+	// All the cores (12) on a CCD share a L3 cache and hence total data
+	// loaded by the cores on a CCD should be < NC to avoid L3 contention.
+	// In cases where it is violated, it is better to increase ic and
+	// reduce B data per CCD, using micro panels mu, nu for thread
+	// partitioning can help achieve this. Avoiding further ic,jc
+	// adjustment in this case.
+	if ( b_mat_data_per_ccd > NC )
+	{
+		const dim_t mu = m / MR;
+		const dim_t nu = n / NR;
+		bli_thread_partition_2x2( max_available_nt, mu, nu, &ic, &jc );
+	}
+	else
+	{
+		// Adjust the ic,jc in the best match so that m_ic and n_jc
+		// turns out to be more cache friendly.
+		bli_gemm_cache_heur_adjust_ic_jc_sup_zen4
+		(
+		  m, n, k, max_available_nt, &ic, &jc, MR, NR, MC, KC
+		);
+	}
+
+	ret_val = bli_check_and_transform_native_to_SUP_zen4
+			  (
+			    dt, elem_size, is_rrr_rrc_rcr_crr, m, n, k,
+			    ic, jc, NR, MC, KC, cntx, rntm
+			  );
+
+	if ( ret_val == BLIS_SUCCESS )
+	{
+		bli_rntm_set_ic_ways_only( ic, rntm );
+		bli_rntm_set_jc_ways_only( jc, rntm );
+	}
+
+	return ret_val;
+}
+
+// The factorization of nt into ic,jc is based on m and n values (for simplicity
+// it can be assumed to be based on m:n ratio). It does not take into account
+// how the matrices are loaded into cache or which matrix goes to the larger
+// cache. Depending on the matrix dimensions, increasing the ic can result in
+// reduced loads from main memory to L2 cache for A matrix without any impact on
+// B matrix load (since B is streamed into L3, which is larger). Similarly,
+// adjusting jc can result in B matrix panels fitting perfectly within the L1
+// cache. This function makes these adjustments on ic,jc.
+static void bli_gemm_cache_heur_adjust_ic_jc_sup_zen4
+     (
+       const dim_t m,
+       const dim_t n,
+       const dim_t k,
+       dim_t nt,
+       dim_t* ic,
+       dim_t* jc,
+       const dim_t MR,
+       const dim_t NR,
+       const dim_t MC,
+       const dim_t KC
+     )
+{
+	const dim_t m_ic = m / ( *ic );
+	const dim_t n_jc = n / ( *jc );
+	const int64_t cur_work_per_thread = m_ic + n_jc;
+
+	// The next and prev factors are calculated with respect to the current
+	// factor part of nt. In effect
+	// 1. next ic * prev jc = nt
+	// 2. prev ic * next jc = nt
+	// 3. ic * jc = nt
+	const dim_t next_ic = next_factor( nt, ( *ic ) );
+	const dim_t prev_ic = prev_factor( nt, ( *ic ) );
+	const dim_t next_jc = next_factor( nt, ( *jc ) );
+	const dim_t prev_jc = prev_factor( nt, ( *jc ) );
+
+	const dim_t m_next_ic = m / next_ic;
+	const dim_t m_prev_ic = m / prev_ic;
+	const dim_t n_next_jc = n / next_jc;
+	const dim_t n_prev_jc = n / prev_jc;
+	const dim_t n_jc_modulo_NR = n_jc % NR;
+	const dim_t n_prev_jc_modulo_NR = n_prev_jc % NR;
+
+	const int64_t next_jc_work_per_thread = n_next_jc + m_prev_ic;
+	const int64_t next_ic_work_per_thread = m_next_ic + n_prev_jc;
+
+	const dim_t MCx2 = MC * 2;
+	const dim_t NRx4 = NR * 4;
+	const dim_t NRx8 = NR * 8;
+
+	// MC will be reduced if the following mods are zero. Incrementing jc
+	// helps in this case.
+	const dim_t n_mod_256 = n % 256;
+	const dim_t k_mod_256 = k % 256;
+
+	const dim_t k_factor = k / KC;
+
+	bool can_increase_jc = FALSE;
+	bool can_increase_ic = FALSE;
+
+	// jc adjustment towards next highest factor if it results in n_jc*KC
+	// fitting completely within l1d cache. Only done if ic prev factor
+	// does not move m_prev_ic out of good l2 load zone (MC).
+	// Performance improvement also observed when n_jc is a multiple of NR.
+	if ( ( ( *ic ) > 1 ) && ( ( *jc ) < nt ) )
+	{
+		// Check whether m_prev_ic remains in good l2 load zone.
+		if ( ( ( ( m_ic <= MC ) && ( m_prev_ic <= MC ) ) ||
+			   ( m_ic > MC ) ) &&
+			 ( ( n_jc > NR ) && ( n_next_jc == NR ) ) )
+		{
+			can_increase_jc = TRUE;
+		}
+		// 2x2 factorization doesn't always give equal sum partition.
+		else if ( next_jc_work_per_thread < cur_work_per_thread )
+		{
+			can_increase_jc = TRUE;
+		}
+	}
+
+	// Consider moving ic to its next highest factor (and jc to its previous
+	// factor); this is only possible when ic < nt and jc > 1.
+	if ( ( ( *ic ) < nt ) && ( ( *jc ) > 1) )
+	{
+		// ic adjustment towards next highest factor if it results in
+		// m_next_ic <= MC. This helps in reducing number of A matrix
+		// loads per thread to l2 from main memory.
+		if ( ( m_ic > MC ) && ( m_next_ic <= MC ) &&
+			 ( m_next_ic >= MR ) && ( k_factor > 4 ) )
+		{
+			can_increase_ic = TRUE;
+		}
+		// ic adjustment towards next highest factor resulted in better
+		// performance when m is sufficiently larger than n and jc prev
+		// factor did not result in n_prev_jc moving out of good l2
+		// load zone (n_jc < 64).
+		else if ( ( m > ( 5 * n ) ) && ( m_ic >= MCx2 ) && ( k_factor > 4 ) &&
+				  ( ( n_jc > NRx4 ) ||
+					( ( n_jc <= NRx4 ) && ( n_prev_jc <= NRx4 ) ) ) )
+		{
+			can_increase_ic = TRUE;
+		}
+		// Performance improvement also observed when n_jc is a multiple
+		// of NR.
+		else if ( ( n_jc_modulo_NR != 0 ) && ( n_prev_jc_modulo_NR == 0 ) &&
+				  ( k_factor > 4 ) )
+		{
+			can_increase_ic = TRUE;
+		}
+		// 2x2 factorization doesn't always give equal sum partition.
+		else if ( next_ic_work_per_thread <= cur_work_per_thread )
+		{
+			can_increase_ic = TRUE;
+		}
+	}
+
+	// Favor jc if both n and k are multiples of 256 ( high cache line
+	// replacement ).
+	if ( ( n_mod_256 == 0 ) && ( k_mod_256 == 0 ) && ( k > KC ) )
+	{
+		if ( can_increase_ic == TRUE )
+		{
+			can_increase_ic = FALSE;
+		}
+		else if ( can_increase_jc == FALSE )
+		{
+			can_increase_jc = TRUE;
+		}
+	}
+	// If only one of either n or k is a multiple of 256, favour jc if n per
+	// thread is within a heuristic factor of NR.
+	else if ( ( ( n_mod_256 == 0 ) || ( k_mod_256 == 0 ) ) && ( k > KC ) )
+	{
+		if ( ( can_increase_ic == TRUE ) && ( n_jc <= NRx8 ) )
+		{
+			can_increase_ic = FALSE;
+		}
+		else if ( ( can_increase_jc == FALSE ) && ( n_next_jc <= NRx8 ) )
+		{
+			can_increase_jc = TRUE;
+		}
+	}
+
+	// Increasing ic factor is given a higher priority compared to jc
+	// since it was observed that the A matrix loads (main memory -> l2) had
+	// more impact on perf compared to B matrix (main memory -> l3 -> l1)
+	// for the sizes considered.
+	if ( can_increase_ic )
+	{
+		// It is expected that the larger dimension (m or n) will be
+		// allocated a larger share of the thread factorization.
+		if ( ( ( m >= n ) && ( next_ic >= prev_jc ) ) ||
+		     ( ( m <= n ) && ( next_ic <= prev_jc ) ) )
+		{
+			*ic = next_ic;
+			*jc = prev_jc;
+		}
+	}
+	else if ( can_increase_jc )
+	{
+		// It is expected that the larger dimension (m or n) will be
+		// allocated a larger share of the thread factorization.
+		if ( ( ( m >= n ) && ( prev_ic >= next_jc ) ) ||
+		     ( ( m <= n ) && ( prev_ic <= next_jc ) ) )
+		{
+			*ic = prev_ic;
+			*jc = next_jc;
+		}
+	}
+}
+
+// It was observed that the SUP thresholds can be lowered and applied on a
+// per thread basis in multi threaded scenarios.
+err_t bli_check_and_transform_native_to_SUP_zen4
+     (
+       num_t dt,
+       siz_t elem_size,
+       const bool is_rrr_rrc_rcr_crr,
+       const dim_t m,
+       const dim_t n,
+       const dim_t k,
+       dim_t ic,
+       dim_t jc,
+       const dim_t NR,
+       const dim_t MC,
+       const dim_t KC,
+       cntx_t* cntx,
+       rntm_t* rntm
+     )
+{
+	err_t ret_val = BLIS_FAILURE;
+	dim_t m_ic;
+	dim_t n_jc;
+
+	const dim_t MT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx );
+	const dim_t NT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx );
+	const dim_t KT = bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx );
+
+	const dim_t MT_2 = MT / 2;
+	const dim_t NTx4 = NT * 4;
+	const dim_t NRx8 = NR * 8;
+
+	const dim_t page_size = bli_info_get_page_size();
+	const dim_t page_size_b_float = page_size / ( dim_t ) elem_size;
+	const dim_t page_size_b_floatx4 = page_size_b_float * 4;
+
+	// Default SUP check without considering per thread dimensions.
+	if ( ( k < KT ) || ( m < MT ) || ( n < NT ) )
+	{
+		ret_val = BLIS_SUCCESS;
+	}
+	// Per thread SUP limit checking.
+	else if ( ( m >= MT ) && ( n >= NT ) && ( k <= page_size_b_floatx4 ) )
+	{
+		m_ic = m / ic;
+		n_jc = n / jc;
+
+		// In multi-threaded scenario, it was observed that if the per
+		// thread m dimension(A matrix) and n dimension(B matrix) is
+		// within a factor of SUP limits, SUP path without packing
+		// resulted in gains. Along similar lines, if the B matrix is
+		// large enough and reuse is good, packing B matrix alone in SUP
+		// resulted in perf gains.
+		if ( ( m_ic <= MT_2 ) && ( n_jc < NTx4 ) )
+		{
+			if ( ( k > KC ) &&
+			     ( m_ic >= MC ) && ( n_jc >= NT ) )
+			{
+				if ( is_rrr_rrc_rcr_crr )
+				{
+					bli_rntm_set_pack_b( 1, rntm );
+				}
+				else
+				{
+					bli_rntm_set_pack_a( 1, rntm );
+				}
+			}
+			ret_val = BLIS_SUCCESS;
+		}
+		else if ( ( n_jc <= NT ) && ( m_ic <= MT ) )
+		{
+			if ( ( k > KC ) && ( m_ic >= MC ) && ( n_jc >= NRx8 ) )
+			{
+				if ( is_rrr_rrc_rcr_crr )
+				{
+					bli_rntm_set_pack_b( 1, rntm );
+				}
+				else
+				{
+					bli_rntm_set_pack_a( 1, rntm );
+				}
+			}
+			ret_val = BLIS_SUCCESS;
+		}
+		else
+		{
+			ret_val = BLIS_FAILURE;
+		}
+	}
+	else
+	{
+		ret_val = BLIS_FAILURE;
+	}
+
+	return ret_val;
+}
+// end zen4 region
 #endif
diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c
index 867ccd200c..5ee53bf951 100644
--- a/frame/3/bli_l3_sup.c
+++ b/frame/3/bli_l3_sup.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -73,6 +73,7 @@ err_t bli_gemmsup
     trans_t transa = bli_obj_conjtrans_status( a );
     trans_t transb = bli_obj_conjtrans_status( b );
 
+
     //Don't use sup for currently unsupported storage types in cgemmsup
     if(bli_obj_is_scomplex(c) &&
     (((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
@@ -86,9 +87,8 @@ err_t bli_gemmsup
 
     //Don't use sup for currently unsupported storage types  in zgemmsup
     if(bli_obj_is_dcomplex(c) &&
-    (((stor_id == BLIS_RRC)||(stor_id == BLIS_CRC))
-    || ((transa == BLIS_CONJ_NO_TRANSPOSE) || (transa == BLIS_CONJ_TRANSPOSE))
-    || ((transb == BLIS_CONJ_NO_TRANSPOSE) || (transb == BLIS_CONJ_TRANSPOSE))
+    (((transa == BLIS_CONJ_NO_TRANSPOSE) || (transa == BLIS_CONJ_TRANSPOSE)) ||
+     ((transb == BLIS_CONJ_NO_TRANSPOSE) || (transb == BLIS_CONJ_TRANSPOSE))
     )){
 	//printf(" gemmsup: Returning with for un-supported storage types and conjugate property in zgemmsup \n");
 	AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_2, "SUP - Unsuppported storage type for zgemm.");
@@ -96,7 +96,7 @@ err_t bli_gemmsup
     }
 
 
-    // Obtain a valid (native) context from the gks if necessary.
+    // Obtain a valid context from the gks if necessary.
     // NOTE: This must be done before calling the _check() function, since
     // that function assumes the context pointer is valid.
     if ( cntx == NULL ) cntx = bli_gks_query_cntx();
@@ -107,6 +107,21 @@ err_t bli_gemmsup
     if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
     else                { rntm_l = *rntm;                       rntm = &rntm_l; }
 
+#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
+
+    if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
+    {
+        if(( bli_obj_dt(a) == BLIS_DOUBLE ) || ( bli_obj_dt(a) == BLIS_DCOMPLEX))
+        {
+            // Pack A to avoid RD kernels.
+            if((stor_id == BLIS_CRC || stor_id == BLIS_RRC))
+            {
+                bli_rntm_set_pack_a(1, rntm);//packa
+            }
+        }
+    }
+#endif
+
 #ifdef AOCL_DYNAMIC
     // Calculating optimal nt and corresponding factorization (ic,jc) here, so
     // as to determine the matrix dimensions (A - m, B - n) per thread. This
@@ -235,6 +250,8 @@ err_t bli_gemmtsup
     // that function assumes the context pointer is valid.
     if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
+    cntx_t cntx_gemmt = *cntx;
+
     thresh_func_ft func_fp;
 
     func_fp = bli_cntx_get_l3_thresh_func(BLIS_GEMMT, cntx);
@@ -251,6 +268,19 @@ err_t bli_gemmtsup
     if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
     else                { rntm_l = *rntm;                       rntm = &rntm_l; }
 
+#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
+
+    if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
+    {
+        if( bli_obj_dt(a) != BLIS_SCOMPLEX )
+        {
+            // override the existing blocksizes with AVX-2 specific ones.
+            // Since gemmt has a triangular matrix as output, near-to-square
+            // shaped kernels perform better than skewed/rectangular shaped kernels.
+            bli_zen4_override_gemmt_blkszs(&cntx_gemmt);
+        }
+    }
+#endif
 #ifdef AOCL_DYNAMIC
 	// If dynamic-threading is enabled, calculate optimum number
 	// of threads and update in rntm
@@ -292,7 +322,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
       b,
       beta,
       c,
-      cntx,
+      &cntx_gemmt,
       rntm
     );
 
@@ -368,6 +398,8 @@ err_t bli_syrksup
     // that function assumes the context pointer is valid.
     if ( cntx == NULL ) cntx = bli_gks_query_cntx();
 
+    cntx_t cntx_syrk = *cntx;
+
     thresh_func_ft func_fp = bli_cntx_get_l3_thresh_func(BLIS_SYRK, cntx);
     if( !func_fp( a, &at_local, c, cntx))
     {
@@ -381,6 +413,20 @@ err_t bli_syrksup
     if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; }
     else                { rntm_l = *rntm;                       rntm = &rntm_l; }
 
+#if defined(BLIS_FAMILY_ZEN4) || defined(BLIS_FAMILY_AMDZEN)
+
+    if((bli_arch_query_id() == BLIS_ARCH_ZEN4))
+    {
+        if( bli_obj_dt(a) != BLIS_SCOMPLEX )
+        {
+            // Override the existing blocksizes with AVX2-specific ones.
+            // Since syrk, like gemmt, has a triangular matrix as output, near-square
+            // kernels perform better than skewed/rectangular kernels.
+            bli_zen4_override_gemmt_blkszs(&cntx_syrk);
+        }
+    }
+#endif
+
 #ifdef AOCL_DYNAMIC // Will change this name later to BLIS_SMART_THREAD
   // If dynamic-threading is enabled, calculate optimum
   // number of threads.
@@ -421,7 +467,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n",
       &at_local,
       beta,
       c,
-      cntx,
+      &cntx_syrk,
       rntm
     );
 
diff --git a/frame/3/bli_l3_sup_int_amd.c b/frame/3/bli_l3_sup_int_amd.c
index b226b135d0..029c383dc1 100644
--- a/frame/3/bli_l3_sup_int_amd.c
+++ b/frame/3/bli_l3_sup_int_amd.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -66,6 +66,15 @@ err_t bli_gemmsup_int
 	                                     stor_id == BLIS_RRC ||
 	                                     stor_id == BLIS_RCR ||
 	                                     stor_id == BLIS_CRR );
+	const bool    is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr;
+	const bool    row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+	const bool    col_pref = !row_pref;
+
+	// For row-preferred kernels, rrr_rrc_rcr_crr becomes primary
+	// For col-preferred kernels, rcc_crc_ccr_ccc becomes primary
+	const bool    is_primary = ( row_pref && is_rrr_rrc_rcr_crr ) ||
+		                   ( col_pref && is_rcc_crc_ccr_ccc );
+
 	#ifdef TRACEVAR
 	if ( bli_thread_am_ochief( thread ) )
 	  printf( "bli_l3_sup_int(): var2m primary\n" );
@@ -78,15 +87,14 @@ err_t bli_gemmsup_int
 		return BLIS_FAILURE;
 	}
 
-	if ( is_rrr_rrc_rcr_crr )
+	if ( is_primary )
 	{
 	  // This branch handles:
 	  //  - rrr rrc rcr crr for row-preferential kernels
 	  //  - rcc crc ccr ccc for column-preferential kernels
-	  //  - Currently only row-preferential kernels are only supported.
 
 	  // calculate number of micropanels in m and n dimensions and
-	  // recalculate the automatic thread factorization based on these number of  micropanels 
+	  // recalculate the automatic thread factorization based on the number of micropanels
 	  const dim_t mu = m / MR;
 	  const dim_t nu = n / NR;
 
@@ -120,13 +128,18 @@ err_t bli_gemmsup_int
 	  if (bli_is_dcomplex(dt) && (n_threads == 1))
 	  {
 		  if ((m > 55) && (k > 55) && (n > 55))
-			  bli_rntm_set_pack_b(1, rntm);//packb
+		  {
+				if ( row_pref )
+					bli_rntm_set_pack_b(1, rntm);//packb
+		  }
 	  }
 
-	  //Enable packing of B matrix for double data type when dims at per 
-	  //thread level are above caches and enable packing of A when transA 
+#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
+
+	  //Enable packing of B matrix for double data type when dims at per
+	  //thread level are above caches and enable packing of A when transA
 	  //(RRC or CRC storage ids) to avoid rd kernels
-	  if(bli_is_double(dt))
+	  if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3))
 	  {
 		  dim_t m_pt = (m/bli_rntm_ways_for( BLIS_MC, rntm ));
 		  dim_t n_pt = (n/bli_rntm_ways_for( BLIS_NC, rntm ));
@@ -137,12 +150,12 @@ err_t bli_gemmsup_int
 			  {
 				  bli_rntm_set_pack_b(1, rntm);//packb
 
-				  if(stor_id==BLIS_RRC || stor_id==BLIS_CRC) 
+				  if(( stor_id==BLIS_RRC ) || ( stor_id==BLIS_CRC ))
 					bli_rntm_set_pack_a(1, rntm);//packa
 			  }
 		  }
 	  }
-
+#endif
 	  // Using the 1n kernel (B broadcast) gave better performance for sgemm
 	  // in single-thread scenario, given the number of n panels are
 	  // sufficiently larger than m panels.
@@ -164,7 +177,6 @@ err_t bli_gemmsup_int
 	  // This branch handles:
 	  //  - rrr rrc rcr crr for column-preferential kernels
 	  //  - rcc crc ccr ccc for row-preferential kernels
-          //  - Currently only row-preferential kernels are only supported.
 	  const dim_t mu = n / MR; // the n becomes m after a transposition
 	  const dim_t nu = m / NR; // the m becomes n after a transposition
 
@@ -183,40 +195,45 @@ err_t bli_gemmsup_int
 	      bli_l3_sup_thrinfo_update_root( rntm, thread );
 	  }
 
-	  /* Enable packing for B matrix for higher sizes. Note that pack A 
+	  /* Enable packing for B matrix for higher sizes. Note that pack A
 	   * becomes pack B inside var2m because this is transpose case*/
 	  if(bli_is_float(dt) && (n_threads==1)) {
               if((m > 240) &&  (k > 240) && (n > 240))
 	          bli_rntm_set_pack_a( 1, rntm );//packb
 	  }
 
-	  /*Enable packing of A matrix for complex data type*/
+	  //Enable packing of A matrix for complex data type
 	  if (bli_is_dcomplex(dt) && (n_threads == 1))
 	  {
 		  if ((m > 55) && (k > 55) && (n > 55))
-			  bli_rntm_set_pack_a(1, rntm);//packb
+		  {
+				if ( row_pref )
+					bli_rntm_set_pack_a(1, rntm);//packb
+		  }
 	  }
 
-	  //Enable packing of B matrix for double data type when dims at per 
-	  //thread level are above caches and enable packing of A when transA 
+#if defined(BLIS_FAMILY_ZEN3) || defined(BLIS_FAMILY_AMDZEN)
+
+	  //Enable packing of B matrix for double data type when dims at per
+	  //thread level are above caches and enable packing of A when transA
 	  //(RRC or CRC storage ids) to avoid rd kernels
-	  if(bli_is_double(dt))
+	  if(bli_is_double(dt) && (bli_arch_query_id() == BLIS_ARCH_ZEN3))
 	  {
 		  dim_t m_pt = (m/bli_rntm_ways_for( BLIS_NC, rntm ));
 		  dim_t n_pt = (n/bli_rntm_ways_for( BLIS_MC, rntm ));
 
 		  if(k > 120)
 		  {
-			  if(((m_pt > 320) && (n_pt > 120)) || ((m_pt > 120) && (n_pt > 320))) 
+			  if(((m_pt > 320) && (n_pt > 120)) || ((m_pt > 120) && (n_pt > 320)))
 			  {
 				  bli_rntm_set_pack_a(1, rntm);//packb
 
-				  if(stor_id==BLIS_RRC || stor_id==BLIS_CRC) 
+				  if(( stor_id==BLIS_RRC ) || ( stor_id==BLIS_CRC ))
 					bli_rntm_set_pack_b(1, rntm);//packa
 			  }
 		  }
 	  }
- 
+#endif
 	  if ( bli_is_float( dt ) && ( n_threads == 1 ) && ( use_pb == TRUE ) )
 	  {
 		bli_gemmsup_ref_var1n( BLIS_TRANSPOSE,
@@ -327,16 +344,15 @@ err_t bli_gemmtsup_int
 			// new ways of parallelism value for the jc loop.
 			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
 			bli_l3_sup_thrinfo_update_root( rntm, thread );
-			/* Enable packing for B matrix for higher sizes. Note that pack B
-			 * * becomes pack A inside var2m because this is transpose case*/
-			if(bli_is_double(dt) && ((n_threads==1)))
-			{
-				if((m > 320) &&  (k > 50))
-					bli_rntm_set_pack_b( 1, rntm );
-			}
-
 		}
 
+		/* Enable packing for B matrix for higher sizes. Note that pack B
+		 * becomes pack A inside var2m because this is the transpose case. */
+		if(bli_is_double(dt) && ((n_threads==1)))
+		{
+			if((m > 320) &&  (k > 50))
+				bli_rntm_set_pack_b( 1, rntm );
+		}
 
 		if ( use_bp )
 		{
@@ -401,14 +417,14 @@ err_t bli_gemmtsup_int
 			// new ways of parallelism value for the jc loop.
 			bli_rntm_set_ways_only( jc_new, 1, ic_new, 1, 1, rntm );
 			bli_l3_sup_thrinfo_update_root( rntm, thread );
+		}
 
-			/* Enable packing for A matrix for higher sizes. Note that pack A
-			 * * becomes pack B inside var2m because this is transpose case*/
-			if(bli_is_double(dt) && (n_threads==1))
-			{
-				if((m > 320) &&  (k > 50))
-					bli_rntm_set_pack_a( 1, rntm );
-			}
+		/* Enable packing for A matrix for higher sizes. Note that pack A
+		 * becomes pack B inside var2m because this is the transpose case. */
+		if(bli_is_double(dt) && (n_threads==1))
+		{
+			if((m > 320) &&  (k > 50))
+				bli_rntm_set_pack_a( 1, rntm );
 		}
 
 
diff --git a/frame/3/bli_l3_sup_ker_prot.h b/frame/3/bli_l3_sup_ker_prot.h
index 899a47d3fa..9643e04bd5 100644
--- a/frame/3/bli_l3_sup_ker_prot.h
+++ b/frame/3/bli_l3_sup_ker_prot.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -54,3 +54,29 @@ void PASTEMAC(ch,opname) \
        cntx_t*    restrict cntx  \
      );
 
+
+
+#define TRSMSMALL_PROT( opname ) \
+\
+err_t PASTEMAC0(opname) \
+     ( \
+       side_t   side, \
+       obj_t*   alpha, \
+       obj_t*   a, \
+       obj_t*   b, \
+       cntx_t*  cntx, \
+       cntl_t*  cntl, \
+       bool     is_parallel \
+     );
+
+#define TRSMSMALL_KER_PROT( ch, opname ) \
+\
+BLIS_INLINE err_t PASTEMAC(ch,opname) \
+     ( \
+       obj_t*   AlphaObj, \
+       obj_t*   a, \
+       obj_t*   b, \
+       cntx_t*  cntx, \
+       cntl_t*  cntl \
+     );
+
diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h
index 7c315192d5..4b04354475 100644
--- a/frame/3/bli_l3_sup_vars.h
+++ b/frame/3/bli_l3_sup_vars.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -198,9 +198,28 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases
 	}
 	else
 	{
-		//bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );
-		printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels.\n" );
-		bli_abort();
+		if ( ( dt == BLIS_DOUBLE ) || ( dt == BLIS_DCOMPLEX ) )
+		{
+			// The optimizations are only done for CRC and RRC storage schemes to avoid RD kernels.
+			// Optimizations for other storage schemes are yet to be done.
+			if ( packa )
+			{
+				if( *eff_id == BLIS_CRC )
+				{
+					*eff_id = BLIS_CCC;
+				}
+				else if ( *eff_id == BLIS_RRC )
+				{
+					*trans = bli_trans_toggled( *trans );
+					*eff_id = BLIS_RCC;
+				}
+			}
+		}
+		else
+		{
+			printf( "libblis: sup var1n2m_opt_cases not yet implemented for column-preferential kernels for S, C and Z datatypes.\n" );
+			bli_abort();
+		}
 	}
 }
 
diff --git a/frame/3/gemm/bli_gemm_front_amd.c b/frame/3/gemm/bli_gemm_front_amd.c
index b15d906dd8..b64baf0001 100644
--- a/frame/3/gemm/bli_gemm_front_amd.c
+++ b/frame/3/gemm/bli_gemm_front_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -132,7 +132,14 @@ void bli_gemm_front
 	// Attach alpha to B, and in the process typecast alpha to the target
 	// datatype of the matrix (which in this case is equal to the computation
 	// datatype).
-	bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );
+
+	// In case of dzgemm, if the microkernel prefers column output,
+	// we will induce a transposition and perform C += A*B
+	// where A (formerly B) is complex. Hence attach alpha to A.
+	if ( bli_gemm_md_is_ccr( &a_local, &b_local, &c_local ))
+		bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &a_local );
+	else
+		bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local );
 
 	// Attach beta to C, and in the process typecast beta to the target
 	// datatype of the matrix (which in this case is equal to the storage
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c
index dc1c3d14dc..f91e22d435 100644
--- a/frame/3/gemm/bli_gemm_ker_var2.c
+++ b/frame/3/gemm/bli_gemm_ker_var2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -408,18 +408,18 @@ void PASTEMAC(ch,varname) \
 	} \
 \
 /* Send progress update if the user has enabled it */ \
-if(AOCL_progress_ptr) { \
+AOCL_progress_callback AOCL_progress_local_ptr = AOCL_progress_ptr; \
+if (AOCL_progress_local_ptr) { \
 	/* Running total for current thread */ \
 	tls_aoclprogress_counter += m * n * k; \
 	/* Send the update only if enough number of elements are processes */ \
 	if ((tls_aoclprogress_counter - tls_aoclprogress_last_update)  >= AOCL_PROGRESS_FREQUENCY) \
 	{ \
 		tls_aoclprogress_last_update = tls_aoclprogress_counter; \
-		AOCL_PROGRESS_DT(*MKSTR(ch), \
-						"gemm", \
-						tls_aoclprogress_counter, \
-						AOCL_gettid(), \
-						bli_rntm_num_threads(rntm)); \
+		(*AOCL_progress_local_ptr)(MKSTR(ch) "gemm", sizeof(MKSTR(ch) "gemm"), \
+								   tls_aoclprogress_counter, \
+								   AOCL_gettid(), \
+								   bli_rntm_num_threads(rntm)); \
 	}\
 } \
  \
diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c
index 68298c71ca..66f8414a27 100644
--- a/frame/3/gemm/bli_gemm_md.c
+++ b/frame/3/gemm/bli_gemm_md.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -156,20 +156,92 @@ mddm_t bli_gemm_md_ccr
        cntx_t** cntx
      )
 {
+	mddm_t doms;
+
+	// We assume that the requested computation domain is complex.
+	//dom_t dom_comp_in = bli_obj_comp_domain( c );
+	//dom_t dom_comp_in = BLIS_COMPLEX;
+
+	// For ccr, the computation (ukernel) will be real, but the execution
+	// will appear complex to other parts of the implementation.
+	doms.comp = BLIS_REAL;
+	doms.exec = BLIS_COMPLEX;
+
+	// Here we construct the computation datatype, which for the ccr case
+	// is equal to the real projection of the execution datatype, and use
+	// that computation datatype to query the corresponding ukernel output
+	// preference.
+	const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
+	const bool  row_pref
+	      = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx );
+
+	// We can only perform this case of mixed-domain gemm, C += A*B where
+	// B is real, if the microkernel prefers column output. If it prefers
+	// row output, we must induce a transposition and perform C += A*B
+	// where A (formerly B) is real.
+	if ( row_pref )
+	{
+		bli_obj_swap( a, b );
+
+		bli_obj_induce_trans( a );
+		bli_obj_induce_trans( b );
+		bli_obj_induce_trans( c );
+
+		return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
+	}
+
 	// Create a local copy of the context and then prepare to use this
 	// context instead of the one passed in.
 	*cntx_local = **cntx;
 	*cntx = cntx_local;
 
-	//we must induce a transposition and perform C += A*B
-	// where A (formerly B) is real.
-	bli_obj_swap( a, b );
+	// Copy the real domain blocksizes into the slots of their complex
+	// counterparts.
+	blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx );
+	blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx );
+	blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx );
+	blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx );
+	blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx );
 
-	bli_obj_induce_trans( a );
-	bli_obj_induce_trans( b );
-	bli_obj_induce_trans( c );
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mr, BLIS_SCOMPLEX, blksz_mr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nr, BLIS_SCOMPLEX, blksz_nr );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr );
 
-	return bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx );
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_mc, BLIS_SCOMPLEX, blksz_mc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_nc, BLIS_SCOMPLEX, blksz_nc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc );
+
+	bli_blksz_copy_dt( BLIS_FLOAT,  blksz_kc, BLIS_SCOMPLEX, blksz_kc );
+	bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc );
+
+	// Halve both the real and complex MR's (which are both real MR's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr );
+
+	// Halve both the real and complex MC's (which are both real MC's).
+	bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT,    blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE,   blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc );
+	bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc );
+
+	// Use the default pack schemas in the context.
+
+	// static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx )
+	func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx );
+
+	// Rather than check which complex datatype dt_comp refers to, we set
+	// the mixed-domain virtual microkernel for both types.
+	bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs );
+	bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs );
+
+	// Return the computation and execution domains.
+	return doms;
 }
 
 // -----------------------------------------------------------------------------
@@ -196,6 +268,29 @@ mddm_t bli_gemm_md_crc
 	doms.comp = BLIS_REAL;
 	doms.exec = BLIS_COMPLEX;
 
+	// Here we construct the computation datatype, which for the crc case
+	// is equal to the real projection of the execution datatype, and use
+	// that computation datatype to query the corresponding ukernel output
+	// preference.
+	const num_t dt = BLIS_REAL | bli_obj_comp_prec( c );
+	const bool  col_pref
+	      = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx );
+
+	// We can only perform this case of mixed-domain gemm, C += A*B where
+	// A is real, if the microkernel prefers row output. If it prefers
+	// column output, we must induce a transposition and perform C += A*B
+	// where B (formerly A) is real.
+	if ( col_pref )
+	{
+		bli_obj_swap( a, b );
+
+		bli_obj_induce_trans( a );
+		bli_obj_induce_trans( b );
+		bli_obj_induce_trans( c );
+
+		return bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx );
+	}
+
 	// Create a local copy of the context and then prepare to use this
 	// context instead of the one passed in.
 	*cntx_local = **cntx;
@@ -708,7 +803,7 @@ void bli_gemm_md_zgemm
 	}
 
 	{
-		// A sort of hack for communicating the desired pach schemas for A and B
+		// A sort of hack for communicating the desired pack schemas for A and B
 		// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 		// bli_l3_cntl_create_if()). This allows us to access the schemas from
 		// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/gemm/bli_gemm_packab.c b/frame/3/gemm/bli_gemm_packab.c
index 6828725546..098206df7d 100644
--- a/frame/3/gemm/bli_gemm_packab.c
+++ b/frame/3/gemm/bli_gemm_packab.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -49,6 +49,11 @@ void bli_gemm_packa
 	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_5);
 	obj_t a_pack;
 
+	// By setting the family id to BLIS_GEMM_MD, we tell the packing kernels
+	// to scale by alpha while packing.
+	if(bli_obj_dt(c) != bli_obj_dt(b))
+		bli_cntl_set_family(BLIS_GEMM_MD, cntl);
+
 	// Pack matrix A according to the control tree node.
 	bli_l3_packm
 	(
diff --git a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c
index a026ed8d39..982dc6e035 100644
--- a/frame/3/gemmt/bli_gemmt_sup_var1n2m.c
+++ b/frame/3/gemmt/bli_gemmt_sup_var1n2m.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020 - 23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -75,6 +75,9 @@ typedef void (*gemmt_ker_ft)
        cntx_t*    restrict cntx
      );
 
+// These kernels are compiled as part of the haswell config;
+// use them only when BLIS_KERNELS_HASWELL is defined.
+#ifdef BLIS_KERNELS_HASWELL
 //Look-up table for Gemmt Upper Variant Kernels
 gemmt_ker_ft ker_fpus[14] =
 	{
@@ -94,23 +97,27 @@ gemmt_ker_ft ker_fpus[14] =
 		bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U};
 
 //Look-up table for Gemmt Lower Variant Kernels
-gemmt_ker_ft ker_fpls[14] =
-{
-	bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L,
-	bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L,
-	bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L,
-	bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L,
-	bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L,
-	bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L,
-	bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L,
-	bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
-};
+gemmt_ker_ft ker_fpls[14] = 
+	{
+		bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L,
+		bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L,
+		bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L,
+		bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L,
+		bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L,
+		bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L,
+		bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L,
+		bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
+	};
+#else
+gemmt_ker_ft ker_fpls[1];
+gemmt_ker_ft ker_fpus[1];
+#endif
 
 //
 // -- var1n --------------------------------------------------------------------
@@ -219,6 +226,12 @@ void bli_gemmtsup_ref_var1n
 		cs_b  = bli_obj_row_stride( b );
 	}
 
+
+	// Optimize some storage/packing cases by transforming them into others.
+	// These optimizations are expressed by changing trans and/or eff_id.
+	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
+
+
 	bool uploc;
 	if( bli_obj_is_lower( c ) )
 	{
@@ -242,12 +255,6 @@ void bli_gemmtsup_ref_var1n
 	// function pointer.
 	FUNCPTR_T f = ftypes_var1n[dt][uploc];
 
-#if 1
-	// Optimize some storage/packing cases by transforming them into others.
-	// These optimizations are expressed by changing trans and/or eff_id.
-	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
-#endif
-
 	if ( bli_is_notrans( trans ) )
 	{
 		// Invoke the function.
@@ -1353,8 +1360,13 @@ void bli_gemmtsup_ref_var2m
 		cs_b  = bli_obj_row_stride( b );
 	}
 
-	bool uploc;
 
+	// Optimize some storage/packing cases by transforming them into others.
+	// These optimizations are expressed by changing trans and/or eff_id.
+	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
+
+
+	bool uploc;
 	if ( bli_is_notrans ( trans ) )
 		uploc = bli_obj_is_lower( c ) ? 0 : 1;
 	else
@@ -1373,11 +1385,7 @@ void bli_gemmtsup_ref_var2m
 	// function pointer.
 	FUNCPTR_T f = ftypes_var2m[dt][uploc];
 
-#if 0
-	// Optimize some storage/packing cases by transforming them into others.
-	// These optimizations are expressed by changing trans and/or eff_id.
-	bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx );
-#endif
+
 
 	if ( bli_is_notrans( trans ) )
 	{
@@ -1922,7 +1930,9 @@ void PASTEMACT(ch,opname,uplo,varname) \
 \
 						/* Check if m, n indices are multiple of MR and NR respectively
 						   and current block is a complete 6x8 block */ \
-						bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0) && (mr_cur == MR) && (nr_cur == NR); \
+						bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0)\
+						&& (MR == 6) && (NR == 8) \
+						&& (bli_cpuid_is_avx2fma3_supported() == TRUE) && (mr_cur == MR) && (nr_cur == NR); \
 \
 						/* m_idx and n_idx would be equal only if the current block is
 						   a diagonal block */\
@@ -2568,7 +2578,6 @@ void PASTEMACT(ch,opname,uplo,varname) \
 					for( dim_t i = m_rect;( i < mc_cur) && (m_off_cblock < n_off_cblock + nr_cur); i += MR ) \
 					{ \
 						const dim_t mr_cur = (i+MR-1) < mc_cur ? MR : mc_cur - i; \
-\
 						/* Prerequisites : MR = 6, NR = 8.
 						   An optimization: allow the last jr iteration to contain up to NRE
 						   In DGEMMT API implementation, kernel operates on 6x8 block. MR and
@@ -2600,7 +2609,9 @@ void PASTEMACT(ch,opname,uplo,varname) \
 \
 						/* Check if m, n indices are multiple of MR and NR respectively
 						   and current block is a complete 6x8 block */ \
-						bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0) && (mr_cur==MR) && (nr_cur==NR); \
+						bool idx_supported = ((m_off_24 % MR) == 0) && ((n_off_24 % NR) == 0)\
+						&& (MR == 6) && (NR == 8) \
+						&& (bli_cpuid_is_avx2fma3_supported() == TRUE) && (mr_cur==MR) && (nr_cur==NR); \
 \
 						/* m_idx and n_idx would be equal only if the current block is
 						   a diagonal block */\
diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c
index d1746eb4eb..a9878e0f9e 100644
--- a/frame/3/hemm/bli_hemm_front.c
+++ b/frame/3/hemm/bli_hemm_front.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -150,7 +151,7 @@ void bli_hemm_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/her2k/bli_her2k_front.c b/frame/3/her2k/bli_her2k_front.c
index 096ea463bc..39d4dfc0d6 100644
--- a/frame/3/her2k/bli_her2k_front.c
+++ b/frame/3/her2k/bli_her2k_front.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -119,7 +120,7 @@ void bli_her2k_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/herk/bli_herk_front.c b/frame/3/herk/bli_herk_front.c
index a88d23e90a..9ba19b3a36 100644
--- a/frame/3/herk/bli_herk_front.c
+++ b/frame/3/herk/bli_herk_front.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -99,7 +100,7 @@ void bli_herk_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c
index 61238fb158..a395a1c1c6 100644
--- a/frame/3/symm/bli_symm_front.c
+++ b/frame/3/symm/bli_symm_front.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -149,7 +150,7 @@ void bli_symm_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/syr2k/bli_syr2k_front.c b/frame/3/syr2k/bli_syr2k_front.c
index c1532b92d7..dfd1f575a5 100644
--- a/frame/3/syr2k/bli_syr2k_front.c
+++ b/frame/3/syr2k/bli_syr2k_front.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -100,7 +101,7 @@ void bli_syr2k_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/syrk/bli_syrk_front.c b/frame/3/syrk/bli_syrk_front.c
index 4b7c8cd75a..b0e0338a9d 100644
--- a/frame/3/syrk/bli_syrk_front.c
+++ b/frame/3/syrk/bli_syrk_front.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -94,7 +94,7 @@ void bli_syrk_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c
index 63fc8053f9..fd6f070fc4 100644
--- a/frame/3/trmm/bli_trmm_front.c
+++ b/frame/3/trmm/bli_trmm_front.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -168,7 +168,7 @@ void bli_trmm_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/trmm/bli_trmm_front_amd.c b/frame/3/trmm/bli_trmm_front_amd.c
index 2301b323a7..d564d5728f 100644
--- a/frame/3/trmm/bli_trmm_front_amd.c
+++ b/frame/3/trmm/bli_trmm_front_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -177,7 +177,7 @@ void bli_trmm_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c
index ba7d3a91ff..9042d1478d 100644
--- a/frame/3/trmm3/bli_trmm3_front.c
+++ b/frame/3/trmm3/bli_trmm3_front.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -160,7 +161,7 @@ void bli_trmm3_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c
index 35cd2d4b85..07555301bb 100644
--- a/frame/3/trsm/bli_trsm_front.c
+++ b/frame/3/trsm/bli_trsm_front.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -145,38 +145,47 @@ void bli_trsm_front
 	  rntm
 	);
 
-	// A sort of hack for communicating the desired pach schemas for A and B
+	// TRSM and GEMM may use different blocksizes. If the TRSM blocksizes
+	// were written into the global cntx object, a GEMM call running in
+	// parallel with TRSM would read blocksizes that are not correct
+	// for GEMM.
+	// To avoid this, create a local copy of cntx, so that overriding
+	// the blocksizes below affects only this copy and does not impact
+	// the global cntx object.
+	cntx_t cntx_trsm = *cntx;
+
+	// A sort of hack for communicating the desired pack schemas for A and B
 	// to bli_trsm_cntl_create() (via bli_l3_thread_decorator() and
 	// bli_l3_cntl_create_if()). This allows us to access the schemas from
 	// the control tree, which hopefully reduces some confusion, particularly
 	// in bli_packm_init().
-	if ( bli_cntx_method( cntx ) == BLIS_NAT )
+	if ( bli_cntx_method( &cntx_trsm ) == BLIS_NAT )
 	{
 #if defined(BLIS_FAMILY_AMDZEN) ||  defined(BLIS_FAMILY_ZEN4) 
 		/* Zen4 TRSM Fixme:
 		 *
 		 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels 
-		 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+		 * for TRSM (Till we implement TRSM AVX-512 kernels)
 		 * 
 		 * The AVX2 kernels use different block sizes then AVX512 kernels
 		 * Here we override the default block sizes in the context with AVX2 
-		 * specific block size used in  GEMMTRSM kernerls.
+		 * specific block size used in GEMMTRSM kernels.
 		 * 
 		 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 		 */
 		if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4)  &&
-			 (bli_obj_dt(a) == BLIS_FLOAT) )
+			 ((bli_obj_dt(a) == BLIS_FLOAT) || (bli_obj_dt(a) == BLIS_DOUBLE)) )
 		{
-			bli_zen4_override_trsm_blkszs(cntx);
+			bli_zen4_override_trsm_blkszs(&cntx_trsm);
 		}
 #endif
 		bli_obj_set_pack_schema( BLIS_PACKED_ROW_PANELS, &a_local );
 		bli_obj_set_pack_schema( BLIS_PACKED_COL_PANELS, &b_local );
 	}
-	else // if ( bli_cntx_method( cntx ) != BLIS_NAT )
+	else // if ( bli_cntx_method( cntx_trsm ) != BLIS_NAT )
 	{
-		pack_t schema_a = bli_cntx_schema_a_block( cntx );
-		pack_t schema_b = bli_cntx_schema_b_panel( cntx );
+		pack_t schema_a = bli_cntx_schema_a_block( &cntx_trsm );
+		pack_t schema_b = bli_cntx_schema_b_panel( &cntx_trsm );
 
 		bli_obj_set_pack_schema( schema_a, &a_local );
 		bli_obj_set_pack_schema( schema_b, &b_local );
@@ -192,24 +201,11 @@ void bli_trsm_front
 	  &b_local,
 	  alpha,
 	  &c_local,
-	  cntx,
+	  &cntx_trsm,
 	  rntm,
 	  cntl
 	);
-	
-#if defined(BLIS_FAMILY_AMDZEN) ||  defined(BLIS_FAMILY_ZEN4) 
-		/* Zen4 TRSM Fixme:
-		 *
-		 * We have overrding the block sizes at the start of this function
-		 * Since the context is created only once we need to ensure that the 
-		 * default block sizes are restored for the subsequent operations.
-		 */
-		if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4)  &&
-			 (bli_obj_dt(a) == BLIS_FLOAT) )
-		{
-			bli_zen4_restore_default_blkszs(cntx);
-		}
-#endif
+
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
 }
 
diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h
index 379935536a..1f0c2ca205 100644
--- a/frame/3/trsm/bli_trsm_front.h
+++ b/frame/3/trsm/bli_trsm_front.h
@@ -52,7 +52,8 @@ err_t bli_trsm_small
        obj_t*  a,
        obj_t*  b,
        cntx_t* cntx,
-       cntl_t* cntl
+       cntl_t* cntl,
+       bool is_parallel
      );
 #endif
 
diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c
index a15f39fc3c..75d5241f55 100644
--- a/frame/3/trsm/bli_trsm_ll_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
@@ -188,10 +188,11 @@ void PASTEMAC(ch,varname) \
 	 *
 	 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 	 */ \
-	if ((bli_arch_query_id() == BLIS_ARCH_ZEN4)  && \
-		(dt == BLIS_FLOAT)) \
+	bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \
 	{ \
-		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
+		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
+		col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
 	} \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
@@ -201,7 +202,6 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c
index 48e4588f52..27cea4dc3c 100644
--- a/frame/3/trsm/bli_trsm_lu_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
@@ -188,10 +188,11 @@ void PASTEMAC(ch,varname) \
 	 *
 	 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 	 */ \
-	if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
-		(dt == BLIS_FLOAT)) \
+	bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \
 	{ \
-		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
+		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
+		col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
 	} \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
@@ -201,7 +202,6 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c
index 2705a747ac..7c57438a2d 100644
--- a/frame/3/trsm/bli_trsm_rl_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
@@ -194,10 +194,11 @@ void PASTEMAC(ch,varname) \
 	 *
 	 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 	 */ \
-	if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
-		(dt == BLIS_FLOAT)) \
+	bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \
 	{ \
-		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
+		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
+		col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
 	} \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
@@ -207,7 +208,6 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c
index dc37614eb6..766a6b95c1 100644
--- a/frame/3/trsm/bli_trsm_ru_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \
 	/* Zen4 TRSM Fixme:
 	 *
 	 * On Zen4 we want to use AVX-512 kernels for GEMM and AVX2 kernels
-	 * for TRSM (Till we implemente TRSM AVX-512 kernels)
+	 * for TRSM (Till we implement TRSM AVX-512 kernels)
 	 *
 	 * The AVX2 kernels for TRSM are enabled in the context, but they
 	 * are compatible with only AVX2 version of GEMM kernels.
@@ -193,10 +193,11 @@ void PASTEMAC(ch,varname) \
 	 *
 	 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 	 */ \
-	if ((bli_arch_query_id() == BLIS_ARCH_ZEN4) && \
-		(dt == BLIS_FLOAT)) \
+	bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
+	if (bli_arch_query_id() == BLIS_ARCH_ZEN4 && ((dt == BLIS_FLOAT) || (dt == BLIS_DOUBLE)) ) \
 	{ \
-		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_AVX2_UKR, cntx ); \
+		gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
+		col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_FOR_TRSM_UKR, cntx ); \
 	} \
 \
 	/* Temporary C buffer for edge cases. Note that the strides of this
@@ -206,7 +207,6 @@ void PASTEMAC(ch,varname) \
 	ctype           ct[ BLIS_STACK_BUF_MAX_SIZE \
 	                    / sizeof( ctype ) ] \
 	                    __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \
-	const bool      col_pref    = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \
 	const inc_t     rs_ct       = ( col_pref ? 1 : NR ); \
 	const inc_t     cs_ct       = ( col_pref ? MR : 1 ); \
 \
diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c
index 8d2f8689a9..fe63fb9f91 100644
--- a/frame/3/trsm/bli_trsm_xx_ker_var2.c
+++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -56,6 +56,7 @@ void bli_trsm_xx_ker_var2
 	dim_t        side;
 	dim_t        uplo;
 	trsm_var_oft f;
+	AOCL_progress_callback AOCL_progress_local_ptr = AOCL_progress_ptr;
 
 	// Set two bools: one based on the implied side parameter (the structure
 	// of the root object) and one based on the uplo field of the triangular
@@ -88,10 +89,9 @@ void bli_trsm_xx_ker_var2
 	  thread
 	);
 
-	// Send progress update if enabled
-	if (AOCL_progress_ptr)
+	/* Send progress update if the user has enabled it */
+	if (AOCL_progress_local_ptr)
 	{
-
 		// Get the size of block processed in
 		// this iteration, add it to the accumulated
 		// total and send the update.
@@ -100,13 +100,14 @@ void bli_trsm_xx_ker_var2
 		dim_t k = bli_obj_width(a);
 
 		num_t dt = bli_obj_dt(c);
-		char dt_c;
+		char *dt_api = NULL;
+		dim_t dt_api_len = 5;
 
 		// Running total for current thread.
 		tls_aoclprogress_counter += m * n * k;
 
 		// Send the update only if number of elements processes so far
-		// has exceeded the freqency of reporting. 
+		// has exceeded the frequency of reporting.
 		if ((tls_aoclprogress_counter - tls_aoclprogress_last_update) >=
 			 AOCL_PROGRESS_FREQUENCY)
 		{
@@ -117,26 +118,26 @@ void bli_trsm_xx_ker_var2
 			switch (dt)
 			{
 			case BLIS_FLOAT:
-				dt_c = 's';
+				dt_api = "strsm";
 				break;
 			case BLIS_DOUBLE:
-				dt_c = 'd';
+				dt_api = "dtrsm";
 				break;
 			case BLIS_SCOMPLEX:
-				dt_c = 'c';
+				dt_api = "ctrsm";
 				break;
 			case BLIS_DCOMPLEX:
-				dt_c = 'z';
+				dt_api = "ztrsm";
 				break;
 			default:
-				dt_c = ' ';
+				dt_api = " trsm";
 			}
 
-			AOCL_PROGRESS_DT(dt_c,
-			                 "trsm",
-			                 tls_aoclprogress_counter,
-			                 AOCL_gettid(),
-			                 bli_rntm_num_threads(rntm));
+			(*AOCL_progress_local_ptr)(dt_api,
+			                           dt_api_len,
+			                           tls_aoclprogress_counter,
+			                           AOCL_gettid(),
+			                           bli_rntm_num_threads(rntm));
 		}
 	}
 
diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c
index fecc353161..6ae245afbc 100644
--- a/frame/base/bli_arch.c
+++ b/frame/base/bli_arch.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -47,28 +47,65 @@
 
 // -----------------------------------------------------------------------------
 
-// The arch_t id for the currently running hardware. We initialize to -1,
-// which will be overwritten upon calling bli_arch_set_id().
-static arch_t id = -1;
+// The arch and model ids for the currently running hardware. We initialize
+// to -1, which will be overwritten upon calling bli_arch_set_id().
+static arch_t actual_arch_id = -1;
+static model_t actual_model_id = -1;
+
+// The arch and model ids for the currently running hardware, or the values
+// the user specifies to use. We initialize to -1, which will be overwritten
+// upon calling bli_arch_set_id().
+static arch_t arch_id = -1;
+static model_t model_id = -1;
+
+// Carries the value requested via the '__blis_arch_type_name' environment
+// variable (-1 if unset) from bli_arch_set_id() to bli_arch_check_id().
+static dim_t __attribute__ ((unused)) req_id = -1;
 
 arch_t bli_arch_query_id( void )
 {
 	bli_arch_set_id_once();
+	bli_arch_check_id_once();
 
 	// Simply return the id that was previously cached.
-	return id;
+	return arch_id;
+}
+
+model_t bli_model_query_id( void )
+{
+	bli_arch_set_id_once();
+	bli_arch_check_id_once();
+
+	// Simply return the model_id that was previously cached.
+	return model_id;
+}
+
+model_t bli_init_model_query_id( void )
+{
+	bli_arch_set_id_once();
+
+	// Simply return the model_id that was previously cached.
+	return model_id;
 }
 
 // -----------------------------------------------------------------------------
 
 // A pthread structure used in pthread_once(). pthread_once() is guaranteed to
 // execute exactly once among all threads that pass in this control object.
-static bli_pthread_once_t once_id = BLIS_PTHREAD_ONCE_INIT;
+static bli_pthread_once_t once_id_init = BLIS_PTHREAD_ONCE_INIT;
+static bli_pthread_once_t once_id_check = BLIS_PTHREAD_ONCE_INIT;
 
 void bli_arch_set_id_once( void )
 {
 #ifndef BLIS_CONFIGURETIME_CPUID
-	bli_pthread_once( &once_id, bli_arch_set_id );
+	bli_pthread_once( &once_id_init, bli_arch_set_id );
+#endif
+}
+
+void bli_arch_check_id_once( void )
+{
+#ifndef BLIS_CONFIGURETIME_CPUID
+	bli_pthread_once( &once_id_check, bli_arch_check_id );
 #endif
 }
 
@@ -81,6 +118,10 @@ void bli_arch_set_id( void )
 	bool do_logging = bli_env_get_var( "BLIS_ARCH_DEBUG", 0 );
 	bli_arch_set_logging( do_logging );
 
+	// Get actual hardware arch and model ids.
+	actual_arch_id = bli_cpuid_query_id();
+	actual_model_id = bli_cpuid_query_model_id( actual_arch_id );
+
 	// DISABLE_BLIS_ARCH_TYPE and BLIS_CONFIGURETIME_CPUID seem similar but
 	// have different use cases:
 	// * BLIS_CONFIGURETIME_CPUID is used by the "configure auto" option to
@@ -93,7 +134,7 @@ void bli_arch_set_id( void )
 	// defined to be) to see if the user requested that we use a specific
 	// subconfiguration. "__blis_arch_type_name" will be defined by the
 	// configure command in bli_config.h, with the default name of BLIS_ARCH_TYPE
-	dim_t req_id = bli_env_get_var_arch_type( __blis_arch_type_name, -1 );
+	req_id = bli_env_get_var_arch_type( __blis_arch_type_name, -1 );
 
 #ifndef BLIS_CONFIGURETIME_CPUID
 	if ( req_id != -1 )
@@ -101,31 +142,18 @@ void bli_arch_set_id( void )
 		// BLIS_ARCH_TYPE was set. Cautiously check whether its value is usable.
 
 		// If req_id was set to an invalid arch_t value (ie: outside the range
-		// [0,BLIS_NUM_ARCHS-1]), output an error message and abort.
+		// [1,BLIS_NUM_ARCHS-1]), output an error message and abort.
 		if ( bli_error_checking_is_enabled() )
 		{
 			err_t e_val = bli_check_valid_arch_id( req_id );
 			bli_check_error_code( e_val );
 		}
 
-		// At this point, we know that req_id is in the valid range, but we
-		// don't yet know if it refers to a context that was actually
-		// initialized. Query the address of an internal context data structure
-		// corresponding to req_id. This pointer will be NULL if the associated
-		// subconfig is not available.
-		cntx_t** req_cntx = bli_gks_lookup_id( req_id );
-
-		// This function checks the context pointer and aborts with a useful
-		// error message if the pointer is found to be NULL.
-		if ( bli_error_checking_is_enabled() )
-		{
-			err_t e_val = bli_check_initialized_gks_cntx( req_cntx );
-			bli_check_error_code( e_val );
-		}
+		// The check that the context was actually initialized is deferred
+		// to bli_arch_check_id(), which is called later.
 
-		// Finally, we can be confident that req_id (1) is in range and (2)
-		// refers to a context that has been initialized.
-		id = req_id;
+		// For now, we can only be confident that req_id is in range.
+		arch_id = req_id;
 	}
 	else
 #endif
@@ -138,101 +166,210 @@ void bli_arch_set_id( void )
 		// Architecture families.
 		#if defined BLIS_FAMILY_INTEL64      || \
 		    defined BLIS_FAMILY_AMDZEN       || \
-			defined BLIS_FAMILY_AMD64_LEGACY || \
+		    defined BLIS_FAMILY_AMD64_LEGACY || \
 		    defined BLIS_FAMILY_X86_64       || \
 		    defined BLIS_FAMILY_ARM64        || \
 		    defined BLIS_FAMILY_ARM32
-		id = bli_cpuid_query_id();
+		arch_id = actual_arch_id;
 		#endif
 
 		// Intel microarchitectures.
 		#ifdef BLIS_FAMILY_SKX
-		id = BLIS_ARCH_SKX;
+		arch_id = BLIS_ARCH_SKX;
 		#endif
 		#ifdef BLIS_FAMILY_KNL
-		id = BLIS_ARCH_KNL;
+		arch_id = BLIS_ARCH_KNL;
 		#endif
 		#ifdef BLIS_FAMILY_KNC
-		id = BLIS_ARCH_KNC;
+		arch_id = BLIS_ARCH_KNC;
 		#endif
 		#ifdef BLIS_FAMILY_HASWELL
-		id = BLIS_ARCH_HASWELL;
+		arch_id = BLIS_ARCH_HASWELL;
 		#endif
 		#ifdef BLIS_FAMILY_SANDYBRIDGE
-		id = BLIS_ARCH_SANDYBRIDGE;
+		arch_id = BLIS_ARCH_SANDYBRIDGE;
 		#endif
 		#ifdef BLIS_FAMILY_PENRYN
-		id = BLIS_ARCH_PENRYN;
+		arch_id = BLIS_ARCH_PENRYN;
 		#endif
 
 		// AMD microarchitectures.
 		#ifdef BLIS_FAMILY_ZEN4
-		id = BLIS_ARCH_ZEN4;
+		arch_id = BLIS_ARCH_ZEN4;
 		#endif
 		#ifdef BLIS_FAMILY_ZEN3
-		id = BLIS_ARCH_ZEN3;
+		arch_id = BLIS_ARCH_ZEN3;
 		#endif
 		#ifdef BLIS_FAMILY_ZEN2
-		id = BLIS_ARCH_ZEN2;
+		arch_id = BLIS_ARCH_ZEN2;
 		#endif
 		#ifdef BLIS_FAMILY_ZEN
-		id = BLIS_ARCH_ZEN;
+		arch_id = BLIS_ARCH_ZEN;
 		#endif
 		#ifdef BLIS_FAMILY_EXCAVATOR
-		id = BLIS_ARCH_EXCAVATOR;
+		arch_id = BLIS_ARCH_EXCAVATOR;
 		#endif
 		#ifdef BLIS_FAMILY_STEAMROLLER
-		id = BLIS_ARCH_STEAMROLLER;
+		arch_id = BLIS_ARCH_STEAMROLLER;
 		#endif
 		#ifdef BLIS_FAMILY_PILEDRIVER
-		id = BLIS_ARCH_PILEDRIVER;
+		arch_id = BLIS_ARCH_PILEDRIVER;
 		#endif
 		#ifdef BLIS_FAMILY_BULLDOZER
-		id = BLIS_ARCH_BULLDOZER;
+		arch_id = BLIS_ARCH_BULLDOZER;
 		#endif
 
 		// ARM microarchitectures.
 		#ifdef BLIS_FAMILY_THUNDERX2
-		id = BLIS_ARCH_THUNDERX2;
+		arch_id = BLIS_ARCH_THUNDERX2;
 		#endif
 		#ifdef BLIS_FAMILY_CORTEXA57
-		id = BLIS_ARCH_CORTEXA57;
+		arch_id = BLIS_ARCH_CORTEXA57;
 		#endif
 		#ifdef BLIS_FAMILY_CORTEXA53
-		id = BLIS_ARCH_CORTEXA53;
+		arch_id = BLIS_ARCH_CORTEXA53;
 		#endif
 		#ifdef BLIS_FAMILY_CORTEXA15
-		id = BLIS_ARCH_CORTEXA15;
+		arch_id = BLIS_ARCH_CORTEXA15;
 		#endif
 		#ifdef BLIS_FAMILY_CORTEXA9
-		id = BLIS_ARCH_CORTEXA9;
+		arch_id = BLIS_ARCH_CORTEXA9;
 		#endif
 
 		// IBM microarchitectures.
 		#ifdef BLIS_FAMILY_POWER10
-		id = BLIS_ARCH_POWER10;
+		arch_id = BLIS_ARCH_POWER10;
 		#endif
 		#ifdef BLIS_FAMILY_POWER9
-		id = BLIS_ARCH_POWER9;
+		arch_id = BLIS_ARCH_POWER9;
 		#endif
 		#ifdef BLIS_FAMILY_POWER7
-		id = BLIS_ARCH_POWER7;
+		arch_id = BLIS_ARCH_POWER7;
 		#endif
 		#ifdef BLIS_FAMILY_BGQ
-		id = BLIS_ARCH_BGQ;
+		arch_id = BLIS_ARCH_BGQ;
 		#endif
 
 		// Generic microarchitecture.
 		#ifdef BLIS_FAMILY_GENERIC
-		id = BLIS_ARCH_GENERIC;
+		arch_id = BLIS_ARCH_GENERIC;
 		#endif
 	}
 
+
+#ifndef DISABLE_BLIS_MODEL_TYPE
+	// Check the environment variable (that "__blis_model_type_name" is
+	// defined to be) to see if the user requested that we use a specific
+	// model. "__blis_model_type_name" will be defined by the
+	// configure command in bli_config.h, with the default name of BLIS_MODEL_TYPE
+	dim_t req_model = bli_env_get_var_model_type( __blis_model_type_name, -1 );
+
+#ifndef BLIS_CONFIGURETIME_CPUID
+	if ( req_model != -1 )
+	{
+		// BLIS_MODEL_TYPE was set. Cautiously check whether its value is usable.
+		// Assume here that arch_id is valid.
+
+		// If req_model was set to an invalid model_t value (ie: both outside
+		// the range appropriate for the given architecture and not default),
+		// set to default value and continue.
+		if ( bli_error_checking_is_enabled() )
+		{
+			err_t e_val = bli_check_valid_model_id( arch_id, req_model );
+			if (e_val != BLIS_SUCCESS)
+			{
+				req_model = BLIS_MODEL_DEFAULT;
+				e_val = BLIS_SUCCESS;
+			}
+			bli_check_error_code( e_val );
+		}
+
+		// We can now be confident that req_model is in range for the
+		// selected architecture, or it has been reset to be default.
+		model_id = req_model;
+	}
+	else
+#endif
+
+#endif
+	{
+		// BLIS_MODEL_TYPE was unset. Proceed with normal subconfiguration
+		// selection behavior, based on value of architecture id selected
+		// above. Unlike for arch_id, we cannot simply use actual_model_id
+		// here, as we need to choose model_id based on the arch_id we are
+		// using, which could be different to actual_arch_id.
+
+		model_id = bli_cpuid_query_model_id( arch_id );
+	}
+
+	//printf( "blis_arch_query_id(): arch_id, model_id = %u, %u\n", arch_id, model_id );
+	//exit(1);
+}
+
+void bli_arch_check_id( void )
+{
+	bli_arch_set_id_once();
+
+	// Check arch value against configured options. Only needed
+	// if user has set it. This function will also do the
+	// logging of chosen arch and model (if desired).
+
+	// DISABLE_BLIS_ARCH_TYPE and BLIS_CONFIGURETIME_CPUID seem similar but
+	// have different use cases:
+	// * BLIS_CONFIGURETIME_CPUID is used by the "configure auto" option to
+	//   select a single code path, and affects other parts of the code.
+	// * DISABLE_BLIS_ARCH_TYPE disables user selection of code path here in
+	//   builds with multiple code paths.
+
+#ifndef DISABLE_BLIS_ARCH_TYPE
+
+#ifndef BLIS_CONFIGURETIME_CPUID
+	if ( req_id != -1 )
+	{
+		// BLIS_ARCH_TYPE was set. Cautiously check whether its value is usable.
+
+		// In BLAS1 and BLAS2 routines, bli_init_auto() may not have been
+		// called, so ensure cntx has been initialized here.
+		bli_gks_init_once();
+
+		// At this point, we know that req_id is in the valid range, but we
+		// don't yet know if it refers to a context that was actually
+		// initialized. Query the address of an internal context data structure
+		// corresponding to req_id. This pointer will be NULL if the associated
+		// subconfig is not available.
+		cntx_t** req_cntx = bli_gks_lookup_id( req_id );
+
+		// This function checks the context pointer and aborts with a useful
+		// error message if the pointer is found to be NULL.
+		if ( bli_error_checking_is_enabled() )
+		{
+			err_t e_val = bli_check_initialized_gks_cntx( req_cntx );
+			bli_check_error_code( e_val );
+		}
+
+		// Finally, we can be confident that req_id (1) is in range and (2)
+		// refers to a context that has been initialized.
+		arch_id = req_id;
+	}
+#endif
+
+#endif
+
 	if ( bli_arch_get_logging() )
-		fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n",
-				 bli_arch_string( id ) );
+        {
+		if ( model_id == BLIS_MODEL_DEFAULT )
+		{
+			fprintf( stderr, "libblis: selecting sub-configuration '%s'.\n",
+				 bli_arch_string( arch_id ) );
+		}
+		else
+		{
+			fprintf( stderr, "libblis: selecting sub-configuration '%s', model '%s'.\n",
+				 bli_arch_string( arch_id ), bli_model_string( model_id ) );
+		}
+        }
 
-	//printf( "blis_arch_query_id(): id = %u\n", id );
+	//printf( "blis_arch_check_id(): arch_id, model_id = %u, %u\n", arch_id, model_id );
 	//exit(1);
 }
 
@@ -285,6 +422,33 @@ char* bli_arch_string( arch_t id )
 	return config_name[ id ];
 }
 
+// NOTE: This string array must be kept up-to-date with the model_t
+// enumeration that is typedef'ed in bli_type_defs.h. That is, the
+// index order of each string should correspond to the implied/assigned
+// enum value given to the corresponding BLIS_MODEL_ value.
+// This must also be kept up-to-date with the bli_env_get_var_model_type()
+// function in bli_env.c
+static char* model_name[ BLIS_NUM_MODELS ] =
+{
+    "error",
+
+    "default",
+
+    "Genoa",
+    "Bergamo",
+    "Genoa-X",
+
+    "Milan",
+    "Milan-X",
+
+    "default"
+};
+
+char* bli_model_string( model_t id )
+{
+	return model_name[ id ];
+}
+
 // -----------------------------------------------------------------------------
 
 static bool arch_dolog = 0;
diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h
index 0cd55dace3..b36c669fd5 100644
--- a/frame/base/bli_arch.h
+++ b/frame/base/bli_arch.h
@@ -40,11 +40,19 @@ BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void );
 void bli_arch_set_id_once( void );
 void bli_arch_set_id( void );
 
+void bli_arch_check_id_once( void );
+void bli_arch_check_id( void );
+
 BLIS_EXPORT_BLIS char*  bli_arch_string( arch_t id );
 
 void bli_arch_set_logging( bool dolog );
 bool bli_arch_get_logging( void );
 void bli_arch_log( char*, ... );
 
+BLIS_EXPORT_BLIS model_t bli_model_query_id( void );
+BLIS_EXPORT_BLIS model_t bli_init_model_query_id( void );
+
+BLIS_EXPORT_BLIS char*  bli_model_string( model_t id );
+
 #endif
 
diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c
index 78d139e6b2..6f6914b4f8 100644
--- a/frame/base/bli_check.c
+++ b/frame/base/bli_check.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -885,12 +885,44 @@ err_t bli_check_valid_arch_id( arch_t id )
 {
 	err_t e_val = BLIS_SUCCESS;
 
-	if ( ( gint_t )id < 0 || BLIS_NUM_ARCHS <= ( gint_t )id )
+	if ( ( gint_t )id <= 0 || BLIS_NUM_ARCHS <= ( gint_t )id )
 		e_val = BLIS_INVALID_ARCH_ID;
 
 	return e_val;
 }
 
+
+err_t bli_check_valid_model_id( arch_t arch_id, model_t model_id )
+{
+	// Decide whether model_id may be paired with arch_id. The generic
+	// default model is always allowed; otherwise model_id must lie in
+	// the range of model_t values belonging to that microarchitecture
+	// (the ranges are laid out in bli_type_defs.h).
+	const gint_t model = ( gint_t )model_id;
+
+	// model = default is OK for all microarchitectures.
+	if ( model_id == BLIS_MODEL_DEFAULT )
+		return BLIS_SUCCESS;
+
+	// Zen4 range: BLIS_MODEL_GENOA through BLIS_MODEL_GENOA_X.
+	if ( arch_id == BLIS_ARCH_ZEN4 &&
+	     BLIS_MODEL_GENOA <= model && model <= BLIS_MODEL_GENOA_X )
+	{
+		return BLIS_SUCCESS;
+	}
+
+	// Zen3 range: BLIS_MODEL_MILAN through BLIS_MODEL_MILAN_X.
+	if ( arch_id == BLIS_ARCH_ZEN3 &&
+	     BLIS_MODEL_MILAN <= model && model <= BLIS_MODEL_MILAN_X )
+	{
+		return BLIS_SUCCESS;
+	}
+
+	// No range matched: the model is not valid for this architecture
+	// (or the architecture defines no models beyond the default).
+	return BLIS_INVALID_MODEL_ID;
+}
+
 err_t bli_check_initialized_gks_cntx( cntx_t** cntx )
 {
 	err_t e_val = BLIS_SUCCESS;
diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h
index 70ec2fd8f0..2481d12298 100644
--- a/frame/base/bli_check.h
+++ b/frame/base/bli_check.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -110,6 +110,7 @@ err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size );
 err_t bli_check_object_alias_of( obj_t* a, obj_t* b );
 
 err_t bli_check_valid_arch_id( arch_t id );
+err_t bli_check_valid_model_id( arch_t arch_id, model_t model_id );
 err_t bli_check_initialized_gks_cntx( cntx_t** cntx );
 
 err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr );
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h
index 8dab2a5a19..1c47b9e583 100644
--- a/frame/base/bli_cntx.h
+++ b/frame/base/bli_cntx.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020 - 23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -623,6 +623,49 @@ BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of_md( obj_t* obj, num_t d
 }
 
 // -----------------------------------------------------------------------------
+
+
+BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
+{
+	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );
+
+	// A ukernel preference of TRUE means the ukernel prefers row storage.
+	return ( bool )
+	       ( prefs == TRUE );
+}
+
+BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
+{
+	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );
+
+	// A ukernel preference of FALSE means the ukernel prefers column storage.
+	return ( bool )
+	       ( prefs == FALSE );
+}
+
+BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
+{
+	const num_t dt    = bli_obj_dt( obj );
+	const bool  ukr_prefers_rows
+	                  = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
+	const bool  ukr_prefers_cols
+	                  = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
+	bool        r_val = FALSE;
+
+	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
+	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;
+
+	return r_val;
+}
+
+BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
+{
+	return ( bool )
+	       !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
+}
+
+// -----------------------------------------------------------------------------
+
 BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx )
 {
 	num_t dt = bli_obj_dt( c );
@@ -630,7 +673,12 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( obj_t* a, obj_t* b, obj_t* c, cn
 
 	dim_t m, n;
 
-	if(bli_cntx_l3_vir_ukr_dislikes_storage_of(c, BLIS_GEMM_UKR, cntx ) )
+	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+
+	// The SUP kernel storage preference should be used to determine
+	// m and n. This ensures right thresholds are checked even if native
+	// kernel storage preference is different.
+	if ( bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx ) )
 	{
 		m = bli_obj_width(c);
 		n = bli_obj_length(c);
@@ -673,54 +721,6 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( obj_t* a, obj_t* b, obj_t* c, cn
 	return FALSE;
 }
 
-// -----------------------------------------------------------------------------
-
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
-{
-	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );
-
-	// A ukernel preference of TRUE means the ukernel prefers row storage.
-	return ( bool )
-	       ( prefs == TRUE );
-}
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx )
-{
-	const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx );
-
-	// A ukernel preference of FALSE means the ukernel prefers column storage.
-	return ( bool )
-	       ( prefs == FALSE );
-}
-
-#if 0
-// NOTE: These static functions aren't needed yet.
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
-{
-	const num_t dt    = bli_obj_dt( obj );
-	const bool  ukr_prefers_rows
-	                  = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx );
-	const bool  ukr_prefers_cols
-	                  = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx );
-	bool        r_val = FALSE;
-
-	if      ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE;
-	else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE;
-
-	return r_val;
-}
-
-BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx )
-{
-	return ( bool )
-	       !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx );
-}
-#endif
-
-// -----------------------------------------------------------------------------
-
 //
 // -- cntx_t modification (complex) --------------------------------------------
 //
diff --git a/frame/base/bli_const.h b/frame/base/bli_const.h
index 781b56cb8e..c05b8fcdf4 100644
--- a/frame/base/bli_const.h
+++ b/frame/base/bli_const.h
@@ -44,4 +44,4 @@ void bli_const_finalize( void );
 enum mulfactor {  BLIS_MUL_MINUS_ONE = -1,
                   BLIS_MUL_ZERO,
                   BLIS_MUL_ONE,
-                  BLIS_MUL_DEFAULT  };
\ No newline at end of file
+                  BLIS_MUL_DEFAULT  };
diff --git a/frame/base/bli_cpuid.c b/frame/base/bli_cpuid.c
index 2796fadc05..91bd6d8e6d 100644
--- a/frame/base/bli_cpuid.c
+++ b/frame/base/bli_cpuid.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2019, Dave Love, University of Manchester
 
    Redistribution and use in source and binary forms, with or without
@@ -64,6 +64,24 @@
 
 #include "cpuid.h"
 
+// Variables to return if different AVX instructions are supported. These will
+// be updated in bli_cpuid_query_id(), which is called only once from
+// bli_arch_set_id(), using the pthread_once mechanism. This assumes that the
+// cpuid information will not change once the library is loaded.
+// Cached values can be returned by the specific functions for each below.
+
+static bool is_avx2fma3_supported = FALSE;
+static bool is_avx512_supported = FALSE;
+static bool is_avx512vnni_supported = FALSE;
+static bool is_avx512bf16_supported = FALSE;
+
+// Variables to store the cache sizes (in KB). L3 size is shared by all
+// logical processors in the package (i.e. per socket).
+static uint32_t bli_l1d_cache_size = -1;
+static uint32_t bli_l1i_cache_size = -1;
+static uint32_t bli_l2_cache_size = -1;
+static uint32_t bli_l3_cache_size = -1;
+
 arch_t bli_cpuid_query_id( void )
 {
 	uint32_t vendor, family, model, features;
@@ -73,18 +91,48 @@ arch_t bli_cpuid_query_id( void )
 	// vendor.
 	vendor = bli_cpuid_query( &family, &model, &features );
 
+	if ( vendor == VENDOR_INTEL || vendor == VENDOR_AMD )
+	{
+		// Check different levels of AVX instruction support.
+		bli_cpuid_check_avx2fma3_support( family, model, features );
+		bli_cpuid_check_avx512_support( family, model, features );
+		bli_cpuid_check_avx512vnni_support( family, model, features );
+		bli_cpuid_check_avx512bf16_support( family, model, features );
+
+		// Find out cache sizes and set in static variables.
+		// Currently only enabled for VENDOR_AMD.
+		bli_cpuid_check_cache( vendor );
+	}
+
 #if 0
 	printf( "vendor   = %s\n", vendor==1 ? "AMD": "INTEL" );
-	printf("family    = %x\n", family );
-	printf( "model    = %x\n", model );
-
-	printf( "features = %x\n", features );
+	printf( "family   = %x h\n", family );
+	printf( "model    = %x h\n", model );
+
+	printf( "features = %x h\n", features );
+	printf( "AVX2/FMA3            = %d\n", is_avx2fma3_supported );
+	printf( "AVX512 F/DQ/CD/BW/VL = %d\n", is_avx512_supported );
+	printf( "AVX512 VNNI          = %d\n", is_avx512vnni_supported );
+	printf( "AVX512 BF16          = %d\n", is_avx512bf16_supported );
+
+	printf( "Cache Information:\n" );
+	printf( "L1I size = %u KB\n",bli_l1i_cache_size );
+	printf( "L1D size = %u KB\n",bli_l1d_cache_size );
+	printf( "L2  size = %u KB\n",bli_l2_cache_size );
+	printf( "L3  size = %u KB\n",bli_l3_cache_size );
 #endif
 
 	if ( vendor == VENDOR_INTEL )
 	{
 		// Check for each Intel configuration that is enabled, check for that
 		// microarchitecture. We check from most recent to most dated.
+#ifdef BLIS_CONFIG_ZEN4
+		// Even if not optimized for Intel processors, this should
+		// generally perform better than skx codepath.
+		// Currently only enabled for zen4 and amdzen configurations
+		if ( is_avx512_supported )
+			return BLIS_ARCH_ZEN4;
+#endif
 #ifdef BLIS_CONFIG_SKX
 		if ( bli_cpuid_is_skx( family, model, features ) )
 			return BLIS_ARCH_SKX;
@@ -93,6 +141,13 @@ arch_t bli_cpuid_query_id( void )
 		if ( bli_cpuid_is_knl( family, model, features ) )
 			return BLIS_ARCH_KNL;
 #endif
+#ifdef BLIS_CONFIG_ZEN3
+		// Even if not optimized for Intel processors, this should
+		// generally perform better than haswell codepath.
+		// Currently only enabled for zen3 and amdzen configurations
+		if ( is_avx2fma3_supported )
+			return BLIS_ARCH_ZEN3;
+#endif
 #ifdef BLIS_CONFIG_HASWELL
 		if ( bli_cpuid_is_haswell( family, model, features ) )
 			return BLIS_ARCH_HASWELL;
@@ -117,6 +172,9 @@ arch_t bli_cpuid_query_id( void )
 #ifdef BLIS_CONFIG_ZEN4
 		if ( bli_cpuid_is_zen4( family, model, features ) )
 			return BLIS_ARCH_ZEN4;
+		// Fallback test for future AMD processors
+		if ( is_avx512_supported )
+			return BLIS_ARCH_ZEN4;
 #endif
 #ifdef BLIS_CONFIG_ZEN3
 		if ( bli_cpuid_is_zen3( family, model, features ) )
@@ -158,6 +216,47 @@ arch_t bli_cpuid_query_id( void )
 	return BLIS_ARCH_GENERIC;
 }
 
+model_t bli_cpuid_query_model_id( arch_t arch_id )
+{
+	// Set default for architectures where separate models haven't been defined.
+	model_t cpuid_model = BLIS_MODEL_DEFAULT;
+
+#ifdef BLIS_CONFIG_ZEN4
+	if (arch_id == BLIS_ARCH_ZEN4)
+	{
+		// Call the CPUID instruction and parse its results into a family id,
+		// model id, and a feature bit field. The return value encodes the
+		// vendor.
+
+		uint32_t __attribute__ ((unused)) vendor;
+		uint32_t family, model, features;
+
+		vendor = bli_cpuid_query( &family, &model, &features );
+
+		// Check CPU model.
+		cpuid_model = bli_cpuid_get_zen4_cpuid_model( family, model, features );
+	}
+#endif
+#ifdef BLIS_CONFIG_ZEN3
+	if (arch_id == BLIS_ARCH_ZEN3)
+	{
+		// Call the CPUID instruction and parse its results into a family id,
+		// model id, and a feature bit field. The return value encodes the
+		// vendor.
+
+		uint32_t __attribute__ ((unused)) vendor;
+		uint32_t family, model, features;
+
+		vendor = bli_cpuid_query( &family, &model, &features );
+
+		// Check CPU model.
+		cpuid_model = bli_cpuid_get_zen3_cpuid_model( family, model, features );
+	}
+#endif
+
+	return cpuid_model;
+}
+
 // -----------------------------------------------------------------------------
 
 bool bli_cpuid_is_skx
@@ -276,19 +375,20 @@ bool bli_cpuid_is_zen4
      )
 {
 	// Check for expected CPU features.
-	const uint32_t expected =   FEATURE_SSE3     |
-                                FEATURE_SSSE3    |
-                                FEATURE_SSE41    |
-                                FEATURE_SSE42    |
-                                FEATURE_AVX      |
-                                FEATURE_AVX2     |
-                                FEATURE_FMA3     |
-                                FEATURE_AVX512F  |
-                                FEATURE_AVX512DQ |
-                                FEATURE_AVX512CD |
-                                FEATURE_AVX512BW |
-                                FEATURE_AVX512VL |
-                                FEATURE_AVX512VNNI;
+	const uint32_t expected = FEATURE_SSE3       |
+	                          FEATURE_SSSE3      |
+	                          FEATURE_SSE41      |
+	                          FEATURE_SSE42      |
+	                          FEATURE_AVX        |
+	                          FEATURE_FMA3       |
+	                          FEATURE_AVX2       |
+	                          FEATURE_AVX512F    |
+	                          FEATURE_AVX512DQ   |
+	                          FEATURE_AVX512CD   |
+	                          FEATURE_AVX512BW   |
+	                          FEATURE_AVX512VL   |
+	                          FEATURE_AVX512VNNI |
+	                          FEATURE_AVX512BF16;
 
 	if ( !bli_cpuid_has_features( features, expected ) ) return FALSE;
 
@@ -302,6 +402,26 @@ bool bli_cpuid_is_zen4
 
 	return TRUE;
 }
+model_t bli_cpuid_get_zen4_cpuid_model
+    (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+    )
+{
+	// Look at model of CPU and set cpuid_model appropriately.
+	// For Zen4, the default is Genoa.
+	model_t cpuid_model = BLIS_MODEL_GENOA;
+
+	if ( family == 0x19 )
+	{
+		if ( 0xA0 <= model && model <= 0xAf ) // Bergamo
+		{
+			cpuid_model = BLIS_MODEL_BERGAMO;
+		}
+	}
+	return cpuid_model;
+}
 
 bool bli_cpuid_is_zen3
      (
@@ -337,6 +457,30 @@ bool bli_cpuid_is_zen3
 
 	return TRUE;
 }
+model_t bli_cpuid_get_zen3_cpuid_model
+    (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+    )
+{
+	// Look at model of CPU and set cpuid_model appropriately.
+	// For Zen3, the default is Milan.
+	model_t cpuid_model = BLIS_MODEL_MILAN;
+	// Only family, model and the L3 size are consulted; features is unused.
+	if ( family == 0x19 )
+	{
+		if ( model <= 0x0f ) // EPYC and ThreadRipper
+		{
+			uint32_t l3_cache_size = bli_cpuid_query_l3_cache_size();
+			if ( l3_cache_size == 786432 ) // in KB, i.e. 768 MB of L3
+			{
+				cpuid_model = BLIS_MODEL_MILAN_X;
+			}
+		}
+	}
+	return cpuid_model;
+}
 
 bool bli_cpuid_is_zen2
      (
@@ -503,85 +647,77 @@ bool bli_cpuid_is_bulldozer
 	return TRUE;
 }
 
-// Check (at runtime) if AVX is supported on the current platform, this is to 
-// ensure that AVX kernels are not used on legacy platforms which results in crash
-
-// The support for AVX is checked only once (when this API is called first time)
-// On subsequent calls the cached value is returned. This is achieved using 
-// pthread_once mechanism since this information does not change once the library
-// is loaded.
-static bool is_avx_supported = FALSE;
-
 
-// Determine if the CPU has support for AVX.
-void bli_cpuid_check_avx_support( void )
+// Determine if the CPU has support for AVX2 and FMA3.
+void bli_cpuid_check_avx2fma3_support
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
 {
-	uint32_t family, model, features;
-
-	// Call the CPUID instruction and parse its results into a family id,
-	// model id, and a feature bit field.
-	bli_cpuid_query( &family, &model, &features );
-
 	// Check for expected CPU features.
 	const uint32_t expected = FEATURE_AVX     |
 	                          FEATURE_FMA3    |
 	                          FEATURE_AVX2;
 
-	if ( !bli_cpuid_has_features( features, expected ) ) 
+	if ( !bli_cpuid_has_features( features, expected ) )
 	{
-		is_avx_supported = FALSE;
+		is_avx2fma3_supported = FALSE;
 	}
-	else 
+	else
 	{
-		is_avx_supported = TRUE;
+		is_avx2fma3_supported = TRUE;
 	}
 }
 
-static bli_pthread_once_t once_check_avx_support = BLIS_PTHREAD_ONCE_INIT;
-
-// Ensure that actual support determincation happens only once
-void bli_cpuid_check_avx_support_once( void )
-{
-#ifndef BLIS_CONFIGURETIME_CPUID
-	bli_pthread_once( &once_check_avx_support,  bli_cpuid_check_avx_support );
-#endif
-}
 
-// API to check if AVX is supported or not on the current platform.
-bool bli_cpuid_is_avx_supported( void )
+// Determine if the CPU has support for AVX512.
+void bli_cpuid_check_avx512_support
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
 {
-	bli_cpuid_check_avx_support_once();
+	// Check for expected CPU features.
+	const uint32_t expected = FEATURE_AVX      |
+	                          FEATURE_FMA3     |
+	                          FEATURE_AVX2     |
+	                          FEATURE_AVX512F  |
+	                          FEATURE_AVX512DQ |
+	                          FEATURE_AVX512CD |
+	                          FEATURE_AVX512BW |
+	                          FEATURE_AVX512VL;
 
-	return is_avx_supported;
+	if ( !bli_cpuid_has_features( features, expected ) )
+	{
+		is_avx512_supported = FALSE;
+	}
+	else
+	{
+		is_avx512_supported = TRUE;
+	}
 }
 
-
-// Check (at runtime) if AVX512_VNNI is supported on the current platform, this
-// is to ensure that AVX512_VNNI kernels are not used on legacy platforms which
-// results in crash.
-
-// The support for AVX512_VNNI is checked only once (when this API is called
-// first time). On subsequent calls the cached value is returned.
-static bool is_avx512vnni_supported = FALSE;
-
 // Determine if the CPU has support for AVX512_VNNI.
-void bli_cpuid_check_avx512vnni_support( void )
+void bli_cpuid_check_avx512vnni_support
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
 {
-	uint32_t family, model, features;
-
-	// Call the CPUID instruction and parse its results into a family id,
-	// model id, and a feature bit field.
-	bli_cpuid_query( &family, &model, &features );
-
 	// Check for expected CPU features.
-	const uint32_t expected =	FEATURE_AVX        |
-								FEATURE_FMA3       |
-								FEATURE_AVX2       |
-								FEATURE_AVX512F    |
-								FEATURE_AVX512DQ   |
-								FEATURE_AVX512BW   |
-								FEATURE_AVX512VL   |
-								FEATURE_AVX512VNNI;
+	const uint32_t expected = FEATURE_AVX        |
+	                          FEATURE_FMA3       |
+	                          FEATURE_AVX2       |
+	                          FEATURE_AVX512F    |
+	                          FEATURE_AVX512DQ   |
+	                          FEATURE_AVX512CD   |
+	                          FEATURE_AVX512BW   |
+	                          FEATURE_AVX512VL   |
+	                          FEATURE_AVX512VNNI;
 
 	if ( !bli_cpuid_has_features( features, expected ) )
 	{
@@ -593,30 +729,25 @@ void bli_cpuid_check_avx512vnni_support( void )
 	}
 }
 
-// The support for AVX512_BF16 is checked only once (when this API is called
-// first time). On subsequent calls the cached value is returned.
-static bool is_avx512bf16_supported = FALSE;
-
 // Determine if the CPU has support for AVX512_BF16.
-void bli_cpuid_check_avx512_bf16_support( void )
+void bli_cpuid_check_avx512bf16_support
+     (
+       uint32_t family,
+       uint32_t model,
+       uint32_t features
+     )
 {
-	uint32_t family, model, features;
-
-	// Call the CPUID instruction and parse its results into a family id,
-	// model id, and a feature bit field.
-	bli_cpuid_query( &family, &model, &features );
-
 	// Check for expected CPU features.
-	const uint32_t expected =	FEATURE_AVX        |
-								FEATURE_FMA3       |
-								FEATURE_AVX2       |
-								FEATURE_AVX512F    |
-								FEATURE_AVX512DQ   |
-								FEATURE_AVX512BW   |
-								FEATURE_AVX512VL   |
-								FEATURE_AVX512VNNI |
-								FEATURE_AVX512BF16
-								;
+	const uint32_t expected = FEATURE_AVX        |
+	                          FEATURE_FMA3       |
+	                          FEATURE_AVX2       |
+	                          FEATURE_AVX512F    |
+	                          FEATURE_AVX512DQ   |
+	                          FEATURE_AVX512CD   |
+	                          FEATURE_AVX512BW   |
+	                          FEATURE_AVX512VL   |
+	                          FEATURE_AVX512VNNI |
+	                          FEATURE_AVX512BF16;
 
 	if ( !bli_cpuid_has_features( features, expected ) )
 	{
@@ -628,41 +759,76 @@ void bli_cpuid_check_avx512_bf16_support( void )
 	}
 }
 
-static bli_pthread_once_t once_check_avx512vnni_support = BLIS_PTHREAD_ONCE_INIT;
-static bli_pthread_once_t once_check_avx512_bf16_support = BLIS_PTHREAD_ONCE_INIT;
 
-// Ensure that actual support determination happens only once
-void bli_cpuid_check_avx512vnni_support_once( void )
+// Ensure that actual support determination happens only once from AVX
+// support routines below.
+
+static bli_pthread_once_t once_check_cpuid_query_id = BLIS_PTHREAD_ONCE_INIT;
+
+void bli_cpuid_query_id_wrapper( void )
+{
+	// bli_pthread_once() is handed this void(void) wrapper; drop the arch id.
+	( void )bli_cpuid_query_id();
+}
+void bli_cpuid_query_id_once( void )
 {
 #ifndef BLIS_CONFIGURETIME_CPUID
-	bli_pthread_once( &once_check_avx512vnni_support,  bli_cpuid_check_avx512vnni_support );
+	bli_pthread_once( &once_check_cpuid_query_id, bli_cpuid_query_id_wrapper );
 #endif
 }
 
-// Ensure that actual support determination happens only once to avoid performance hit
-void bli_cpuid_check_avx512_bf16_support_once( void )
+// API to check if AVX2 and FMA3 are supported or not on the current platform.
+bool bli_cpuid_is_avx2fma3_supported( void )
 {
-#ifndef BLIS_CONFIGURETIME_CPUID
-	bli_pthread_once( &once_check_avx512_bf16_support,  bli_cpuid_check_avx512_bf16_support );
-#endif
+	bli_cpuid_query_id_once();
+	return is_avx2fma3_supported;
+}
+
+// API to check if AVX512 is supported or not on the current platform.
+bool bli_cpuid_is_avx512_supported( void )
+{
+	bli_cpuid_query_id_once();
+	return is_avx512_supported;
 }
 
 // API to check if AVX512_VNNI is supported or not on the current platform.
 bool bli_cpuid_is_avx512vnni_supported( void )
 {
-	bli_cpuid_check_avx512vnni_support_once();
-
+	bli_cpuid_query_id_once();
 	return is_avx512vnni_supported;
 }
 
 // API to check if AVX512_bf16 is supported or not on the current platform.
-bool bli_cpuid_is_avx512_bf16_supported( void )
+bool bli_cpuid_is_avx512bf16_supported( void )
 {
-	bli_cpuid_check_avx512_bf16_support_once();
-
+	bli_cpuid_query_id_once();
 	return is_avx512bf16_supported;
 }
 
+uint32_t bli_cpuid_query_l1d_cache_size( void )
+{
+	bli_cpuid_query_id_once();
+	return bli_l1d_cache_size;
+}
+
+uint32_t bli_cpuid_query_l1i_cache_size( void )
+{
+	bli_cpuid_query_id_once();
+	return bli_l1i_cache_size;
+}
+
+uint32_t bli_cpuid_query_l2_cache_size( void )
+{
+	bli_cpuid_query_id_once();
+	return bli_l2_cache_size;
+}
+
+uint32_t bli_cpuid_query_l3_cache_size( void )
+{
+	bli_cpuid_query_id_once();
+	return bli_l3_cache_size;
+}
+
 #elif defined(__aarch64__) || defined(__arm__) || defined(_M_ARM)
 
 arch_t bli_cpuid_query_id( void )
@@ -815,7 +981,7 @@ bool bli_cpuid_is_cortexa9
 
    Copyright (C) 2017, The University of Texas at Austin
    Copyright (C) 2017, Devin Matthews
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -1130,6 +1296,22 @@ uint32_t bli_cpuid_query
 		return VENDOR_UNKNOWN;
 }
 
+void bli_cpuid_check_cache( uint32_t vendor )
+{
+        if ( vendor == VENDOR_AMD ) // other vendors keep the initial values
+	{
+		uint32_t eax, ebx, ecx, edx;
+		// Leaf 0x80000005: L1 sizes sit in bits 31:24 of ECX (data) and EDX (instr).
+		__cpuid( 0x80000005, eax, ebx, ecx, edx );
+		bli_l1d_cache_size = (ecx >> 24) & 0xFF;
+		bli_l1i_cache_size = (edx >> 24) & 0xFF;
+		// Leaf 0x80000006: L2 size in ECX[31:16]; L3 in EDX[31:18], scaled by 512.
+		__cpuid( 0x80000006, eax, ebx, ecx, edx );
+		bli_l2_cache_size = (ecx >> 16) & 0xFFFF;
+		bli_l3_cache_size = ((edx >> 18) & 0x3FFF) * 512;
+	}
+}
+
 void get_cpu_name( char *cpu_name )
 {
 	uint32_t eax, ebx, ecx, edx;
diff --git a/frame/base/bli_cpuid.h b/frame/base/bli_cpuid.h
index 805f31bf2e..c34bb37485 100644
--- a/frame/base/bli_cpuid.h
+++ b/frame/base/bli_cpuid.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -53,6 +53,13 @@
 
 arch_t bli_cpuid_query_id( void );
 
+model_t bli_cpuid_query_model_id( arch_t id );
+
+uint32_t bli_cpuid_query_l1d_cache_size( void );
+uint32_t bli_cpuid_query_l1i_cache_size( void );
+uint32_t bli_cpuid_query_l2_cache_size( void );
+uint32_t bli_cpuid_query_l3_cache_size( void );
+
 // Intel
 bool bli_cpuid_is_skx( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_knl( uint32_t family, uint32_t model, uint32_t features );
@@ -62,6 +69,7 @@ bool bli_cpuid_is_penryn( uint32_t family, uint32_t model, uint32_t features );
 
 // AMD
 bool bli_cpuid_is_zen4( uint32_t family, uint32_t model, uint32_t features );
+bool bli_cpuid_is_avx512_fallback( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_zen3( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_zen2( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_zen( uint32_t family, uint32_t model, uint32_t features );
@@ -70,6 +78,9 @@ bool bli_cpuid_is_steamroller( uint32_t family, uint32_t model, uint32_t feature
 bool bli_cpuid_is_piledriver( uint32_t family, uint32_t model, uint32_t features );
 bool bli_cpuid_is_bulldozer( uint32_t family, uint32_t model, uint32_t features );
 
+model_t bli_cpuid_get_zen4_cpuid_model( uint32_t family, uint32_t model, uint32_t features );
+model_t bli_cpuid_get_zen3_cpuid_model( uint32_t family, uint32_t model, uint32_t features );
+
 // ARM
 bool bli_cpuid_is_thunderx2( uint32_t model, uint32_t part, uint32_t features );
 bool bli_cpuid_is_cortexa57( uint32_t model, uint32_t part, uint32_t features );
@@ -79,6 +90,8 @@ bool bli_cpuid_is_cortexa9( uint32_t model, uint32_t part, uint32_t features );
 
 uint32_t bli_cpuid_query( uint32_t* family, uint32_t* model, uint32_t* features );
 
+void bli_cpuid_check_cache( uint32_t vendor );
+
 // -----------------------------------------------------------------------------
 
 //
@@ -133,9 +146,16 @@ BLIS_INLINE bool bli_cpuid_has_features( uint32_t have, uint32_t want )
 
 void get_cpu_name( char *cpu_name );
 int  vpu_count( void );
-bool bli_cpuid_is_avx_supported(void);
+
+bool bli_cpuid_is_avx2fma3_supported(void);
+bool bli_cpuid_is_avx512_supported(void);
 bool bli_cpuid_is_avx512vnni_supported(void);
-bool bli_cpuid_is_avx512_bf16_supported(void);
+bool bli_cpuid_is_avx512bf16_supported(void);
+
+void bli_cpuid_check_avx2fma3_support( uint32_t family, uint32_t model, uint32_t features );
+void bli_cpuid_check_avx512_support( uint32_t family, uint32_t model, uint32_t features );
+void bli_cpuid_check_avx512vnni_support( uint32_t family, uint32_t model, uint32_t features );
+void bli_cpuid_check_avx512bf16_support( uint32_t family, uint32_t model, uint32_t features );
 
 enum
 {
diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c
index 7fabc2b955..2585a16ce7 100644
--- a/frame/base/bli_env.c
+++ b/frame/base/bli_env.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -220,6 +220,84 @@ gint_t bli_env_get_var_arch_type( const char* env, gint_t fallback )
 	return r_val;
 }
 
+gint_t bli_env_get_var_model_type( const char* env, gint_t fallback )
+{
+	// Translate environment variable env into a model_t value: fallback if
+	// unset, the integer itself if it parses as non-zero, the BLIS_MODEL_*
+	// value for a known case-insensitive model name, and 0 otherwise.
+	gint_t      r_val;
+	const char* str = getenv( env );
+	char        name[ 16 ];
+	size_t      i;
+
+	// Set the return value based on the string obtained from getenv().
+	if ( str != NULL )
+	{
+		// If there was no error, convert the string to an integer and
+		// prepare to return that integer.
+		r_val = ( gint_t )strtol( str, NULL, 10 );
+
+		if (r_val == 0)
+		{
+			// Could be deliberately 0 (meaning an ERROR) or a
+			// non-numeric value. Non-zero integers bypass this block
+			// and select the code path directly. Here we look for known
+			// names and leave r_val at 0 if none match. This code MUST
+			// be kept in sync with the model_t enumeration in
+			// bli_type_defs.h and the model_name array in bli_arch.c
+
+			// Lowercase into a local buffer: the string returned by
+			// getenv() must not be modified, and tolower() requires an
+			// unsigned char value. All known names fit, so an over-long
+			// (truncated) value can never match one of them.
+			for ( i = 0; i < sizeof( name ) - 1 && str[i] != '\0'; i++ )
+				name[i] = ( char )tolower( ( unsigned char )str[i] );
+			name[i] = '\0';
+
+			// AMD
+			if (strcmp(name, "genoa") == 0)
+			{
+				r_val = BLIS_MODEL_GENOA;
+			}
+			else if (strcmp(name, "bergamo") == 0)
+			{
+				r_val = BLIS_MODEL_BERGAMO;
+			}
+			else if ((strcmp(name, "genoa_x") == 0) ||
+			         (strcmp(name, "genoa-x") == 0) ||
+			         (strcmp(name, "genoax") == 0))
+			{
+				r_val = BLIS_MODEL_GENOA_X;
+			}
+			else if (strcmp(name, "milan") == 0)
+			{
+				r_val = BLIS_MODEL_MILAN;
+			}
+			else if ((strcmp(name, "milan_x") == 0) ||
+			         (strcmp(name, "milan-x") == 0) ||
+			         (strcmp(name, "milanx") == 0))
+			{
+				r_val = BLIS_MODEL_MILAN_X;
+			}
+			// Default (all architectures)
+			else if (strcmp(name, "default") == 0)
+			{
+				r_val = BLIS_MODEL_DEFAULT;
+			}
+
+			// No else case means we return r_val=0, i.e. this behaves
+			// the same as generic bli_env_get_var().
+		}
+	}
+	else
+	{
+		// If there was an error, use the "fallback" as the return value.
+		r_val = fallback;
+	}
+
+	return r_val;
+}
+
 #if 0
 #ifdef _MSC_VER
 #define strerror_r(errno,buf,len) strerror_s(buf,len,errno)
diff --git a/frame/base/bli_env.h b/frame/base/bli_env.h
index eaa778cd20..ac537a8e70 100644
--- a/frame/base/bli_env.h
+++ b/frame/base/bli_env.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,6 +41,7 @@ gint_t bli_env_get_var( const char* env, gint_t fallback );
 //void  bli_env_set_var( const char* env, dim_t value );
 
 gint_t bli_env_get_var_arch_type( const char* env, gint_t fallback );
+gint_t bli_env_get_var_model_type( const char* env, gint_t fallback );
 
 #endif
 
diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c
index 1381afef0e..06b1467a83 100644
--- a/frame/base/bli_error.c
+++ b/frame/base/bli_error.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -103,8 +103,9 @@ static char bli_error_string[BLIS_MAX_NUM_ERR_MSGS][BLIS_MAX_ERR_MSG_LENGTH] =
 
 	[-BLIS_EXPECTED_OBJECT_ALIAS]                = "Expected object to be alias.",
 
-	[-BLIS_INVALID_ARCH_ID]                      = "Invalid architecture id value.",
-	[-BLIS_UNINITIALIZED_GKS_CNTX]               = "Accessed uninitialized context in gks; BLIS_ARCH_TYPE is probably set to an invalid architecture id.",
+	[-BLIS_INVALID_ARCH_ID]                      = "Invalid architecture id value (env var "__blis_arch_type_name").",
+	[-BLIS_INVALID_MODEL_ID]                     = "Invalid architecture model id value (env var "__blis_model_type_name").",
+	[-BLIS_UNINITIALIZED_GKS_CNTX]               = "Accessed uninitialized context in gks; "__blis_arch_type_name" or "__blis_model_type_name" is probably set to an invalid architecture id.",
 
 	[-BLIS_MC_DEF_NONMULTIPLE_OF_MR]             = "Default MC is non-multiple of MR for one or more datatypes.",
 	[-BLIS_MC_MAX_NONMULTIPLE_OF_MR]             = "Maximum MC is non-multiple of MR for one or more datatypes.",
diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c
index acb36d306f..56eb556977 100644
--- a/frame/base/bli_gks.c
+++ b/frame/base/bli_gks.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2021, Advanced Micro Devices, Inc.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -54,6 +54,16 @@ typedef void (*ind_cntx_init_ft)( ind_t method, num_t dt, cntx_t* cntx );
 
 // -----------------------------------------------------------------------------
 
+// gks_once_init is the control object handed to bli_pthread_once() below.
+// As with pthread_once(), the init routine is guaranteed to execute exactly
+// once among all threads that pass in this control object.
+static bli_pthread_once_t gks_once_init = BLIS_PTHREAD_ONCE_INIT;
+
+void bli_gks_init_once( void )
+{
+	bli_pthread_once( &gks_once_init, bli_gks_init );
+}
+
 void bli_gks_init( void )
 {
 	{
@@ -383,7 +393,7 @@ void bli_gks_register_cntx
 
 	// At this point, we know the pointer to the array of cntx_t* is NULL and
 	// needs to be allocated. Allocate the memory and initialize it to
-	// zeros/NULL, storing the address of the alloacted memory at the element
+	// zeros/NULL, storing the address of the allocated memory at the element
 	// for the current architecture id.
 	gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS );
 
diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h
index 33a9f16c95..14fab6c8ce 100644
--- a/frame/base/bli_gks.h
+++ b/frame/base/bli_gks.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -36,6 +37,7 @@
 #define BLIS_GKS_H
 
 void    bli_gks_init( void );
+void    bli_gks_init_once( void );
 void    bli_gks_finalize( void );
 
 BLIS_EXPORT_BLIS void    bli_gks_init_index( void );
diff --git a/frame/base/bli_init.c b/frame/base/bli_init.c
index b037fbd217..a616993d5f 100644
--- a/frame/base/bli_init.c
+++ b/frame/base/bli_init.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,10 +40,14 @@
 void bli_init( void )
 {
 	bli_init_once();
+	// Always update thread-local rntm from environment as threading values
+	// may have changed from any previous calls.
+	bli_thread_update_tl();
 }
 
 void bli_finalize( void )
 {
+	bli_thread_finalize_tl();
 	bli_finalize_once();
 }
 
@@ -52,6 +56,9 @@ void bli_finalize( void )
 void bli_init_auto( void )
 {
 	bli_init_once();
+	// Always update thread-local rntm from environment as threading values
+	// may have changed from any previous calls.
+	bli_thread_update_tl();
 }
 
 void bli_finalize_auto( void )
@@ -65,6 +72,7 @@ void bli_finalize_auto( void )
 
 #else
 
+	bli_thread_finalize_tl();
 	bli_finalize_once();
 
 #endif
@@ -77,7 +85,7 @@ void bli_init_apis( void )
 	/* Initialize DTL Library with trace level set by the user */
 	AOCL_DTL_INITIALIZE(AOCL_DTL_TRACE_LEVEL);
 	// Initialize various sub-APIs.
-	bli_gks_init();
+	bli_gks_init_once();
 	bli_ind_init();
 	bli_thread_init();
 	bli_pack_init();
diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c
index c6d2cf5b4a..98131623d8 100644
--- a/frame/base/bli_rntm.c
+++ b/frame/base/bli_rntm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2021 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,6 +39,9 @@
 // along with a few other key parameters.
 rntm_t global_rntm;
 
+// Make thread settings local to each thread calling BLIS routines
+BLIS_THREAD_LOCAL rntm_t tl_rntm = BLIS_RNTM_INITIALIZER;
+
 // A mutex to allow synchronous access to global_rntm.
 bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
 
@@ -46,38 +49,43 @@ bli_pthread_mutex_t global_rntm_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER;
 
 void bli_rntm_init_from_global( rntm_t* rntm )
 {
-	// We must ensure that global_rntm has been initialized.
+	// Initializes supplied rntm from a combination of global and
+	// thread local data (global_rntm and tl_rntm respectively).
+
+	dim_t jc, pc, ic, jr, ir;
+
+	// We must ensure that global_rntm has been initialized
 	bli_init_once();
 
-	// Fetch the number of threads based on the order of precedence,
-	// or the latest value of number of threads,
-	// if set by the Application using omp_set_num_threads(nt) API.
-#ifdef BLIS_ENABLE_OPENMP
-	dim_t n_threads = omp_get_max_threads();
-#endif
+	// We must also ensure that tl_rntm has been updated.
+	bli_thread_update_tl();
 
 	// Acquire the mutex protecting global_rntm.
 	bli_pthread_mutex_lock( &global_rntm_mutex );
 
-	// If BLIS_NUM_THREADS environment variable is not set or
-	// if bli_thread_set_num_threads() API is not used by the
-	// application, blis_mt flag will be false.
-	// Then we derive number of threads using OpenMP API
-	// omp_get_max_threads(), and update into global rntm structure,
-	// before copying into local rntm structure.
-
-	// This updated value will be used in the subsequent parallel regions.
-	if(!(global_rntm.blis_mt))
-	{
-#ifdef BLIS_ENABLE_OPENMP
-	    global_rntm.num_threads = n_threads;
-#endif
-	}
-
+	// Initialize supplied rntm from global_rntm.
 	*rntm = global_rntm;
 
 	// Release the mutex protecting global_rntm.
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
+
+	// Now update threading info in supplied rntm from tl_rntm
+	bli_rntm_set_auto_factor_only( tl_rntm.auto_factor, rntm );
+	bli_rntm_set_num_threads_only( tl_rntm.num_threads, rntm );
+
+	jc = bli_rntm_jc_ways( &tl_rntm );
+	pc = bli_rntm_pc_ways( &tl_rntm );
+	ic = bli_rntm_ic_ways( &tl_rntm );
+	jr = bli_rntm_jr_ways( &tl_rntm );
+	ir = bli_rntm_ir_ways( &tl_rntm );
+	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+
+	bli_rntm_set_blis_mt_only( tl_rntm.blis_mt, rntm );
+
+#if 0
+	printf( "bli_rntm_init_from_global()\n" );
+	bli_rntm_print( rntm );
+#endif
 }
 
 // -----------------------------------------------------------------------------
@@ -96,9 +104,9 @@ void bli_rntm_set_ways_for_op
 	// kind of information is already stored in the rntm_t object.
 	bli_rntm_set_ways_from_rntm( m, n, k, rntm );
 
-#if 0
-printf( "bli_rntm_set_ways_for_op()\n" );
-bli_rntm_print( rntm );
+#ifdef PRINT_THREADING
+	printf( "bli_rntm_set_ways_for_op()\n" );
+	bli_rntm_print( rntm );
 #endif
 
 	// Now modify the number of ways, if necessary, based on the operation.
@@ -211,8 +219,9 @@ void bli_rntm_set_ways_from_rntm
 	// First, we establish whether or not the number of threads is set.
 	if ( nt > 0 ) nt_set = TRUE;
 
-	// Take this opportunity to set the auto_factor field.
-	if ( nt_set ) auto_factor = TRUE;
+	// Take this opportunity to set the auto_factor field (when using
+	// more than one thread).
+	if ( nt_set && nt > 1 ) auto_factor = TRUE;
 
 	// Next, we establish whether or not any of the ways of parallelism
 	// for each loop were set. If any of the ways are set (positive), we
@@ -300,6 +309,11 @@ void bli_rntm_set_ways_from_rntm
 	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+
+#ifdef PRINT_THREADING
+	printf( "bli_rntm_set_ways_from_rntm()\n" );
+	bli_rntm_print( rntm );
+#endif
 }
 
 void bli_rntm_set_ways_from_rntm_sup
@@ -337,8 +351,9 @@ void bli_rntm_set_ways_from_rntm_sup
 	// First, we establish whether or not the number of threads is set.
 	if ( nt > 0 ) nt_set = TRUE;
 
-	// Take this opportunity to set the auto_factor field.
-	if ( nt_set ) auto_factor = TRUE;
+	// Take this opportunity to set the auto_factor field (when using
+	// more than one thread).
+	if ( nt_set && nt > 1 ) auto_factor = TRUE;
 
 	// Next, we establish whether or not any of the ways of parallelism
 	// for each loop were set. If any of the ways are set (positive), we
@@ -435,6 +450,11 @@ void bli_rntm_set_ways_from_rntm_sup
 	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+
+#ifdef PRINT_THREADING
+	printf( "bli_rntm_set_ways_from_rntm_sup()\n" );
+	bli_rntm_print( rntm );
+#endif
 }
 
 void bli_rntm_print
@@ -446,14 +466,16 @@ void bli_rntm_print
 
 	dim_t nt = bli_rntm_num_threads( rntm );
 
+	bool mt = bli_rntm_blis_mt( rntm );
+
 	dim_t jc = bli_rntm_jc_ways( rntm );
 	dim_t pc = bli_rntm_pc_ways( rntm );
 	dim_t ic = bli_rntm_ic_ways( rntm );
 	dim_t jr = bli_rntm_jr_ways( rntm );
 	dim_t ir = bli_rntm_ir_ways( rntm );
 
-	printf( "rntm contents	nt  jc  pc  ic  jr  ir\n" );
-	printf( "autofac? %1d | %4d%4d%4d%4d%4d%4d\n", (int)af,
+	printf( "rntm contents	       |   nt  jc  pc  ic  jr  ir\n" );
+	printf( "autofac, blis_mt? %1d, %1d | %4d%4d%4d%4d%4d%4d\n", (int)af, (int)mt,
 							   (int)nt, (int)jc, (int)pc,
 							   (int)ic, (int)jr, (int)ir );
 }
@@ -524,10 +546,10 @@ dim_t bli_rntm_calc_num_threads_in
 
 
 #ifdef AOCL_DYNAMIC
-//calculates the optimum number of threads using m, n, k dimensions.
-//This function modifies only the local copy of rntm with optimum threads.
-//Global rntm will remain unchanged. As a result, num_threads set by
-//application is available in global_rntm data structure.
+// Calculates the optimum number of threads using m, n, k dimensions.
+// This function modifies only the local copy of rntm with optimum threads.
+// tl_rntm will remain unchanged. As a result, num_threads set by
+// application is available in tl_rntm data structure.
 
 void bli_nthreads_optimum(
 				   obj_t*  a,
@@ -553,78 +575,647 @@ void bli_nthreads_optimum(
 		dim_t n = bli_obj_width(c);
 		dim_t k = bli_obj_width_after_trans(a);
 
-		if( k >= 128)
-		{
-			if(n <= 15)
-			{
-				if(m < 128) 	 n_threads_ideal = 8;
-				else if(m < 256) n_threads_ideal = 16;
-				else if(m < 512) n_threads_ideal = 32;
-				else 			 n_threads_ideal = 64;
-			}else if (n <= 64)
-			{
-				if(m < 128) 	 n_threads_ideal = 16;
-				else if(m < 256) n_threads_ideal = 32;
-				else 			 n_threads_ideal = 64;
-			}else{
-				if(m < 256) n_threads_ideal = 32;
-				else 		n_threads_ideal = 64;
-            }
-		}
-		else
+		if(bli_arch_query_id() == BLIS_ARCH_ZEN4)
 		{
-			if(m > 10000)
-			{
-				// current logic is only limiting threads to
-				// less or equal to 64 - limits performance.
-				// To deal with larger matrix sizes we need to use
-				// large number of threads to improve performance
-				// Need to derive this upperTH - and
-				// if matrix -sizes are larger and user wants
-				// to use higher number of threads - that should be allowed.
-
-				// if (n > UpperTH) n_threads_ideal = n_threads;
-				if (n > 200 )	    n_threads_ideal = 64;
-				else if ( n > 120 ) n_threads_ideal = 32;
-				else if ( n > 40  ) n_threads_ideal = 16;
-				else if ( n > 10  ) n_threads_ideal = 8;
-				else 				n_threads_ideal = 4;
-			}
-			else if( m > 1000)
+			if(n < m)
 			{
-				if (n <= 10) 		  n_threads_ideal = 4;
-				else if ( n <= 512 )  n_threads_ideal = 8;
-				else if ( n <= 1024 ) n_threads_ideal = 16;
-				else if ( n <= 2048 ) n_threads_ideal = 32;
-				else 				  n_threads_ideal = 64;
+				if(k <= 32)
+				{
+					if( m <= 1000 )
+					{
+						n_threads_ideal = 8;
+					}
+					else if( m <= 10000)
+					{
+						if( n <= 500 )
+						{
+							n_threads_ideal = 16;
+						}
+						else if( n <= 1000 )
+						{
+							n_threads_ideal = 64;
+						}
+						else
+						{
+							n_threads_ideal = 96;
+						}
+					}
+					else
+					{
+						n_threads_ideal = 96;
+					}
+				}
+				else if(k <= 64)
+				{
+					if( (m <= 100) || (m <= 500 && n <= 100))
+					{
+						n_threads_ideal = 8;
+					}
+					else if(m <= 500)
+					{
+						n_threads_ideal = 16;
+					}
+					else if(m <= 1000)
+					{
+						if(n <= 50)
+						{
+							n_threads_ideal = 8;
+						}
+						else if(n <= 250)
+						{
+							n_threads_ideal = 16;
+						}
+						else
+						{
+							n_threads_ideal = 24;
+						}
+					}
+					else if(m <= 10000)
+					{
+						if(n <= 500)
+						{
+							n_threads_ideal = 24;
+						}
+						else if(n <= 1000)
+						{
+							n_threads_ideal = 64;
+						}
+					}
+					else if( m <= 20000 && n <= 500)
+					{
+						n_threads_ideal = 96;
+					}
+					else if( m <= 30000)
+					{
+						if(n <= 1000)
+						{
+							n_threads_ideal = 144;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( m <= 40000 && n <= 1000)
+					{
+						n_threads_ideal = 168;
+					}
+					else
+					{
+						n_threads_ideal = 192;
+					}
+				}
+				else if(k <= 128)
+				{
+					if( (m <= 100) || (m <= 500 && n <= 50))
+					{
+						n_threads_ideal = 8;
+					}
+					else if(m <= 500)
+					{
+						if(n <= 100)
+						{
+							n_threads_ideal = 16;
+						}
+						else
+						{
+							n_threads_ideal = 24;
+						}
+					}
+					else if( m <= 1000 )
+					{
+						if(n <= 200)
+						{
+							n_threads_ideal = 24;
+						}
+						else
+						{
+							n_threads_ideal = 48;
+						}
+					}
+					else if( m <= 10000 )
+					{
+						if(n <= 50)
+						{
+							n_threads_ideal = 32;
+						}
+						else if(n <= 500)
+						{
+							n_threads_ideal = 48;
+						}
+						else if(n <= 750)
+						{
+							n_threads_ideal = 96;
+						}
+						else if(n <= 1000)
+						{
+							n_threads_ideal = 128;
+						}
+						else if(n <= 5000)
+						{
+							n_threads_ideal = 144;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( m <= 30000 )
+					{
+						if(n <= 1000)
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( m <= 40000 )
+					{
+						if(n <= 600)
+						{
+							n_threads_ideal = 144;
+						}
+						else if(n <= 1000)
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else
+					{
+						n_threads_ideal = 192;
+					}
+				}
+				else
+				{
+					if( m <= 100 )
+					{
+						n_threads_ideal = 8;
+					}
+					else if( m <= 500 )
+					{
+						if( n <= 50 )
+						{
+							n_threads_ideal = 16;
+						}
+						else if( n <= 200 )
+						{
+							n_threads_ideal = 32;
+						}
+						else
+						{
+							n_threads_ideal = 48;
+						}
+					}
+					else if( m <= 1000 )
+					{
+						if(n <= 100 )
+						{
+							n_threads_ideal = 32;
+						}
+						else
+						{
+							n_threads_ideal = 48;
+						}
+					}
+					else if( m <= 10000 )
+					{
+						if(n <= 200 )
+						{
+							n_threads_ideal = 48;
+						}
+						else if( n <= 500 )
+						{
+							n_threads_ideal = 96;
+						}
+						else if( n <= 600 )
+						{
+							n_threads_ideal = 144;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( m <= 20000 && n <= 750 )
+					{
+						n_threads_ideal = 168;
+					}
+					else
+					{
+						n_threads_ideal = 192;
+					}
+				}
 			}
-			else if(m > 210)
+			else if(m < n)
 			{
-				if(n < 10)  	   n_threads_ideal = 4;
-				else if(n <= 512)  n_threads_ideal = 8;
-				else if(n <= 1024) n_threads_ideal = 16;
-				else if(n <= 2048) n_threads_ideal = 32;
-				else 			   n_threads_ideal = 64;
+				if(k <= 32)
+				{
+					if( n <= 1000 )
+					{
+						n_threads_ideal = 8;
+					}
+					else if( n <= 10000 )
+					{
+						if( m <= 500 )
+						{
+							n_threads_ideal = 16;
+						}
+						else if( m <= 1000 )
+						{
+							n_threads_ideal = 32;
+						}
+						else
+						{
+							n_threads_ideal = 96;
+						}
+					}
+					else
+					{
+						n_threads_ideal = 96;
+					}
+				}
+				else if(k <= 64)
+				{
+					if( (n <= 100) || (n <= 500 && m <= 100) )
+					{
+						n_threads_ideal = 8;
+					}
+					else if(n <= 500)
+					{
+						n_threads_ideal = 16;
+					}
+					else if( n <= 1000 )
+					{
+						if( m <= 200)
+						{
+							n_threads_ideal = 16;
+						}
+						else
+						{
+							n_threads_ideal = 32;
+						}
+					}
+					else if( n <= 10000 )
+					{
+						if( m <= 100)
+						{
+							n_threads_ideal = 32;
+						}
+						else if( m <= 500)
+						{
+							n_threads_ideal = 48;
+						}
+						else if( m <= 1000)
+						{
+							n_threads_ideal = 96;
+						}
+						else if(m <= 2500)
+						{
+							n_threads_ideal = 128;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( n <= 20000 )
+					{
+						if( m < 1000 )
+						{
+							n_threads_ideal = 128;
+						}
+						else if( m < 2500 )
+						{
+							n_threads_ideal = 144;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( n <= 30000)
+					{
+						if( m < 1000 )
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( n <= 40000 )
+					{
+						if(m < 600)
+						{
+							n_threads_ideal = 144;
+						}
+						else if(m < 750)
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else
+					{
+						n_threads_ideal = 192;
+					}
+				}
+				else if(k <= 128)
+				{
+					if( (n <= 100) || (n <= 500 && m <= 50) )
+					{
+						n_threads_ideal = 8;
+					}
+					else if(n <= 500 )
+					{
+						if( m <= 100)
+						{
+							n_threads_ideal = 16;
+						}
+						else
+						{
+							n_threads_ideal = 32;
+						}
+					}
+					else if( n <= 1000 )
+					{
+						if( m <= 50)
+						{
+							n_threads_ideal = 16;
+						}
+						else
+						{
+							n_threads_ideal = 32;
+						}
+					}
+					else if( n <= 10000 )
+					{
+						if(m <= 100 )
+						{
+							n_threads_ideal = 32;
+						}
+						else if(m <= 200 )
+						{
+							n_threads_ideal = 64;
+						}
+						else if(m <= 500 )
+						{
+							n_threads_ideal = 72;
+						}
+						else if(m < 1000 )
+						{
+							n_threads_ideal = 96;
+						}
+						else if(m < 2500 )
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( n <= 20000 )
+					{
+						if(m <= 500 )
+						{
+							n_threads_ideal = 96;
+						}
+						else if(m < 1000 )
+						{
+							n_threads_ideal = 128;
+						}
+						else if(m < 2500 )
+						{
+							n_threads_ideal = 144;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( n <= 30000 )
+					{
+						if(m <= 500 )
+						{
+							n_threads_ideal = 96;
+						}
+						else if(m < 750 )
+						{
+							n_threads_ideal = 128;
+						}
+						else if(m < 1000 )
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else if( n <= 40000 )
+					{
+						if(m < 500 )
+						{
+							n_threads_ideal = 128;
+						}
+						else if(m < 600 )
+						{
+							n_threads_ideal = 144;
+						}
+						else if(m < 750 )
+						{
+							n_threads_ideal = 168;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else
+					{
+						n_threads_ideal = 192;
+					}
+				}
+				else
+				{
+					if(n <= 100)
+					{
+						n_threads_ideal = 8;
+					}
+					else if( n <= 500 )
+					{
+						if( m <= 100)
+						{
+							n_threads_ideal = 16;
+						}
+						else
+						{
+							n_threads_ideal = 32;
+						}
+					}
+					else if( n <= 1000 )
+					{
+						if( m <= 100)
+						{
+							n_threads_ideal = 32;
+						}
+						else
+						{
+							n_threads_ideal = 48;
+						}
+					}
+					else if( n <= 10000 )
+					{
+						if( m <= 50)
+						{
+							n_threads_ideal = 48;
+						}
+						else if(m <= 100)
+						{
+							n_threads_ideal = 64;
+						}
+						else if(m < 750)
+						{
+							n_threads_ideal = 96;
+						}
+						else
+						{
+							n_threads_ideal = 192;
+						}
+					}
+					else
+					{
+						n_threads_ideal = 192;
+					}
+				}
 			}
-			else if(m > 150)
+			else if(m == n)
 			{
-				if(n < 10)  	   n_threads_ideal = 2;
-				else if(n <= 512)  n_threads_ideal = 8;
-				else if(n <= 1024) n_threads_ideal = 16;
-				else if(n <= 2048) n_threads_ideal = 32;
-				else 			   n_threads_ideal = 64;
+				if(k <= 32)
+				{
+					if( m <= 20 )        n_threads_ideal = 1;
+					else if( m <= 40 )   n_threads_ideal = 4;
+					else if( m <= 800 )  n_threads_ideal = 8;
+					else if( m <= 1000 ) n_threads_ideal = 16;
+					else if( m <= 5000 ) n_threads_ideal = 64;
+					else                 n_threads_ideal = 96;
+				}
+				else if(k <= 64)
+				{
+					if(m <= 150) n_threads_ideal = 8;
+					else if(m <= 1000) n_threads_ideal = 16;
+					else if( m <= 2500) n_threads_ideal = 96;
+					else if( m <= 5000) n_threads_ideal = 128;
+					else if( m <= 6000) n_threads_ideal = 128;
+					else n_threads_ideal = 192;
+				}
+				else if( k <= 128)
+				{
+					if( m <= 100) n_threads_ideal = 8;
+					else if(m <= 500) n_threads_ideal = 32;
+					else if( m <= 1000) n_threads_ideal = 64;
+					else if( m <= 5000) n_threads_ideal = 144;
+					else n_threads_ideal = 192;
+				}
+				else
+				{
+					if( m <= 100) n_threads_ideal = 8;
+					else if( m <= 250 ) n_threads_ideal = 32;
+					else if( m <= 500 ) n_threads_ideal = 48;
+					else if( m <= 1000) n_threads_ideal = 96;
+					else n_threads_ideal = 192;
+				}
 			}
-			else if( ( m < 34) && (k < 68) && ( n < 34))
+		}
+		else
+		{
+			if( k >= 128)
 			{
-				n_threads_ideal = 1;
+				if(n <= 15)
+				{
+					if(m < 128) 	 n_threads_ideal = 8;
+					else if(m < 256) n_threads_ideal = 16;
+					else if(m < 512) n_threads_ideal = 32;
+					else 			 n_threads_ideal = 64;
+				}
+				else if (n <= 64)
+				{
+					if(m < 128) 	 n_threads_ideal = 16;
+					else if(m < 256) n_threads_ideal = 32;
+					else 			 n_threads_ideal = 64;
+				}
+				else
+				{
+					if(m < 256) n_threads_ideal = 32;
+					else 		n_threads_ideal = 64;
+				}
 			}
 			else
-			{	//(m<150 && k<128)
-				if(n < 20) n_threads_ideal = 1;
-				if(n < 64) n_threads_ideal = 4;
-				else	   n_threads_ideal = 8;
+			{
+				if(m > 10000)
+				{
+					// current logic is only limiting threads to
+					// less or equal to 64 - limits performance.
+					// To deal with larger matrix sizes we need to use
+					// large number of threads to improve performance
+					// Need to derive this upperTH - and
+					// if matrix -sizes are larger and user wants
+					// to use higher number of threads - that should be allowed.
+
+					// if (n > UpperTH) n_threads_ideal = n_threads;
+					if (n > 200 )	    n_threads_ideal = 64;
+					else if ( n > 120 ) n_threads_ideal = 32;
+					else if ( n > 40  ) n_threads_ideal = 16;
+					else if ( n > 10  ) n_threads_ideal = 8;
+					else 				n_threads_ideal = 4;
+				}
+				else if( m > 1000)
+				{
+					if (n <= 10) 		  n_threads_ideal = 4;
+					else if ( n <= 512 )  n_threads_ideal = 8;
+					else if ( n <= 1024 ) n_threads_ideal = 16;
+					else if ( n <= 2048 ) n_threads_ideal = 32;
+					else 				  n_threads_ideal = 64;
+				}
+				else if(m > 210)
+				{
+					if(n < 10)  	   n_threads_ideal = 4;
+					else if(n <= 512)  n_threads_ideal = 8;
+					else if(n <= 1024) n_threads_ideal = 16;
+					else if(n <= 2048) n_threads_ideal = 32;
+					else 			   n_threads_ideal = 64;
+				}
+				else if(m > 150)
+				{
+					if(n < 10)  	   n_threads_ideal = 2;
+					else if(n <= 512)  n_threads_ideal = 8;
+					else if(n <= 1024) n_threads_ideal = 16;
+					else if(n <= 2048) n_threads_ideal = 32;
+					else 			   n_threads_ideal = 64;
+				}
+				else if( ( m < 34) && (k < 68) && ( n < 34))
+				{
+					n_threads_ideal = 1;
+				}
+				else
+				{	//(m<150 && k<128)
+					if(n < 20) n_threads_ideal = 1;
+					if(n < 64) n_threads_ideal = 4;
+					else	   n_threads_ideal = 8;
+				}
 			}
-		  }
+		}
 	}
 	else if( family == BLIS_GEMM && bli_obj_is_dcomplex(c))
 	{
@@ -885,13 +1476,18 @@ void bli_nthreads_optimum(
 	// for updating rntm
 	bli_rntm_set_num_threads_only( n_threads_opt, rntm );
 
+#ifdef PRINT_THREADING
+	printf( "bli_nthreads_optimum()\n" );
+	bli_rntm_print( rntm );
+#endif
+
 	return;
 }
 
 // Calculates the optimum number of threads along with the factorization
 // (ic, jc) using m, n, k dimensions. This function modifies only the local
-// copy of rntm with optimum threads. Since global rntm remains unchanged the
-// num_threads set by application is available in global_rntm data structure.
+// copy of rntm with optimum threads. Since tl_rntm remains unchanged the
+// num_threads set by application is available in tl_rntm data structure.
 err_t bli_smart_threading_sup
 				(
 				 obj_t*  a,
@@ -967,4 +1563,439 @@ err_t bli_smart_threading_sup
 	}
 	return ret_val;
 }
+
+/*
+	Functionality:
+	--------------
+	This function selects the ideal thread count for the L1 dscalv API based on
+	the architecture ID and the number of elements in the input vector.
+
+	Function signature
+	-------------------
+
+	This function takes the following arguments:
+
+	* 'arch_id' - Architecture ID of the system (copy of BLIS global arch id)
+	* 'n_elem' - Number of elements in the vector
+	* 'nt_ideal' - Output: ideal number of threads, or -1 for "no change"
+
+	The function has been made static to restrict its scope.
+
+	Exception
+	----------
+
+	1. For architectures other than Zen to Zen4, '*nt_ideal' is set to -1 (the
+	   function returns void); the higher layer is expected to handle this value.
+*/
+static void aocl_dscalv_dynamic
+     (
+       arch_t arch_id,
+       dim_t  n_elem,
+       dim_t* nt_ideal
+     )
+{
+
+	/*
+		Pick the AOCL dynamic logic based on the
+		architecture ID
+	*/
+	switch (arch_id)
+	{
+		case BLIS_ARCH_ZEN4:
+		case BLIS_ARCH_ZEN:
+		case BLIS_ARCH_ZEN2:
+		case BLIS_ARCH_ZEN3:
+
+			if ( n_elem <= 10000 )
+				*nt_ideal = 1;
+			else if (n_elem <= 20000)
+				*nt_ideal = 2;
+			else if (n_elem <= 50000)
+				*nt_ideal = 4;
+			else
+				*nt_ideal = 8;
+
+			break;
+
+		default:
+			/*
+				Without this default condition, compiler will throw
+				a warning saying other conditions are not handled
+			*/
+
+			/*
+				For other architectures, AOCL dynamic does not make any change
+			*/
+			*nt_ideal = -1;
+	}
+}
+
+/*
+	Functionality:
+	--------------
+	This function selects the ideal thread count for the L1 zdscalv API based on
+	the architecture ID and the number of elements in the input vector.
+
+	Function signature
+	-------------------
+
+	This function takes the following arguments:
+
+	* 'arch_id' - Architecture ID of the system (copy of BLIS global arch id)
+	* 'n_elem' - Number of elements in the vector
+	* 'nt_ideal' - Output: ideal number of threads, or -1 for "no change"
+
+	The function has been made static to restrict its scope.
+
+	Exception
+	----------
+
+	1. For architectures other than Zen to Zen4, '*nt_ideal' is set to -1 (the
+	   function returns void); the higher layer is expected to handle this value.
+*/
+static void aocl_zdscalv_dynamic
+     (
+       arch_t arch_id,
+       dim_t  n_elem,
+       dim_t* nt_ideal
+     )
+{
+
+	/*
+		Pick the AOCL dynamic logic based on the
+		architecture ID
+	*/
+	switch (arch_id)
+	{
+		case BLIS_ARCH_ZEN4:
+		case BLIS_ARCH_ZEN:
+		case BLIS_ARCH_ZEN2:
+		case BLIS_ARCH_ZEN3:
+
+			if ( n_elem <= 10000)
+				*nt_ideal = 1;
+			else if (n_elem <= 20000)
+				*nt_ideal = 4;
+			else if (n_elem <= 1000000)
+				*nt_ideal = 8;
+			else if (n_elem <= 2500000)
+				*nt_ideal = 12;
+			else if (n_elem <= 5000000)
+				*nt_ideal = 32;
+			else
+				*nt_ideal = 64;
+
+			break;
+
+		default:
+			/*
+				Without this default condition, compiler will throw
+				a warning saying other conditions are not handled
+			*/
+
+			/*
+				For other architectures, AOCL dynamic does not make any change
+			*/
+			*nt_ideal = -1;
+	}
+}
+
+/*
+	Functionality:
+	--------------
+	This function selects the ideal thread count for the L1 daxpyv API based on
+	the architecture ID and the number of elements in the input vector.
+
+	Function signature
+	-------------------
+
+	This function takes the following arguments:
+
+	* 'arch_id' - Architecture ID of the system (copy of BLIS global arch id)
+	* 'n_elem' - Number of elements in the vector
+	* 'nt_ideal' - Output: ideal number of threads, or -1 for "no change"
+
+	The function has been made static to restrict its scope.
+
+	Exception
+	----------
+
+	1. For architectures other than Zen to Zen4, and for n_elem > 2000000,
+	   '*nt_ideal' is set to -1; the higher layer is expected to handle this.
+*/
+static void aocl_daxpyv_dynamic
+     (
+       arch_t arch_id,
+       dim_t  n_elem,
+       dim_t* nt_ideal
+     )
+{
+	/*
+		Pick the AOCL dynamic logic based on the
+		architecture ID
+	*/
+	switch (arch_id)
+	{
+		case BLIS_ARCH_ZEN4:
+		case BLIS_ARCH_ZEN:
+		case BLIS_ARCH_ZEN2:
+		case BLIS_ARCH_ZEN3:
+
+			if ( n_elem <= 100 )
+				*nt_ideal = 1;
+			else if (n_elem <= 10000)
+				*nt_ideal = 2;
+			else if (n_elem <= 250000)
+				*nt_ideal = 8;
+			else if (n_elem <= 750000)
+				*nt_ideal = 16;
+			else if (n_elem <= 2000000)
+				*nt_ideal = 32;
+			else
+				// For sizes in this range, AOCL dynamic does not make any change
+				*nt_ideal = -1;
+
+			break;
+
+		default:
+			/*
+				Without this default condition, compiler will throw
+				a warning saying other conditions are not handled
+			*/
+
+			/*
+				For other architectures, AOCL dynamic does not make any change
+			*/
+			*nt_ideal = -1;
+	}
+}
+
+/*
+	Functionality:
+	--------------
+	This function decides the AOCL dynamic logic for L1 ddotv API based on the
+	architecture ID and size of the input variable.
+
+	Function signature
+	-------------------
+
+	This function takes the following input:
+
+	* 'arch_id' - Architecture ID of the system (copy of BLIS global arch id)
+	* 'n_elem' - Number of elements in the vector
+	* 'nt_ideal' - Ideal number of threads
+
+	The function has been made static to restrict its scope.
+
+	Exception
+	----------
+
+	1. For non-Zen architectures, 'nt_ideal' is set to -1. The expectation is that
+	   this is handled in the higher layer
+*/
+static void aocl_ddotv_dynamic
+     (
+       arch_t arch_id,
+       dim_t  n_elem,
+       dim_t* nt_ideal
+     )
+{
+	/*
+		Pick the AOCL dynamic logic based on the
+		architecture ID
+	*/
+	switch (arch_id)
+	{
+		case BLIS_ARCH_ZEN4:
+		case BLIS_ARCH_ZEN:
+		case BLIS_ARCH_ZEN2:
+		case BLIS_ARCH_ZEN3:
+
+			if ( n_elem <= 2500 )
+				*nt_ideal = 1;
+			else if (n_elem <= 5000)
+				*nt_ideal = 4;
+			else if (n_elem <= 15000)
+				*nt_ideal = 8;
+			else if (n_elem <= 40000)
+				*nt_ideal = 16;
+			else if (n_elem <= 200000)
+				*nt_ideal = 32;
+			else
+				// For sizes in this range, AOCL dynamic does not make any change
+				*nt_ideal = -1;
+
+			break;
+
+		default:
+			/*
+				Without this default condition, compiler will throw
+				a warning saying other conditions are not handled
+			*/
+
+			/*
+				For other architectures, AOCL dynamic does not make any change
+			*/
+			*nt_ideal = -1;
+	}
+}
+
 #endif // AOCL_DYNAMIC
+
+/*
+	Functionality:
+	--------------
+
+	This function does the following:
+	1. Reads the number of threads requested by the user from the rntm variable
+	2. Acts as the gateway to the AOCL dynamic logic if AOCL dynamic is enabled
+	   and alters the count of the number of threads accordingly
+
+	Function signature
+	-------------------
+
+	This function takes the following input:
+
+	* 'ker_id' - ID of kernel invoking this function
+	* 'data_type_a' - Datatype 1 of kernel
+	* 'data_type_b' - Datatype 2 of kernel
+	* 'arch_id' - Architecture ID of the system (copy of BLIS global arch id)
+	* 'n_elem' - Number of elements in the vector
+	* 'nt_ideal' - Ideal number of threads
+
+	Exception
+	----------
+
+	None
+*/
+void bli_nthreads_l1
+     (
+       l1vkr_t  ker_id,
+       num_t    data_type_a,
+       num_t    data_type_b,
+       arch_t   arch_id,
+       dim_t    n_elem,
+       dim_t*   nt_ideal
+     )
+{
+#ifdef AOCL_DYNAMIC
+	/*
+		This code section dispatches the AOCL dynamic logic kernel for
+		L1 APIs based on the kernel ID and the data type.
+	*/
+	// Function pointer to AOCL Dynamic logic kernel
+	void (*aocl_dynamic_func_l1)(arch_t, dim_t, dim_t* ) = NULL;
+
+	// Pick the aocl dynamic thread decision kernel based on the kernel ID
+	switch (ker_id)
+	{
+		case BLIS_SCALV_KER:
+
+			/*
+				When input data types do not match the call is from mixed precision
+			*/
+			if (data_type_a != data_type_b)
+			{
+				// Function for ZDSCALV
+				aocl_dynamic_func_l1 = aocl_zdscalv_dynamic;
+			}
+			else
+			{
+				// Function for DSCALV
+				aocl_dynamic_func_l1 = aocl_dscalv_dynamic;
+			}
+
+			break;
+
+		case BLIS_AXPYV_KER:
+
+			// Function for DAXPYV
+			aocl_dynamic_func_l1 = aocl_daxpyv_dynamic;
+
+			break;
+
+		case BLIS_DOTV_KER:
+
+			// Function for DDOTV
+			aocl_dynamic_func_l1 = aocl_ddotv_dynamic;
+
+			break;
+
+		default:
+			/*
+				For kernels that do not have AOCL dynamic logic,
+				use the number of threads requested by the user.
+			*/
+			*nt_ideal = -1;
+	}
+
+	/*
+		For APIs that do not have AOCL dynamic
+		logic, aocl_dynamic_func_l1 will be NULL.
+	*/
+	if( aocl_dynamic_func_l1 != NULL)
+	{
+		// Call the AOCL dynamic logic kernel
+		aocl_dynamic_func_l1
+		(
+			arch_id,
+			n_elem,
+			nt_ideal
+		);
+
+		if (*nt_ideal == 1)
+		{
+			// Return early when the number of threads is 1
+			return;
+		}
+	}
+
+#endif
+	// Local runtime object; filled in from the global settings below
+	rntm_t rntm_local;
+
+	// Initialize a local runtime with global settings.
+	bli_rntm_init_from_global(&rntm_local);
+
+	// Query the total number of threads from the rntm_t object.
+	dim_t nt_rntm = bli_rntm_num_threads(&rntm_local);
+
+	if (nt_rntm <= 0)
+	{
+		// nt is less than one if BLIS manual setting of parallelism
+		// has been used. Parallelism here will be product of values.
+		nt_rntm = bli_rntm_calc_num_threads(&rntm_local);
+	}
+
+#ifdef AOCL_DYNAMIC
+
+	// Calculate the actual number of threads that will be spawned
+	if (*nt_ideal != -1)
+	{
+		// The if block is executed for all Zen architectures
+		*nt_ideal = bli_min(nt_rntm, *nt_ideal);
+	}
+	else
+	{
+		/*
+			For non-Zen architectures and very large sizes,
+			spawn the actual number of threads requested
+		*/
+		*nt_ideal = nt_rntm;
+	}
+
+	/*
+	  When the number of elements to be processed is less
+	  than the number of threads, spawn only n_elem threads.
+	*/
+	if (n_elem < *nt_ideal)
+	{
+		*nt_ideal = n_elem;
+	}
+#else
+
+	// Calculate the actual number of threads that will be spawned
+	*nt_ideal = nt_rntm;
+
+#endif
+}
diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h
index c45184c57d..9105cef57d 100644
--- a/frame/base/bli_rntm.h
+++ b/frame/base/bli_rntm.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -37,6 +37,81 @@
 #ifndef BLIS_RNTM_H
 #define BLIS_RNTM_H
 
+// Define this to print information about threading in rntm structures.
+//#define PRINT_THREADING
+
+
+// Function prototypes
+
+BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
+
+BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
+     (
+       opid_t  l3_op,
+       side_t  side,
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       rntm_t* rntm
+     );
+
+void bli_rntm_set_ways_from_rntm
+     (
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       rntm_t* rntm
+     );
+
+void bli_rntm_set_ways_from_rntm_sup
+     (
+       dim_t   m,
+       dim_t   n,
+       dim_t   k,
+       rntm_t* rntm
+     );
+
+void bli_rntm_print
+     (
+       rntm_t* rntm
+     );
+
+dim_t bli_rntm_calc_num_threads_in
+     (
+       bszid_t* restrict bszid_cur,
+       rntm_t*  restrict rntm
+     );
+
+#ifdef AOCL_DYNAMIC
+void bli_nthreads_optimum
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       opid_t  family,
+       rntm_t* rntm
+     );
+
+err_t bli_smart_threading_sup
+     (
+       obj_t*  a,
+       obj_t*  b,
+       obj_t*  c,
+       opid_t  family,
+       rntm_t* rntm,
+       cntx_t* cntx
+     );
+#endif
+
+void bli_nthreads_l1
+     (
+       l1vkr_t ker_id,
+       num_t   data_type_a,
+       num_t   data_type_b,
+       arch_t  arch_id,
+       dim_t   n_elem,
+       dim_t*  nt_ideal
+     );
 
 // Runtime object type (defined in bli_type_defs.h)
 
@@ -248,6 +323,17 @@ BLIS_INLINE void bli_rntm_set_num_threads( dim_t nt, rntm_t* rntm )
 
 	// Set the individual ways of parallelism to default states.
 	bli_rntm_clear_ways_only( rntm );
+
+	// BLIS_NUM_THREADS env variable or BLIS API to set the
+	// number of threads is used. Setting the blis_mt flag to TRUE
+	// so that OMP API or OMP env variables will not be of effect
+	// going forward.
+	bli_rntm_set_blis_mt_only(TRUE, rntm);
+
+#ifdef PRINT_THREADING
+	printf( "bli_rntm_set_num_threads()\n" );
+	bli_rntm_print( rntm );
+#endif
 }
 
 BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir, rntm_t* rntm )
@@ -262,6 +348,17 @@ BLIS_INLINE void bli_rntm_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_
 
 	// Set the num_threads field to a default state.
 	bli_rntm_clear_num_threads_only( rntm );
+
+	// BLIS_NUM_THREADS env variable or BLIS API to set the
+	// number of threads is used. Setting the blis_mt flag to TRUE
+	// so that OMP API or OMP env variables will not be of effect
+	// going forward.
+	bli_rntm_set_blis_mt_only(TRUE, rntm);
+
+#ifdef PRINT_THREADING
+	printf( "bli_rntm_set_ways()\n" );
+	bli_rntm_print( rntm );
+#endif
 }
 
 BLIS_INLINE void bli_rntm_set_pack_a( bool pack_a, rntm_t* rntm )
@@ -322,6 +419,7 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm )
           .pack_a      = FALSE, \
           .pack_b      = FALSE, \
           .l3_sup      = TRUE, \
+          .blis_mt     = FALSE, \
           .sba_pool    = NULL, \
           .membrk      = NULL, \
         }  \
@@ -335,6 +433,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm )
 	bli_rntm_clear_pack_a( rntm );
 	bli_rntm_clear_pack_b( rntm );
 	bli_rntm_clear_l3_sup( rntm );
+	bli_rntm_set_blis_mt_only(FALSE, rntm);
 
 	bli_rntm_clear_sba_pool( rntm );
 	bli_rntm_clear_membrk( rntm );
@@ -360,67 +459,5 @@ BLIS_INLINE dim_t bli_rntm_calc_num_threads
 
 // -----------------------------------------------------------------------------
 
-// Function prototypes
-
-BLIS_EXPORT_BLIS void bli_rntm_init_from_global( rntm_t* rntm );
-
-BLIS_EXPORT_BLIS void bli_rntm_set_ways_for_op
-     (
-       opid_t  l3_op,
-       side_t  side,
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       rntm_t* rntm
-     );
-
-void bli_rntm_set_ways_from_rntm
-     (
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       rntm_t* rntm
-     );
-
-void bli_rntm_set_ways_from_rntm_sup
-     (
-       dim_t   m,
-       dim_t   n,
-       dim_t   k,
-       rntm_t* rntm
-     );
-
-void bli_rntm_print
-     (
-       rntm_t* rntm
-     );
-
-dim_t bli_rntm_calc_num_threads_in
-     (
-       bszid_t* restrict bszid_cur,
-       rntm_t*  restrict rntm
-     );
-
-#ifdef AOCL_DYNAMIC
-void bli_nthreads_optimum
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       opid_t  family,
-       rntm_t* rntm
-     );
-
-err_t bli_smart_threading_sup
-     (
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  c,
-       opid_t  family,
-       rntm_t* rntm,
-       cntx_t* cntx
-     );
-#endif
-
 #endif
 
diff --git a/frame/compat/bla_amax.c b/frame/compat/bla_amax.c
index b1cf77e7b8..855f32b43c 100644
--- a/frame/compat/bla_amax.c
+++ b/frame/compat/bla_amax.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,7 +41,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-f77_int PASTEF772(i,chx,blasname) \
+f77_int PASTEF772S(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -95,8 +95,17 @@ f77_int PASTEF772(i,chx,blasname) \
 \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     return f77_index; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+f77_int PASTEF772(i,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  return PASTEF772S(i,chx,blasname)( n, x, incx );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( amax, amaxv )
-#endif
diff --git a/frame/compat/bla_amax.h b/frame/compat/bla_amax.h
index 1f13715dc4..c47e60ec2e 100644
--- a/frame/compat/bla_amax.h
+++ b/frame/compat/bla_amax.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,13 +40,19 @@
 #undef  GENTPROT
 #define GENTPROT( ftype_x, chx, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS f77_int PASTEF772S(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( amax )
-#endif
 
diff --git a/frame/compat/bla_amax_amd.c b/frame/compat/bla_amax_amd.c
index 2f7c2d2491..a4d616f37a 100644
--- a/frame/compat/bla_amax_amd.c
+++ b/frame/compat/bla_amax_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,7 +41,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-f77_int PASTEF772(i,chx,blasname) \
+f77_int PASTEF772S(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -95,11 +95,20 @@ f77_int PASTEF772(i,chx,blasname) \
 \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     return f77_index; \
-}
-
-#ifdef BLIS_ENABLE_BLAS
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+f77_int PASTEF772(i,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  return PASTEF772S(i,chx,blasname)( n, x, incx );\
+} \
+)
 
-f77_int isamax_
+f77_int isamax_blis_impl
      (
        const f77_int* n,
        const float* x, const f77_int* incx
@@ -158,9 +167,9 @@ f77_int isamax_
         incx0 = ( inc_t )(*incx);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         cntx_t* cntx = bli_gks_query_cntx();
         samaxv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_AMAXV_KER, cntx );
@@ -197,8 +206,17 @@ f77_int isamax_
 
     return f77_index;
 }
-
-f77_int idamax_
+#ifdef BLIS_ENABLE_BLAS
+f77_int isamax_
+     (
+       const f77_int* n,
+       const float* x, const f77_int* incx
+     )
+{
+  return isamax_blis_impl( n, x, incx ); 
+}
+#endif
+f77_int idamax_blis_impl
      (
        const f77_int* n,
        const double* x, const f77_int* incx
@@ -213,27 +231,43 @@ f77_int idamax_
     gint_t   bli_index;
     f77_int  f77_index;
 
-    /* If the vector is empty, return an index of zero. This early check
-       is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
-       return 0, which ends up getting incremented to 1 (below) before
-       being returned, which is not what we want. */
+    /*
+      If the vector is empty, return an index of zero. This early check
+      is needed to emulate netlib BLAS. Without it, bli_?amaxv() will
+      return 0, which ends up getting incremented to 1 (below) before
+      being returned, which is not what we want.
+    */
     if ( *n < 1 || *incx <= 0 ) {
       AOCL_DTL_TRACE_EXIT_ERR(AOCL_DTL_LEVEL_TRACE_1, "idamax_: vector empty");
       return 0;
     }
 
+    /*
+      When the length of the vector is one it is going to be the element with
+      the maximum absolute value. This early return condition is defined in
+      the BLAS standard.
+    */
+    if(*n == 1)
+    {
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+      return 1;
+    }
+
     /* Initialize BLIS. */
-//  bli_init_auto();
+    //  bli_init_auto();
 
     /* Convert/typecast negative values of n to zero. */
-    if ( *n < 0 ) n0 = ( dim_t )0;
+    if ( *n < 0 )     n0 = ( dim_t )0;
     else              n0 = ( dim_t )(*n);
 
-    /* If the input increments are negative, adjust the pointers so we can
-       use positive increments instead. */
+    /*
+      If the input increments are negative, adjust the pointers so we can
+      use positive increments instead.
+    */
     if ( *incx < 0 )
     {
-        /* The semantics of negative stride in BLAS are that the vector
+      /*
+        The semantics of negative stride in BLAS are that the vector
         operand be traversed in reverse order. (Another way to think
         of this is that negative strides effectively reverse the order
         of the vector, but without any explicit data movements.) This
@@ -244,10 +278,11 @@ f77_int idamax_
         used *relative* to the vector address as it is given. Thus, in
         BLIS, if this backwards traversal is desired, the caller *must*
         pass in the address to the (n-1)th (i.e., the bottom-most or
-        right-most) element along with a negative stride. */
+        right-most) element along with a negative stride.
+      */
 
-        x0    = ((double*)x) + (n0-1)*(-*incx);
-        incx0 = ( inc_t )(*incx);
+      x0    = ((double*)x) + (n0-1)*(-*incx);
+      incx0 = ( inc_t )(*incx);
 
     }
     else
@@ -256,44 +291,65 @@ f77_int idamax_
         incx0 = ( inc_t )(*incx);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
-    {
-        cntx_t* cntx = bli_gks_query_cntx();
-        damaxv_ker_ft f = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_AMAXV_KER, cntx );
-        /* Call BLIS kernel */
-        f
-        (
-            n0,
-            x0, incx0,
-            &bli_index,
-            NULL
-        );
-    }
-    else
+    cntx_t* cntx = NULL;
+
+    // Query the architecture ID
+    arch_t id = bli_arch_query_id();
+
+    damaxv_ker_ft amaxv_fun_ptr;
+
+    // Pick the kernel based on the architecture ID
+    switch (id)
     {
-      PASTEMAC2(d,amaxv,BLIS_TAPI_EX_SUF)
-      (
-        n0,
-        x0, incx0,
-        &bli_index,
-        NULL,
-        NULL
-      );
+      case BLIS_ARCH_ZEN4:
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+
+          // AVX2 Kernel
+          amaxv_fun_ptr = bli_damaxv_zen_int;
+          break;
+
+      default:
+
+          // Query the context
+          cntx = bli_gks_query_cntx();
+
+          // Query the function pointer using the context
+          amaxv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_AMAXV_KER, cntx);
     }
 
-    /* Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
-       index. Also, if the BLAS integer size differs from the BLIS
-       integer size, that typecast occurs here. */
+    // Call BLIS kernel based on the function pointer
+    amaxv_fun_ptr
+    (
+      n0,
+      x0, incx0,
+      &bli_index,
+      cntx
+    );
+
+    /*
+      Convert zero-based BLIS (C) index to one-based BLAS (Fortran)
+      index. Also, if the BLAS integer size differs from the BLIS
+      integer size, that typecast occurs here.
+    */
     f77_index = bli_index + 1;
 
     /* Finalize BLIS. */
-//    bli_finalize_auto();
+    // bli_finalize_auto();
+
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
     return f77_index;
 }
-
+#ifdef BLIS_ENABLE_BLAS
+f77_int idamax_
+     (
+       const f77_int* n,
+       const double* x, const f77_int* incx
+     )
+{
+  return idamax_blis_impl( n, x, incx ); 
+}
+#endif
 INSERT_GENTFUNC_BLAS_CZ( amax, amaxv )
 
-#endif
diff --git a/frame/compat/bla_amin.c b/frame/compat/bla_amin.c
index 7930fc1854..9ea18780b9 100644
--- a/frame/compat/bla_amin.c
+++ b/frame/compat/bla_amin.c
@@ -4,8 +4,8 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -41,7 +41,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-f77_int PASTEF772(i,chx,blasname) \
+f77_int PASTEF772S(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -88,8 +88,17 @@ f77_int PASTEF772(i,chx,blasname) \
     bli_finalize_auto(); \
 \
     return f77_index; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+f77_int PASTEF772(i,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  return PASTEF772S(i,chx,blasname)( n, x, incx );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
-    INSERT_GENTFUNC_BLAS( amin, aminv )
-#endif
+INSERT_GENTFUNC_BLAS( amin, aminv )
diff --git a/frame/compat/bla_amin.h b/frame/compat/bla_amin.h
index ebbed8262b..2d24e6dcba 100644
--- a/frame/compat/bla_amin.h
+++ b/frame/compat/bla_amin.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,12 +39,18 @@
 #undef  GENTPROT
 #define GENTPROT( ftype_x, chx, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS f77_int PASTEF772(i,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS f77_int PASTEF772S(i,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( amin )
-#endif
\ No newline at end of file
diff --git a/frame/compat/bla_asum.c b/frame/compat/bla_asum.c
index c104be96bd..0263dc46a2 100644
--- a/frame/compat/bla_asum.c
+++ b/frame/compat/bla_asum.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,7 +41,7 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-ftype_r PASTEF772(chr,chx,blasname) \
+ftype_r PASTEF772S(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -79,9 +79,18 @@ ftype_r PASTEF772(chr,chx,blasname) \
     bli_finalize_auto(); \
 \
     return asum; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+ftype_r PASTEF772(chr,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  return PASTEF772S(chr,chx,blasname)( n, x, incx );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCR2_BLAS( asum, asumv )
-#endif
 
diff --git a/frame/compat/bla_asum.h b/frame/compat/bla_asum.h
index a9ef27a036..774f53f220 100644
--- a/frame/compat/bla_asum.h
+++ b/frame/compat/bla_asum.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,13 +40,19 @@
 #undef  GENTPROTR2
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS ftype_r PASTEF772S(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTR2_BLAS( asum )
-#endif
 
diff --git a/frame/compat/bla_axpby.c b/frame/compat/bla_axpby.c
index be53ec480b..2c4fd140fa 100644
--- a/frame/compat/bla_axpby.c
+++ b/frame/compat/bla_axpby.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -41,7 +41,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   alpha, \
@@ -85,8 +85,21 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   beta, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname) \
+     ( n, alpha, x, incx, beta, y, incy ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( axpby, axpbyv )
-#endif
diff --git a/frame/compat/bla_axpby.h b/frame/compat/bla_axpby.h
index ab2952be98..3ac58f830e 100644
--- a/frame/compat/bla_axpby.h
+++ b/frame/compat/bla_axpby.h
@@ -4,8 +4,8 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +39,18 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   beta, \
+             ftype*   y, const f77_int* incy \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   alpha, \
@@ -47,8 +58,6 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
        const ftype*   beta, \
              ftype*   y, const f77_int* incy \
      );
-
-#ifdef BLIS_ENABLE_BLAS
+     
 INSERT_GENTPROT_BLAS( axpby )
-#endif
 
diff --git a/frame/compat/bla_axpy.c b/frame/compat/bla_axpy.c
index 1a30f417b3..4c084f0808 100644
--- a/frame/compat/bla_axpy.c
+++ b/frame/compat/bla_axpy.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   alpha, \
@@ -83,10 +83,19 @@ void PASTEF77(ch,blasname) \
      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
         /* Finalize BLIS. */ \
      bli_finalize_auto();  \
-}
-
-#ifdef BLIS_ENABLE_BLAS
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname)( n, alpha, x, incx, y, incy ) ; \
+} \
+)
 
 INSERT_GENTFUNC_BLAS( axpy, axpyv )
-
-#endif
diff --git a/frame/compat/bla_axpy.h b/frame/compat/bla_axpy.h
index 294a385c78..3f33134e62 100644
--- a/frame/compat/bla_axpy.h
+++ b/frame/compat/bla_axpy.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,17 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   alpha, \
@@ -47,7 +58,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*   y, const f77_int* incy \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( axpy )
-#endif
 
diff --git a/frame/compat/bla_axpy_amd.c b/frame/compat/bla_axpy_amd.c
index 8a9f0280c6..0e24d7d4a5 100644
--- a/frame/compat/bla_axpy_amd.c
+++ b/frame/compat/bla_axpy_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,6 +35,19 @@
 
 #include "blis.h"
 
+/*
+  Early return conditions
+  ------------------------
+
+  1. When n <= 0 where n is the length of the vector passed
+  2. When alpha == 0 where alpha is the scalar value by which the vector is
+     to be scaled
+
+  NaN propagation expectation
+  --------------------------
+
+  1. When alpha == NaN - Propagate the NaN to the vector
+*/
 
 //
 // Define BLAS-to-BLIS interfaces.
@@ -42,7 +55,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   alpha, \
@@ -83,11 +96,24 @@ void PASTEF77(ch,blasname) \
      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
         /* Finalize BLIS. */ \
      bli_finalize_auto();  \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname)( n, alpha, x, incx, y, incy ) ; \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 
-void saxpy_
+
+void saxpy_blis_impl
 (
  const f77_int* n,
  const float*   alpha,
@@ -95,91 +121,137 @@ void saxpy_
  float*   y, const f77_int* incy
  )
 {
-  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
-  AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float*)alpha, *incx, *incy)
-  dim_t  n0;
-  float* x0;
-  float* y0;
-  inc_t  incx0;
-  inc_t  incy0;
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'S', *n, (float *)alpha, *incx, *incy)
 
-  /* Initialize BLIS. */
-  //    bli_init_auto();
+    /*
+      BLAS exception: If the vector dimension is zero or negative, or if alpha is zero, return early.
+    */
+    if ((*n) <= 0 || PASTEMAC(s, eq0)(*alpha))
+    {
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 
-  /* Convert/typecast negative values of n to zero. */
-  if ( *n < 0 ) n0 = ( dim_t )0;
-  else              n0 = ( dim_t )(*n);
+      return;
+    }
 
-  /* If the input increments are negative, adjust the pointers so we can
-     use positive increments instead. */
-  if ( *incx < 0 )
+    dim_t n_elem;
+    float *x0;
+    float *y0;
+    inc_t incx0;
+    inc_t incy0;
+
+    /* Initialize BLIS. */
+    //    bli_init_auto();
+
+    /* n > 0 here (n <= 0 returned early above); the negative-n branch below is unreachable. */
+    if (*n < 0)
+      n_elem = (dim_t)0;
+    else
+      n_elem = (dim_t)(*n);
+
+    /*
+      If the input increments are negative, adjust the pointers so we can
+      use positive increments instead.
+    */
+    if (*incx < 0)
     {
       /* The semantics of negative stride in BLAS are that the vector
-         operand be traversed in reverse order. (Another way to think
-         of this is that negative strides effectively reverse the order
-         of the vector, but without any explicit data movements.) This
-         is also how BLIS interprets negative strides. The differences
-         is that with BLAS, the caller *always* passes in the 0th (i.e.,
-         top-most or left-most) element of the vector, even when the
-         stride is negative. By contrast, in BLIS, negative strides are
-         used *relative* to the vector address as it is given. Thus, in
-         BLIS, if this backwards traversal is desired, the caller *must*
-         pass in the address to the (n-1)th (i.e., the bottom-most or
-         right-most) element along with a negative stride. */
-      x0    = ((float*)x) + (n0-1)*(-*incx);
-      incx0 = ( inc_t )(*incx);
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The difference
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+      x0 = ((float *)x) + (n_elem - 1) * (-*incx);
+      incx0 = (inc_t)(*incx);
     }
-  else
+    else
     {
       x0    = ((float*)x);
       incx0 = ( inc_t )(*incx);
     }
-  if ( *incy < 0 )
+    if ( *incy < 0 )
     {
-      y0    = ((float*)y) + (n0-1)*(-*incy);
+      y0    = ((float*)y) + (n_elem-1)*(-*incy);
       incy0 = ( inc_t )(*incy);
     }
-  else
+    else
     {
       y0    = ((float*)y);
       incy0 = ( inc_t )(*incy);
     }
 
-  // This function is invoked on all architectures including ‘generic’.
-  // Non-AVX platforms will use the kernels derived from the context.
-  if (bli_cpuid_is_avx_supported() == TRUE)
-  {
-      bli_saxpyv_zen_int10
-      (
-        BLIS_NO_CONJUGATE,
-        n0,
-        (float*)alpha,
-        x0, incx0,
-        y0, incy0,
-        NULL
-      );
+    cntx_t *cntx = NULL;
 
-  }
-  else
-  {
-      PASTEMAC2(s,axpyv,BLIS_TAPI_EX_SUF)
-      (
-        BLIS_NO_CONJUGATE,
-        n0,
-        (float*)alpha,
-        x0, incx0,
-        y0, incy0,
-        NULL,
-        NULL
-      );
+    // Query the architecture ID
+    arch_t id = bli_arch_query_id();
 
-  }
-  /* Finalize BLIS. */
-  //    bli_finalize_auto();
-  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+    /*
+      Function pointer declaration for the function
+      that will be used by this API
+    */
+    saxpyv_ker_ft axpyv_ker_ptr; // SAXPYV
+
+    // Pick the kernel based on the architecture ID
+    switch (id)
+    {
+      case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+        axpyv_ker_ptr = bli_saxpyv_zen_int_avx512;
+
+        break;
+#endif
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+        axpyv_ker_ptr = bli_saxpyv_zen_int10;
+
+        break;
+      default:
+
+        // For non-Zen architectures, query the context
+        cntx = bli_gks_query_cntx();
+
+        // Query the context for the kernel function pointers for saxpyv
+        axpyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_AXPYV_KER, cntx);
+    }
+
+    // Call the function based on the function pointer assigned above
+    axpyv_ker_ptr
+    (
+      BLIS_NO_CONJUGATE,
+      n_elem,
+      (float *)alpha,
+      x0, incx0,
+      y0, incy0,
+      cntx
+    );
+
+    /* Finalize BLIS. */
+    //    bli_finalize_auto();
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 }
 
-void daxpy_
+#ifdef BLIS_ENABLE_BLAS
+void saxpy_
+(
+ const f77_int* n,
+ const float*   alpha,
+ const float*   x, const f77_int* incx,
+ float*   y, const f77_int* incy
+ )
+{
+  saxpy_blis_impl( n, alpha, x, incx, y, incy ) ; 
+}
+#endif
+
+void daxpy_blis_impl
 (
  const f77_int* n,
  const double*   alpha,
@@ -187,24 +259,31 @@ void daxpy_
  double*   y, const f77_int* incy
  )
 {
-  dim_t  n0;
-  double* x0;
-  double* y0;
-  inc_t  incx0;
-  inc_t  incy0;
-
-  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
-  AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
-  /* Initialize BLIS. */
-  //    bli_init_auto();
-
-  /* Convert/typecast negative values of n to zero. */
-  if ( *n < 0 ) n0 = ( dim_t )0;
-  else              n0 = ( dim_t )(*n);
+    dim_t  n_elem;
+    double* x0;
+    double* y0;
+    inc_t  incx0;
+    inc_t  incy0;
+
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_AXPY_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, (double*)alpha, *incx, *incy)
+    /* Initialize BLIS. */
+    // bli_init_auto();
+
+    /* Convert/typecast negative values of n to zero. */
+    if ( *n < 0 ) n_elem = ( dim_t )0;
+    else          n_elem = ( dim_t )(*n);
+
+    // BLAS exception to return early when n <= 0 or alpha is 0.0
+    if(*n <= 0 || bli_deq0(*alpha))
+    {
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+      return;
+    }
 
-  /* If the input increments are negative, adjust the pointers so we can
-     use positive increments instead. */
-  if ( *incx < 0 )
+    /* If the input increments are negative, adjust the pointers so we can
+      use positive increments instead. */
+    if ( *incx < 0 )
     {
       /* The semantics of negative stride in BLAS are that the vector
          operand be traversed in reverse order. (Another way to think
@@ -218,61 +297,157 @@ void daxpy_
          BLIS, if this backwards traversal is desired, the caller *must*
          pass in the address to the (n-1)th (i.e., the bottom-most or
          right-most) element along with a negative stride. */
-      x0    = ((double*)x) + (n0-1)*(-*incx);
+      x0    = ( (double*)x ) + ( n_elem - 1 ) * ( - (*incx) );
       incx0 = ( inc_t )(*incx);
     }
-  else
+    else
     {
       x0    = ((double*)x);
       incx0 = ( inc_t )(*incx);
     }
-  if ( *incy < 0 )
+    if ( *incy < 0 )
     {
-      y0    = ((double*)y) + (n0-1)*(-*incy);
+      y0    = ( (double*) y ) + ( n_elem - 1 )*( - (*incy) );
       incy0 = ( inc_t )(*incy);
     }
-  else
+    else
     {
       y0    = ((double*)y);
       incy0 = ( inc_t )(*incy);
     }
 
-  // This function is invoked on all architectures including ‘generic’.
-  // Non-AVX platforms will use the kernels derived from the context.
-  if (bli_cpuid_is_avx_supported() == TRUE)
-  {
-      bli_daxpyv_zen_int10
-      (
-        BLIS_NO_CONJUGATE,
-        n0,
-        (double*)alpha,
-        x0, incx0,
-        y0, incy0,
-        NULL
-      );
+    // Definition of function pointer
+    daxpyv_ker_ft axpyv_ker_ptr;
 
-  }
-  else
-  {
-      PASTEMAC2(d,axpyv,BLIS_TAPI_EX_SUF)
-      (
-        BLIS_NO_CONJUGATE,
-        n0,
-        (double*)alpha,
-        x0, incx0,
-        y0, incy0,
-        NULL,
-        NULL
-      );
+    cntx_t *cntx = NULL;
 
-  }
+    // Query the architecture ID
+    arch_t arch_id_local = bli_arch_query_id();
 
-  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-  /* Finalize BLIS. */
-  //    bli_finalize_auto();
+    // Pick the kernel based on the architecture ID
+    switch (arch_id_local)
+    {
+      case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+        axpyv_ker_ptr = bli_daxpyv_zen_int_avx512;
+
+        break;
+#endif
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+
+          // AVX2 Kernel
+          axpyv_ker_ptr = bli_daxpyv_zen_int10;
+          break;
+
+      default:
+
+          // Query the context
+          cntx = bli_gks_query_cntx();
+
+          // Query the function pointer using the context
+          axpyv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_AXPYV_KER, cntx);
+    }
+
+#ifdef BLIS_ENABLE_OPENMP
+    /*
+      Initialize the number of threads to one
+      to avoid compiler warnings
+    */
+    dim_t nt = 1;
+
+    /*
+      For the given problem size and architecture, the function
+      returns the optimum number of threads with AOCL dynamic enabled
+      else it returns the number of threads requested by the user.
+    */
+    bli_nthreads_l1
+    (
+      BLIS_AXPYV_KER,
+      BLIS_DOUBLE,
+      BLIS_DOUBLE,
+      arch_id_local,
+      n_elem,
+      &nt
+    );
+
+    if (nt == 1)
+    {
+#endif
+        axpyv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          n_elem,
+          (double *)alpha,
+          x0, incx0,
+          y0, incy0,
+          cntx
+        );
+
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+
+        return;
+#ifdef BLIS_ENABLE_OPENMP
+    }
+
+    _Pragma("omp parallel num_threads(nt)")
+    {
+        dim_t start, length;
+
+        // Get the thread ID
+        dim_t thread_id = omp_get_thread_num();
+
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
+        bli_thread_vector_partition
+        (
+          n_elem,
+          nt_use,
+          &start, &length,
+          thread_id
+        );
+
+        // Adjust the local pointer for computation
+        double *x_thread_local = x0 + (start * incx0);
+        double *y_thread_local = y0 + (start * incy0);
+
+        // Invoke the function based on the kernel function pointer
+        axpyv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          length,
+          (double *)alpha,
+          x_thread_local, incx0,
+          y_thread_local, incy0,
+          cntx
+        );
+    }
+#endif // BLIS_ENABLE_OPENMP
+
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+    /* Finalize BLIS. */
+    // bli_finalize_auto();
 }
 
-void caxpy_
+#ifdef BLIS_ENABLE_BLAS
+void daxpy_
+(
+ const f77_int* n,
+ const double*   alpha,
+ const double*   x, const f77_int* incx,
+ double*   y, const f77_int* incy
+ )
+{
+  daxpy_blis_impl( n, alpha, x, incx, y, incy ) ; 
+}
+#endif
+void caxpy_blis_impl
 (
  const f77_int* n,
  const scomplex*   alpha,
@@ -330,9 +505,9 @@ void caxpy_
       incy0 = ( inc_t )(*incy);
     }
 
-  // This function is invoked on all architectures including ‘generic’.
-  // Non-AVX platforms will use the kernels derived from the context.
-  if (bli_cpuid_is_avx_supported() == TRUE)
+  // This function is invoked on all architectures including 'generic'.
+  // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+  if (bli_cpuid_is_avx2fma3_supported() == TRUE)
   {
       bli_caxpyv_zen_int5
       (
@@ -363,8 +538,19 @@ void caxpy_
   /* Finalize BLIS. */
   //    bli_finalize_auto();
 }
-
-void zaxpy_
+#ifdef BLIS_ENABLE_BLAS
+void caxpy_
+(
+ const f77_int* n,
+ const scomplex*   alpha,
+ const scomplex*   x, const f77_int* incx,
+ scomplex*   y, const f77_int* incy
+ )
+{
+  caxpy_blis_impl( n, alpha, x, incx, y, incy ) ; 
+}
+#endif
+void zaxpy_blis_impl
 (
  const f77_int* n,
  const dcomplex*   alpha,
@@ -423,9 +609,9 @@ void zaxpy_
       incy0 = ( inc_t )(*incy);
     }
 
-  // This function is invoked on all architectures including ‘generic’.
-  // Non-AVX platforms will use the kernels derived from the context.
-  if (bli_cpuid_is_avx_supported() == TRUE)
+  // This function is invoked on all architectures including 'generic'.
+  // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+  if (bli_cpuid_is_avx2fma3_supported() == TRUE)
   {
       bli_zaxpyv_zen_int5
       (
@@ -456,7 +642,17 @@ void zaxpy_
   /* Finalize BLIS. */
   //    bli_finalize_auto();
 }
-
+#ifdef BLIS_ENABLE_BLAS
+void zaxpy_
+(
+ const f77_int* n,
+ const dcomplex*   alpha,
+ const dcomplex*   x, const f77_int* incx,
+ dcomplex*   y, const f77_int* incy
+ )
+{
+  zaxpy_blis_impl( n, alpha, x, incx, y, incy ) ; 
+}
 
 
 #endif
diff --git a/frame/compat/bla_copy.c b/frame/compat/bla_copy.c
index 74baba689c..a10ab28a21 100644
--- a/frame/compat/bla_copy.c
+++ b/frame/compat/bla_copy.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -85,8 +85,18 @@ void PASTEF77(ch,blasname) \
 \
 	   /* Finalize BLIS. */ \
 	   bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname)( n, x, incx, y, incy ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS(copy, copyv)
-#endif
diff --git a/frame/compat/bla_copy.h b/frame/compat/bla_copy.h
index 679017b19d..d05d977383 100644
--- a/frame/compat/bla_copy.h
+++ b/frame/compat/bla_copy.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,14 +40,21 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
              ftype*   y, const f77_int* incy \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( copy )
-#endif
 
diff --git a/frame/compat/bla_copy_amd.c b/frame/compat/bla_copy_amd.c
index 8dc4d5287c..d40712321a 100644
--- a/frame/compat/bla_copy_amd.c
+++ b/frame/compat/bla_copy_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -85,11 +85,21 @@ void PASTEF77(ch,blasname) \
 \
 	   /* Finalize BLIS. */ \
 	   bli_finalize_auto(); \
-}
-
-#ifdef BLIS_ENABLE_BLAS
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+             ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname)( n, x, incx, y, incy ); \
+} \
+)
 
-void scopy_
+void scopy_blis_impl
 (
 	const f77_int* n,
 	const float*   x, const f77_int* incx,
@@ -152,9 +162,9 @@ void scopy_
 		incy0 = (inc_t)(*incy);
 	}
 
-	// This function is invoked on all architectures including ‘generic’.
-	// Non-AVX platforms will use the kernels derived from the context.
-	if (bli_cpuid_is_avx_supported() == TRUE)
+	// This function is invoked on all architectures including 'generic'.
+	// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+	if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
 		/* Call BLIS kernel */
 		bli_scopyv_zen_int
@@ -183,8 +193,18 @@ void scopy_
 	/* Finalize BLIS. */
 //    bli_finalize_auto();
 }
-
-void dcopy_
+#ifdef BLIS_ENABLE_BLAS
+void scopy_
+(
+	const f77_int* n,
+	const float*   x, const f77_int* incx,
+	float*   y, const f77_int* incy
+)
+{
+  scopy_blis_impl( n, x, incx, y, incy );
+}
+#endif
+void dcopy_blis_impl
 (
 	const f77_int* n,
 	const double*   x, const f77_int* incx,
@@ -247,9 +267,9 @@ void dcopy_
 		incy0 = (inc_t)(*incy);
 	}
 
-	// This function is invoked on all architectures including ‘generic’.
-	// Non-AVX platforms will use the kernels derived from the context.
-	if (bli_cpuid_is_avx_supported() == TRUE)
+	// This function is invoked on all architectures including 'generic'.
+	// Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+	if (bli_cpuid_is_avx2fma3_supported() == TRUE)
 	{
 		/* Call BLIS kernel */
 		bli_dcopyv_zen_int
@@ -279,7 +299,16 @@ void dcopy_
 	/* Finalize BLIS. */
 //    bli_finalize_auto();
 }
-
+#ifdef BLIS_ENABLE_BLAS
+void dcopy_
+(
+	const f77_int* n,
+	const double*   x, const f77_int* incx,
+	double*   y, const f77_int* incy
+)
+{
+  dcopy_blis_impl( n, x, incx, y, incy );
+}
+#endif
 INSERT_GENTFUNC_BLAS_CZ(copy, copyv)
 
-#endif
diff --git a/frame/compat/bla_dot.c b/frame/compat/bla_dot.c
index 3c4d8c538f..19ae29ccda 100644
--- a/frame/compat/bla_dot.c
+++ b/frame/compat/bla_dot.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-ftype PASTEF772(ch,blasname,chc) \
+ftype PASTEF772S(ch,blasname,chc) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -87,12 +87,22 @@ ftype PASTEF772(ch,blasname,chc) \
     bli_finalize_auto(); \
 \
     return rho; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+ftype PASTEF772(ch,blasname,chc) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  return PASTEF772S(ch,blasname,chc)( n, x, incx, y, incy );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCDOTR_BLAS( dot, dotv )
 
-#ifdef BLIS_ENABLE_BLAS
 #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
 INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
 #else
@@ -100,7 +110,7 @@ INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF772(ch,blasname,chc) \
+void PASTEF772S(ch,blasname,chc) \
      ( \
        ftype*         rhop, \
        const f77_int* n, \
@@ -146,18 +156,30 @@ void PASTEF772(ch,blasname,chc) \
         bli_finalize_auto(); \
 \
         *rhop = rho; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF772(ch,blasname,chc) \
+     ( \
+       ftype*         rhop, \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF772S(ch,blasname,chc)( rhop, n, x, incx, y, incy );\
+} \
+)
 
 INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
 #endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
-#endif // BLIS_ENABLE_BLAS
 
 
 // -- "Black sheep" dot product function definitions --
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in single precision.
-float PASTEF77(sd,sdot)
+float PASTEF77S(sd,sdot)
      (
        const f77_int* n,
        const float*   sb,
@@ -176,10 +198,22 @@ float PASTEF77(sd,sdot)
              )
            );
 }
+#ifdef BLIS_ENABLE_BLAS
+float PASTEF77(sd,sdot)
+     (
+       const f77_int* n,
+       const float*   sb,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy
+     )
+{
+  return PASTEF77S(sd,sdot)( n, sb, x, incx, y, incy );
+}
+#endif // BLIS_ENABLE_BLAS
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in double precision.
-double PASTEF77(d,sdot)
+double PASTEF77S(d,sdot)
      (
        const f77_int* n,
        const float*   x, const f77_int* incx,
@@ -223,5 +257,14 @@ double PASTEF77(d,sdot)
 
     return rho;
 }
-
+#ifdef BLIS_ENABLE_BLAS
+double PASTEF77(d,sdot)
+     (
+       const f77_int* n,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy
+     )
+{
+  return PASTEF77S(d,sdot)( n, x, incx, y, incy );
+}
 #endif // BLIS_ENABLE_BLAS
diff --git a/frame/compat/bla_dot.h b/frame/compat/bla_dot.h
index 16bc3f97cc..ef6ae9b70d 100644
--- a/frame/compat/bla_dot.h
+++ b/frame/compat/bla_dot.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -31,7 +32,6 @@
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
-#ifdef BLIS_ENABLE_BLAS
 
 //
 // Prototype BLAS-to-BLIS interfaces.
@@ -39,7 +39,16 @@
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, ch, chc, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS ftype PASTEF772(ch,blasname,chc) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS ftype PASTEF772S(ch,blasname,chc) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -59,7 +68,17 @@ INSERT_GENTPROTDOTC_BLAS( dot )
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, ch, chc, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF772(ch,blasname,chc) \
+     ( \
+       ftype*         rhop, \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF772S(ch,blasname,chc) \
      ( \
        ftype*         rhop, \
        const f77_int* n, \
@@ -73,6 +92,7 @@ INSERT_GENTPROTDOTC_BLAS( dot )
 
 // -- "Black sheep" dot product function prototypes --
 
+#ifdef BLIS_ENABLE_BLAS
 BLIS_EXPORT_BLAS float PASTEF77(sd,sdot)
      (
        const f77_int* n,
@@ -80,12 +100,26 @@ BLIS_EXPORT_BLAS float PASTEF77(sd,sdot)
        const float*   x, const f77_int* incx,
        const float*   y, const f77_int* incy
      );
-
+#endif
+BLIS_EXPORT_BLAS float PASTEF77S(sd,sdot)
+     (
+       const f77_int* n,
+       const float*   sb,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy
+     );
+     
+#ifdef BLIS_ENABLE_BLAS
 BLIS_EXPORT_BLAS double PASTEF77(d,sdot)
      (
          const f77_int* n,
          const float*   x, const f77_int* incx,
          const float*   y, const f77_int* incy
      );
-
 #endif
+BLIS_EXPORT_BLAS double PASTEF77S(d,sdot)
+     (
+         const f77_int* n,
+         const float*   x, const f77_int* incx,
+         const float*   y, const f77_int* incy
+     );
diff --git a/frame/compat/bla_dot_amd.c b/frame/compat/bla_dot_amd.c
index 0cdaa6535b..0b6651d8fc 100644
--- a/frame/compat/bla_dot_amd.c
+++ b/frame/compat/bla_dot_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,6 +35,17 @@
 
 #include "blis.h"
 
+/*
+  Early return conditions
+  ------------------------
+
+  1. When n <= 0 where n is the length of the vector passed
+
+  NaN propagation expectation
+  --------------------------
+
+  1. Always propagate
+*/
 
 //
 // Define BLAS-to-BLIS interfaces.
@@ -42,7 +53,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-ftype PASTEF772(ch,blasname,chc) \
+ftype PASTEF772S(ch,blasname,chc) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -87,10 +98,21 @@ ftype PASTEF772(ch,blasname,chc) \
     bli_finalize_auto(); \
 \
     return rho; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+ftype PASTEF772(ch,blasname,chc) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  return PASTEF772S(ch,blasname,chc)( n, x, incx, y, incy );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
-float sdot_
+float sdot_blis_impl
      (
        const f77_int* n,
        const float*   x, const f77_int* incx,
@@ -107,11 +129,20 @@ float sdot_
     float  rho;
 
     /* Initialize BLIS. */
-//  bli_init_auto();
+    //  bli_init_auto();
 
-    /* Convert/typecast negative values of n to zero. */
-    if ( *n < 0 ) n0 = ( dim_t )0;
-    else              n0 = ( dim_t )(*n);
+    // If the vector dimension is less than or equal to zero, return.
+    if (*n <= 0)
+    {
+      rho = 0.0f;
+
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+      return rho;
+    }
+    else
+    {
+      n0 = ( dim_t )(*n);
+    }
 
     /* If the input increments are negative, adjust the pointers so we can
        use positive increments instead. */
@@ -153,45 +184,73 @@ float sdot_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
-    {
-        /* Call BLIS kernel. */
-        bli_sdotv_zen_int10
-        (
-        BLIS_NO_CONJUGATE,
-        BLIS_NO_CONJUGATE,
-        n0,
-        x0, incx0,
-        y0, incy0,
-        &rho,
-        NULL
-        );
-    }
-    else
+    cntx_t *cntx = NULL;
+
+    // Query the architecture ID
+    arch_t arch_id = bli_arch_query_id();
+
+    /*
+      Function pointer declaration for the function
+      that will be used by this API
+    */
+    sdotv_ker_ft dotv_ker_ptr; // SDOTV
+
+    // Pick the kernel based on the architecture ID
+    switch (arch_id)
     {
-        /* Call BLIS interface. */
-        PASTEMAC2(s,dotv,BLIS_TAPI_EX_SUF)
-        (
-        BLIS_NO_CONJUGATE,
-        BLIS_NO_CONJUGATE,
-        n0,
-        x0, incx0,
-        y0, incy0,
-        &rho,
-        NULL,
-        NULL
-        );
+        case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+
+            // AVX-512 Kernel
+            dotv_ker_ptr = bli_sdotv_zen_int_avx512;
+
+        break;
+#endif
+        case BLIS_ARCH_ZEN:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN3:
+
+            // AVX-2 Kernel
+            dotv_ker_ptr = bli_sdotv_zen_int10;
+
+            break;
+        default:
+
+            // For non-Zen architectures, query the context
+            cntx = bli_gks_query_cntx();
+
+            // Query the context for the kernel function pointers for sdotv
+            dotv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_DOTV_KER, cntx);
     }
 
+    dotv_ker_ptr
+    (
+      BLIS_NO_CONJUGATE,
+      BLIS_NO_CONJUGATE,
+      n0,
+      x0, incx0,
+      y0, incy0,
+      &rho,
+      cntx
+    );
+
     /* Finalize BLIS. */
-//  bli_finalize_auto();
+    //  bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
     return rho;
 }
-
-double ddot_
+#ifdef BLIS_ENABLE_BLAS
+float sdot_
+     (
+       const f77_int* n,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy
+     )
+{
+  return sdot_blis_impl( n, x, incx, y, incy );
+}
+#endif
+double ddot_blis_impl
      (
        const f77_int* n,
        const double*   x, const f77_int* incx,
@@ -200,19 +259,23 @@ double ddot_
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1);
     AOCL_DTL_LOG_DOTV_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', *n, *incx, *incy);
-    dim_t  n0;
+    dim_t  n_elem;
     double* x0;
     double* y0;
     inc_t  incx0;
     inc_t  incy0;
-    double  rho;
-
-    /* Initialize BLIS. */
-//  bli_init_auto();
+    double  rho = 0.0;
 
-    /* Convert/typecast negative values of n to zero. */
-    if ( *n < 0 ) n0 = ( dim_t )0;
-    else              n0 = ( dim_t )(*n);
+    // BLAS Exception: Return early when n <= 0.
+    if((*n) <= 0)
+    {
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+        return 0.0;
+    }
+    else
+    {
+        n_elem = ( dim_t )(*n);
+    }
 
     /* If the input increments are negative, adjust the pointers so we can
        use positive increments instead. */
@@ -232,7 +295,7 @@ double ddot_
         pass in the address to the (n-1)th (i.e., the bottom-most or
         right-most) element along with a negative stride. */
 
-        x0    = ((double*)x) + (n0-1)*(-*incx);
+        x0    = ((double*)x) + (n_elem-1)*(-*incx);
         incx0 = ( inc_t )(*incx);
 
     }
@@ -244,7 +307,7 @@ double ddot_
 
     if ( *incy < 0 )
     {
-        y0    = ((double*)y) + (n0-1)*(-*incy);
+        y0    = ((double*)y) + (n_elem-1)*(-*incy);
         incy0 = ( inc_t )(*incy);
 
     }
@@ -254,46 +317,235 @@ double ddot_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // Declaration of the kernel function pointer
+    ddotv_ker_ft dotv_ker_ptr;
+
+    cntx_t *cntx = NULL;
+
+    // Query the architecture ID
+    arch_t arch_id_local = bli_arch_query_id();
+
+    // Pick the kernel based on the architecture ID
+    switch (arch_id_local)
+    {
+      case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+
+        // AVX-512 Kernel
+        dotv_ker_ptr = bli_ddotv_zen_int_avx512;
+        break;
+#endif
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+
+          // AVX2 Kernel
+          dotv_ker_ptr = bli_ddotv_zen_int10;
+          break;
+
+      default:
+
+          // Query the context
+          cntx = bli_gks_query_cntx();
+
+          // Query the function pointer using the context
+          dotv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_DOTV_KER, cntx);
+    }
+
+#ifdef BLIS_ENABLE_OPENMP
+    /*
+      Initialize the number of threads to one
+      to avoid compiler warnings.
+    */
+    dim_t nt = 1;
+
+    /*
+      For the given problem size and architecture, the function
+      returns the optimum number of threads with AOCL dynamic enabled
+      else it returns the number of threads requested by the user.
+    */
+    bli_nthreads_l1
+    (
+      BLIS_DOTV_KER,
+      BLIS_DOUBLE,
+      BLIS_DOUBLE,
+      arch_id_local,
+      n_elem,
+      &nt
+    );
+
+    /*
+      If the number of optimum threads is 1, the OpenMP overhead
+      is avoided by calling the function directly
+    */
+    if (nt == 1)
     {
-        /* Call BLIS kernel. */
-        bli_ddotv_zen_int10
+#endif
+        dotv_ker_ptr
         (
-        BLIS_NO_CONJUGATE,
-        BLIS_NO_CONJUGATE,
-        n0,
-        x0, incx0,
-        y0, incy0,
-        &rho,
-        NULL
+          BLIS_NO_CONJUGATE,
+          BLIS_NO_CONJUGATE,
+          n_elem,
+          x0, incx0,
+          y0, incy0,
+          &rho,
+          cntx
         );
+
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+
+        return rho;
+#ifdef BLIS_ENABLE_OPENMP
+    }
+
+    /*
+      Here we know that more than one thread needs to be spawned.
+
+      In such a case, each thread will need its own rho value to
+      do the accumulation. These temporary rho's will be accumulated
+      in the end.
+    */
+    rntm_t rntm;
+    mem_t mem_buf_rho;
+    double *rho_temp = NULL;
+    rho = 0.0;
+
+    /*
+      Initialize the mem pool buffer to NULL and its size to 0.
+      The "buf" and "size" fields are assigned once memory
+      is allocated from the pool in bli_membrk_acquire_m().
+      This ensures that bli_mem_is_alloc() is only ever passed
+      either an allocated buffer or a NULL one.
+    */
+    mem_buf_rho.pblk.buf = NULL;
+    mem_buf_rho.pblk.block_size = 0;
+    mem_buf_rho.buf_type = 0;
+    mem_buf_rho.size = 0;
+    mem_buf_rho.pool = NULL;
+
+    /*
+        Getting a buffer from the pool via rntm requires access
+        to the memory broker. The following calls initialize
+        the rntm accordingly.
+    */
+    bli_rntm_init_from_global(&rntm);
+    bli_rntm_set_num_threads_only(1, &rntm);
+    bli_membrk_rntm_set_membrk(&rntm);
+
+    // Calculate the size required for rho buffer.
+    size_t buffer_size = nt * sizeof(double);
+
+#ifdef BLIS_ENABLE_MEM_TRACING
+    printf("bli_ddotv_unf_var1(): get mem pool block\n");
+#endif
+
+    /*
+      Acquire a buffer (nt * size(double)) from the memory broker
+      and save the associated mem_t entry to mem_buf_rho.
+    */
+    bli_membrk_acquire_m(&rntm,
+                         buffer_size,
+                         BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+                         &mem_buf_rho);
+
+    /* Continue if rho buffer memory is allocated. */
+    if ((bli_mem_is_alloc(&mem_buf_rho)))
+    {
+        rho_temp = bli_mem_buffer(&mem_buf_rho);
+
+        /*
+          This is done to handle cases when the
+          number of threads launched is not equal
+          to the number of threads requested. In
+          such cases, the garbage value in the created
+          buffer will not be overwritten by valid values.
+
+          This will ensure that garbage value will
+          not get accumulated with the final result.
+        */
+        for (dim_t i = 0; i < nt; i++)
+          rho_temp[i] = 0.0;
     }
     else
     {
-        /* Call BLIS interface. */
-        PASTEMAC2(d,dotv,BLIS_TAPI_EX_SUF)
+        nt = 1;
+        rho_temp = &rho;
+    }
+
+    _Pragma("omp parallel num_threads(nt)")
+    {
+        dim_t start, length;
+
+        // Get the thread ID
+        dim_t thread_id = omp_get_thread_num();
+
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
+        bli_thread_vector_partition
         (
-        BLIS_NO_CONJUGATE,
-        BLIS_NO_CONJUGATE,
-        n0,
-        x0, incx0,
-        y0, incy0,
-        &rho,
-        NULL,
-        NULL
+          n_elem,
+          nt_use,
+          &start, &length,
+          thread_id
         );
+
+        // Adjust the local pointer for computation
+        double *x_thread_local = x0 + (start * incx0);
+        double *y_thread_local = y0 + (start * incy0);
+
+        // Invoke the function based on the kernel function pointer
+        dotv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          BLIS_NO_CONJUGATE,
+          length,
+          x_thread_local, incx0,
+          y_thread_local, incy0,
+          rho_temp + thread_id,
+          cntx
+        );
+    }
+
+    /*
+      Accumulate the values in rho_temp only when mem is allocated.
+      When the memory cannot be allocated, rho_temp points directly
+      to rho, so no accumulation is needed.
+    */
+    if (bli_mem_is_alloc(&mem_buf_rho))
+    {
+        // Accumulating the nt thread outputs to rho
+        for (dim_t i = 0; i < nt; i++)
+          rho += rho_temp[i];
+
+        // Releasing the allocated memory if it was allocated
+        bli_membrk_release(&rntm, &mem_buf_rho);
     }
+#endif
 
     /* Finalize BLIS. */
-//  bli_finalize_auto();
+    //  bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
     return rho;
 }
+#ifdef BLIS_ENABLE_BLAS
+double ddot_
+     (
+       const f77_int* n,
+       const double*   x, const f77_int* incx,
+       const double*   y, const f77_int* incy
+     )
+{
+  return ddot_blis_impl( n, x, incx, y, incy );
+}
+#endif
 
 #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
-scomplex cdotu_
+scomplex cdotu_blis_impl
      (
        const f77_int* n,
        const scomplex*   x, const f77_int* incx,
@@ -356,9 +608,9 @@ scomplex cdotu_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         /* Call BLIS kernel. */
         bli_cdotv_zen_int5
@@ -393,8 +645,18 @@ scomplex cdotu_
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
     return rho;
 }
-
-dcomplex zdotu_
+#ifdef BLIS_ENABLE_BLAS
+scomplex cdotu_
+     (
+       const f77_int* n,
+       const scomplex*   x, const f77_int* incx,
+       const scomplex*   y, const f77_int* incy
+     )
+{
+  return cdotu_blis_impl( n, x, incx, y, incy );
+}
+#endif
+dcomplex zdotu_blis_impl
      (
        const f77_int* n,
        const dcomplex*   x, const f77_int* incx,
@@ -458,9 +720,9 @@ dcomplex zdotu_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         /* Call BLIS kernel. */
         bli_zdotv_zen_int5
@@ -497,9 +759,18 @@ dcomplex zdotu_
 
     return rho;
 }
-
-
-scomplex cdotc_
+#ifdef BLIS_ENABLE_BLAS
+dcomplex zdotu_
+     (
+       const f77_int* n,
+       const dcomplex*   x, const f77_int* incx,
+       const dcomplex*   y, const f77_int* incy
+     )
+{
+  return zdotu_blis_impl( n, x, incx, y, incy );
+}
+#endif
+scomplex cdotc_blis_impl
      (
        const f77_int* n,
        const scomplex*   x, const f77_int* incx,
@@ -563,9 +834,9 @@ scomplex cdotc_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         /* Call BLIS kernel. */
         bli_cdotv_zen_int5
@@ -601,8 +872,18 @@ scomplex cdotc_
 
     return rho;
 }
-
-dcomplex zdotc_
+#ifdef BLIS_ENABLE_BLAS
+scomplex cdotc_
+     (
+       const f77_int* n,
+       const scomplex*   x, const f77_int* incx,
+       const scomplex*   y, const f77_int* incy
+     )
+{
+  return cdotc_blis_impl( n, x, incx, y, incy );
+}
+#endif
+dcomplex zdotc_blis_impl
      (
        const f77_int* n,
        const dcomplex*   x, const f77_int* incx,
@@ -665,9 +946,9 @@ dcomplex zdotc_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         /* Call BLIS kernel. */
         bli_zdotv_zen_int5
@@ -697,10 +978,6 @@ dcomplex zdotc_
         );
     }
 
-
-
-
-
     /* Finalize BLIS. */
 //  bli_finalize_auto();
 
@@ -708,13 +985,24 @@ dcomplex zdotc_
 
     return rho;
 }
+#ifdef BLIS_ENABLE_BLAS
+dcomplex zdotc_
+     (
+       const f77_int* n,
+       const dcomplex*   x, const f77_int* incx,
+       const dcomplex*   y, const f77_int* incy
+     )
+{
+  return zdotc_blis_impl( n, x, incx, y, incy );
+}
+#endif
 
 #else // BLIS_DISABLE_COMPLEX_RETURN_INTEL
 // For the "intel" complex return type, use a hidden parameter to return the result
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF772(ch,blasname,chc) \
+void PASTEF772S(ch,blasname,chc) \
      ( \
        ftype*         rhop, \
        const f77_int* n, \
@@ -760,18 +1048,29 @@ void PASTEF772(ch,blasname,chc) \
         bli_finalize_auto(); \
 \
         *rhop = rho; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF772(ch,blasname,chc) \
+     ( \
+       ftype*         rhop, \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF772S(ch,blasname,chc)( rhop, n, x, incx, y, incy );\
+} \
+)
 
 INSERT_GENTFUNCDOTC_BLAS( dot, dotv )
 #endif // BLIS_DISABLE_COMPLEX_RETURN_INTEL
 
-
-
 // -- "Black sheep" dot product function definitions --
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in single precision.
-float PASTEF77(sd,sdot)
+float PASTEF77S(sd,sdot)
      (
        const f77_int* n,
        const float*   sb,
@@ -782,7 +1081,7 @@ float PASTEF77(sd,sdot)
     return ( float )
            (
              ( double )(*sb) +
-             PASTEF77(d,sdot)
+             PASTEF77S(d,sdot)
              (
                n,
                x, incx,
@@ -790,10 +1089,22 @@ float PASTEF77(sd,sdot)
              )
            );
 }
+#ifdef BLIS_ENABLE_BLAS
+float PASTEF77(sd,sdot)
+     (
+       const f77_int* n,
+       const float*   sb,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy
+     )
+{
+  return PASTEF77S(sd,sdot)( n,sb, x, incx, y, incy );
+}
+#endif
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in double precision.
-double PASTEF77(d,sdot)
+double PASTEF77S(d,sdot)
      (
        const f77_int* n,
        const float*   x, const f77_int* incx,
@@ -838,4 +1149,14 @@ double PASTEF77(d,sdot)
     return rho;
 }
 
-#endif
+#ifdef BLIS_ENABLE_BLAS
+double PASTEF77(d,sdot)
+     (
+       const f77_int* n,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy
+     )
+{
+  return PASTEF77S(d,sdot)( n, x, incx, y, incy );
+}
+#endif // BLIS_ENABLE_BLAS
diff --git a/frame/compat/bla_gemm.c b/frame/compat/bla_gemm.c
index 931c80243a..e7576096cd 100644
--- a/frame/compat/bla_gemm.c
+++ b/frame/compat/bla_gemm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -44,7 +44,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -97,6 +97,32 @@ void PASTEF77(ch,blasname) \
 	  bli_finalize_auto(); \
 	  return; \
 	} \
+\
+	/* If alpha is zero scale C by beta and return early. */ \
+	if( PASTEMAC(ch,eq0)( *alpha )) \
+	{ \
+	  bli_convert_blas_dim1(*m, m0); \
+	  bli_convert_blas_dim1(*n, n0); \
+	  const inc_t rs_c = 1; \
+	  const inc_t cs_c = *ldc; \
+\
+	  PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+	              0, \
+	              BLIS_NONUNIT_DIAG, \
+	              BLIS_DENSE, \
+	              m0, \
+	              n0, \
+	              (ftype*) beta, \
+	              (ftype*) c, rs_c, cs_c, \
+	              NULL, NULL \
+	            ); \
+\
+	  AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+	  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+	  /* Finalize BLIS. */ \
+	  bli_finalize_auto(); \
+	  return; \
+	} \
 \
 	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
 	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
@@ -132,18 +158,36 @@ void PASTEF77(ch,blasname) \
 	  NULL  \
 	); \
 \
-	AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+	AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);\
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
 	/* Finalize BLIS. */				 \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+	     ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -197,6 +241,32 @@ void PASTEF77(ch,blasname) \
 	  bli_finalize_auto(); \
 	  return; \
 	} \
+\
+	/* If alpha is zero scale C by beta and return early. */ \
+	if( PASTEMAC(ch,eq0)( *alpha )) \
+	{ \
+	  bli_convert_blas_dim1(*m, m0); \
+	  bli_convert_blas_dim1(*n, n0); \
+	  const inc_t rs_c = 1; \
+	  const inc_t cs_c = *ldc; \
+\
+	  PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+                   0, \
+                   BLIS_NONUNIT_DIAG, \
+                   BLIS_DENSE, \
+                   m0, \
+                   n0, \
+                   (ftype*) beta, \
+                   (ftype*) c, rs_c, cs_c, \
+                   NULL, NULL \
+                 ); \
+\
+	  AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+	  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+	  /* Finalize BLIS. */ \
+	  bli_finalize_auto(); \
+	  return; \
+	} \
 \
 	/* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
 	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
@@ -246,6 +316,9 @@ void PASTEF77(ch,blasname) \
 					); \
 		} \
 		AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+		/* Finalize BLIS. */ \
+  		bli_finalize_auto(); \
 		return; \
 	} \
 	else if( m0 == 1 ) \
@@ -279,6 +352,9 @@ void PASTEF77(ch,blasname) \
 					); \
 		} \
 		AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+		/* Finalize BLIS. */ \
+  		bli_finalize_auto(); \
 		return; \
 	} \
 \
@@ -315,10 +391,28 @@ void PASTEF77(ch,blasname) \
 	); \
 \
 	AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
 	/* Finalize BLIS. */				 \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+	     ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 #endif
 
 #ifdef BLIS_ENABLE_BLAS
@@ -376,6 +470,32 @@ void dzgemm_
 	  return;
 	}
 
+	/* If alpha is zero scale C by beta and return early. */
+	if( PASTEMAC(z,eq0)( *alpha ))
+	{
+	  bli_convert_blas_dim1(*m, m0);
+	  bli_convert_blas_dim1(*n, n0);
+	  const inc_t rs_c = 1;
+	  const inc_t cs_c = *ldc;
+
+	  PASTEMAC2(z,scalm,_ex)( BLIS_NO_CONJUGATE,
+	            0,
+	            BLIS_NONUNIT_DIAG,
+	            BLIS_DENSE,
+	            m0,
+	            n0,
+	            (dcomplex*) beta,
+	            (dcomplex*) c, rs_c, cs_c,
+	            NULL, NULL
+	  );
+
+	  AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+	  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+	  /* Finalize BLIS. */
+	  bli_finalize_auto();
+	  return;
+	}
+
 	/* Map BLAS chars to their corresponding BLIS enumerated type value. */
 	bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
 	bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
@@ -418,11 +538,12 @@ void dzgemm_
 	bli_obj_set_conjtrans( blis_transa, &ao );
 	bli_obj_set_conjtrans( blis_transb, &bo );
 
-		// fall back on native path when zgemm is not handled in sup path.
+	// fall back on native path when zgemm is not handled in sup path.
 	bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
 
 
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+	AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 	/* Finalize BLIS. */
 	bli_finalize_auto();
 }// end of dzgemm_
diff --git a/frame/compat/bla_gemm.h b/frame/compat/bla_gemm.h
index c9ea83149a..8956387c54 100644
--- a/frame/compat/bla_gemm.h
+++ b/frame/compat/bla_gemm.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,22 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -54,8 +69,9 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
+INSERT_GENTPROT_BLAS( gemm )
+
 #ifdef BLIS_ENABLE_BLAS
-#if 1
 BLIS_EXPORT_BLAS void dzgemm_
      (
        const f77_char* transa, \
@@ -69,8 +85,22 @@ BLIS_EXPORT_BLAS void dzgemm_
        const dcomplex*    beta, \
              dcomplex*    c, const f77_int* ldc  \
      );
-#endif
-INSERT_GENTPROT_BLAS( gemm )
 
 #endif
+BLIS_EXPORT_BLAS void dzgemm_blis_impl
+     (
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const dcomplex*    alpha, \
+       const double*    a, const f77_int* lda, \
+       const dcomplex*    b, const f77_int* ldb, \
+       const dcomplex*    beta, \
+             dcomplex*    c, const f77_int* ldc  \
+     );
+
+
+
 
diff --git a/frame/compat/bla_gemm3m.c b/frame/compat/bla_gemm3m.c
index 665c8643dd..612e8631c1 100644
--- a/frame/compat/bla_gemm3m.c
+++ b/frame/compat/bla_gemm3m.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -44,7 +44,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -131,14 +131,32 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */                 \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -240,11 +258,27 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
     /* Finalize BLIS. */                 \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS_CZ( gemm3m, gemm )
-#endif
 
diff --git a/frame/compat/bla_gemm3m.h b/frame/compat/bla_gemm3m.h
index 1063d85c03..6b33e40319 100644
--- a/frame/compat/bla_gemm3m.h
+++ b/frame/compat/bla_gemm3m.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +39,22 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -53,7 +68,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
-INSERT_GENTPROT_BLAS( gemm3m )
-#endif
+INSERT_GENTPROT_BLAS_CZ( gemm3m )
 
diff --git a/frame/compat/bla_gemm_amd.c b/frame/compat/bla_gemm_amd.c
index a9478581ef..afbecd2a58 100644
--- a/frame/compat/bla_gemm_amd.c
+++ b/frame/compat/bla_gemm_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -44,7 +44,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -97,6 +97,32 @@ void PASTEF77(ch,blasname) \
         bli_finalize_auto(); \
         return; \
     } \
+\
+    /* If alpha is zero scale C by beta and return early. */ \
+    if( PASTEMAC(ch,eq0)( *alpha )) \
+    { \
+        bli_convert_blas_dim1(*m, m0); \
+        bli_convert_blas_dim1(*n, n0); \
+        const inc_t rs_c = 1; \
+        const inc_t cs_c = *ldc; \
+\
+        PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+                   0, \
+                   BLIS_NONUNIT_DIAG, \
+                   BLIS_DENSE, \
+                   m0, \
+                   n0, \
+                   (ftype*) beta, \
+                   (ftype*) c, rs_c, cs_c, \
+                   NULL, NULL \
+                 ); \
+\
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+        /* Finalize BLIS. */ \
+        bli_finalize_auto(); \
+        return; \
+    } \
 \
     /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
     bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
@@ -132,18 +158,37 @@ void PASTEF77(ch,blasname) \
       NULL  \
     ); \
 \
-    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);\
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
     /* Finalize BLIS. */                 \
     bli_finalize_auto(); \
-}
+} \
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+         ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_char* transb, \
@@ -197,6 +242,32 @@ void PASTEF77(ch,blasname) \
         bli_finalize_auto(); \
         return; \
     } \
+\
+    /* If alpha is zero scale C by beta and return early. */ \
+    if( PASTEMAC(ch,eq0)( *alpha )) \
+    { \
+        bli_convert_blas_dim1(*m, m0); \
+        bli_convert_blas_dim1(*n, n0); \
+        const inc_t rs_c = 1; \
+        const inc_t cs_c = *ldc; \
+\
+        PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+                   0, \
+                   BLIS_NONUNIT_DIAG, \
+                   BLIS_DENSE, \
+                   m0, \
+                   n0, \
+                   (ftype*) beta, \
+                   (ftype*) c, rs_c, cs_c, \
+                   NULL, NULL \
+                 ); \
+\
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+        /* Finalize BLIS. */ \
+        bli_finalize_auto(); \
+        return; \
+    } \
 \
     /* Map BLAS chars to their corresponding BLIS enumerated type value. */ \
     bli_param_map_netlib_to_blis_trans( *transa, &blis_transa ); \
@@ -246,6 +317,9 @@ void PASTEF77(ch,blasname) \
                     ); \
         } \
         AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+        /* Finalize BLIS. */ \
+        bli_finalize_auto(); \
         return; \
     } \
     else if( m0 == 1 ) \
@@ -279,6 +353,9 @@ void PASTEF77(ch,blasname) \
                     ); \
         } \
         AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
+        /* Finalize BLIS. */ \
+        bli_finalize_auto(); \
         return; \
     } \
 \
@@ -315,14 +392,33 @@ void PASTEF77(ch,blasname) \
     ); \
 \
     AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k); \
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
     /* Finalize BLIS. */                 \
     bli_finalize_auto(); \
-}
+} \
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+         ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
+
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
-void dgemm_
+void dgemm_blis_impl
 (
     const f77_char* transa,
     const f77_char* transb,
@@ -336,46 +432,69 @@ void dgemm_
     double* c, const f77_int* ldc
 )
 {
+    trans_t blis_transa;
+    trans_t blis_transb;
+    dim_t   m0, n0, k0;
 
+    /* Initialize BLIS. */
+    bli_init_auto();
 
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
+                             (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
 
-  trans_t blis_transa;
-  trans_t blis_transb;
-  dim_t   m0, n0, k0;
-
-  /* Initialize BLIS. */
-  bli_init_auto();
+    /* Perform BLAS parameter checking. */
+    PASTEBLACHK(gemm)
+      (
+       MKSTR(d),
+       MKSTR(gemm),
+       transa,
+       transb,
+       m,
+       n,
+       k,
+       lda,
+       ldb,
+       ldc
+      );
 
-  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
-  AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(d), *transa, *transb, *m, *n, *k, \
-                           (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
+    /* Quick return if possible. */
+    if ( *m == 0 || *n == 0 || ((*alpha == 0.0 || *k == 0) && *beta == 1.0))
+    {
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
+    }
 
-  /* Perform BLAS parameter checking. */
-  PASTEBLACHK(gemm)
-    (
-     MKSTR(d),
-     MKSTR(gemm),
-     transa,
-     transb,
-     m,
-     n,
-     k,
-     lda,
-     ldb,
-     ldc
-    );
+    /* If alpha is zero scale C by beta and return early. */
+    if( PASTEMAC(d,eq0)( *alpha ))
+    {
+        bli_convert_blas_dim1(*m, m0);
+        bli_convert_blas_dim1(*n, n0);
+        const inc_t rs_c = 1;
+        const inc_t cs_c = *ldc;
+
+        PASTEMAC2(d,scalm,_ex)( BLIS_NO_CONJUGATE,
+                   0,
+                   BLIS_NONUNIT_DIAG,
+                   BLIS_DENSE,
+                   m0,
+                   n0,
+                   (double*) beta,
+                   (double*) c, rs_c, cs_c,
+                   NULL, NULL
+                 );
 
-  /* Quick return if possible. */
-  if ( *m == 0 || *n == 0 || ((*alpha == 0.0 || *k == 0) && *beta == 1.0))
-  {
-      AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-      /* Finalize BLIS. */
-      bli_finalize_auto();
-      return;
-  }
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
+    }
 
-    /* Map BLAS chars to their corresponding BLIS enumerated type value. */
+  /* Map BLAS chars to their corresponding BLIS enumerated type value. */
   bli_param_map_netlib_to_blis_trans(*transa, &blis_transa);
   bli_param_map_netlib_to_blis_trans(*transb, &blis_transb);
 
@@ -393,9 +512,9 @@ void dgemm_
     const inc_t rs_c = 1;
     const inc_t cs_c = *ldc;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         // This code is duplicated below, however we don't want to move it out of
         // this IF block as it will affect the performance on Zen architetures
@@ -437,7 +556,6 @@ void dgemm_
         );
 
         AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
         /* Finalize BLIS. */
         bli_finalize_auto();
@@ -446,88 +564,92 @@ void dgemm_
 
     if((k0 == 1) && bli_is_notrans(blis_transa) && bli_is_notrans(blis_transb))
     {
-    bli_dgemm_ref_k1_nn( m0, n0, k0,
-              (double*)alpha,
-              (double*)a, *lda,
-              (double*)b, *ldb,
-              (double*)beta,
-              c, *ldc
-            );
-    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-    /* Finalize BLIS */
-    bli_finalize_auto();
-
-    return;
+        bli_dgemm_8x6_avx2_k1_nn( m0, n0, k0,
+                  (double*)alpha,
+                  (double*)a, *lda,
+                  (double*)b, *ldb,
+                  (double*)beta,
+                  c, *ldc
+                );
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS */
+        bli_finalize_auto();
+        return;
     }
 
     if (n0 == 1)
     {
-    if (bli_is_notrans(blis_transa))
-    {
-        bli_dgemv_unf_var2(
-        BLIS_NO_TRANSPOSE,
-        bli_extract_conj(blis_transb),
-        m0, k0,
-        (double*)alpha,
-        (double*)a, rs_a, cs_a,
-        (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
-        (double*)beta,
-        c, rs_c,
-        ((void*)0)
-        );
-    }
-    else
-    {
-        bli_dgemv_unf_var1(
-        blis_transa,
-        bli_extract_conj(blis_transb),
-        k0, m0,
-        (double*)alpha,
-        (double*)a, rs_a, cs_a,
-        (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
-        (double*)beta,
-        c, rs_c,
-        ((void*)0)
-        );
-    }
-
-    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        if (bli_is_notrans(blis_transa))
+        {
+            bli_dgemv_unf_var2(
+            BLIS_NO_TRANSPOSE,
+            bli_extract_conj(blis_transb),
+            m0, k0,
+            (double*)alpha,
+            (double*)a, rs_a, cs_a,
+            (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
+            (double*)beta,
+            c, rs_c,
+            ((void*)0)
+            );
+        }
+        else
+        {
+            bli_dgemv_unf_var1(
+            blis_transa,
+            bli_extract_conj(blis_transb),
+            k0, m0,
+            (double*)alpha,
+            (double*)a, rs_a, cs_a,
+            (double*)b, bli_is_notrans(blis_transb) ? rs_b : cs_b,
+            (double*)beta,
+            c, rs_c,
+            ((void*)0)
+            );
+        }
 
-    return;
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS */
+        bli_finalize_auto();
+        return;
     }
     else if (m0 == 1)
     {
-    if (bli_is_notrans(blis_transb))
-    {
-        bli_dgemv_unf_var1(
-        blis_transb,
-        bli_extract_conj(blis_transa),
-        n0, k0,
-        (double*)alpha,
-        (double*)b, cs_b, rs_b,
-        (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
-        (double*)beta,
-        c, cs_c,
-        ((void*)0)
-        );
-    }
-    else
-    {
-        bli_dgemv_unf_var2(
-        blis_transb,
-        bli_extract_conj(blis_transa),
-        k0, n0,
-        (double*)alpha,
-        (double*)b, cs_b, rs_b,
-        (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
-        (double*)beta,
-        c, cs_c,
-        ((void*)0)
-        );
-    }
-    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-    return;
+        if (bli_is_notrans(blis_transb))
+        {
+            bli_dgemv_unf_var1(
+            blis_transb,
+            bli_extract_conj(blis_transa),
+            n0, k0,
+            (double*)alpha,
+            (double*)b, cs_b, rs_b,
+            (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
+            (double*)beta,
+            c, cs_c,
+            ((void*)0)
+            );
+        }
+        else
+        {
+            bli_dgemv_unf_var2(
+            blis_transb,
+            bli_extract_conj(blis_transa),
+            k0, n0,
+            (double*)alpha,
+            (double*)b, cs_b, rs_b,
+            (double*)a, bli_is_notrans(blis_transa) ? cs_a : rs_a,
+            (double*)beta,
+            c, cs_c,
+            ((void*)0)
+            );
+        }
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS */
+        bli_finalize_auto();
+        return;
     }
 
     const num_t dt = BLIS_DOUBLE;
@@ -556,35 +678,35 @@ void dgemm_
 
     //cntx_t* cntx = bli_gks_query_cntx();
     //dim_t nt = bli_thread_get_num_threads(); // get number of threads
-    bool nt = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
+    bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel dgemm is invoked.
 
 #ifdef AOCL_DYNAMIC
-    //For smaller sizes dgemm_small is perfoming better
-    if (nt && (((m0 >32) || (n0>32) || (k0>32)) && ((m0+n0+k0)>150)) )
+    //For smaller sizes dgemm_small is performing better
+    if (is_parallel && (((m0 >32) || (n0>32) || (k0>32)) && ((m0+n0+k0)>150)) )
 #else
-    if (nt)
+    if (is_parallel)
 #endif
     {
-    // Will call parallelized dgemm code - sup & native
-    PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
-    (
-        &alphao,
-        &ao,
-        &bo,
-        &betao,
-        &co,
-        NULL,
-        NULL
-        );
-    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        // Will call parallelized dgemm code - sup & native
+        PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
+        (
+            &alphao,
+            &ao,
+            &bo,
+            &betao,
+            &co,
+            NULL,
+            NULL
+            );
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
 
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-    /* Finalize BLIS. */
-    bli_finalize_auto();
-    return;
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
     }
 
-    // The code below will be called when number of threads = 1.
+    // The code below runs for single-threaded calls and, when AOCL_DYNAMIC is defined, for small parallel problems.
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
 
@@ -623,7 +745,6 @@ void dgemm_
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
         /* Finalize BLIS. */
         bli_finalize_auto();
-
         return;
       }
       }
@@ -634,6 +755,9 @@ void dgemm_
     if (status == BLIS_SUCCESS)
     {
         AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS */
+        bli_finalize_auto();
         return;
     }
 
@@ -657,8 +781,25 @@ void dgemm_
     /* Finalize BLIS. */
     bli_finalize_auto();
 } // end of dgemm_
-
-void zgemm_
+#ifdef BLIS_ENABLE_BLAS
+void dgemm_ // Thin wrapper, defined only when BLIS_ENABLE_BLAS is set; all work is done by dgemm_blis_impl.
+(
+    const f77_char* transa,
+    const f77_char* transb,
+    const f77_int* m,
+    const f77_int* n,
+    const f77_int* k,
+    const double* alpha,
+    const double* a, const f77_int* lda,
+    const double* b, const f77_int* ldb,
+    const double* beta,
+    double* c, const f77_int* ldc
+)
+{
+    dgemm_blis_impl(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); // Forward every argument unchanged, in the same order.
+}
+#endif // BLIS_ENABLE_BLAS
+void zgemm_blis_impl
      (
        const f77_char* transa,
        const f77_char* transb,
@@ -672,18 +813,18 @@ void zgemm_
              dcomplex*    c, const f77_int* ldc
      )
 {
-  trans_t blis_transa;
-  trans_t blis_transb;
-  dim_t   m0, n0, k0;
+    trans_t blis_transa;
+    trans_t blis_transb;
+    dim_t   m0, n0, k0;
 
-  /* Initialize BLIS. */
-  bli_init_auto();
+    /* Initialize BLIS. */
+    bli_init_auto();
 
-  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
-  AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_GEMM_INPUTS(AOCL_DTL_LEVEL_TRACE_1, *MKSTR(z), *transa, *transb, *m, *n, *k,
         (void*)alpha, *lda, *ldb, (void*)beta, *ldc);
 
-  /* Perform BLAS parameter checking. */
+    /* Perform BLAS parameter checking. */
     PASTEBLACHK(gemm)
     (
       MKSTR(z),
@@ -709,6 +850,32 @@ void zgemm_
         return;
     }
 
+    /* If alpha is zero scale C by beta and return early. */
+    if( PASTEMAC(z,eq0)( *alpha ))
+    {
+        bli_convert_blas_dim1(*m, m0);
+        bli_convert_blas_dim1(*n, n0);
+        const inc_t rs_c = 1;
+        const inc_t cs_c = *ldc;
+
+        PASTEMAC2(z,scalm,_ex)( BLIS_NO_CONJUGATE,
+                   0,
+                   BLIS_NONUNIT_DIAG,
+                   BLIS_DENSE,
+                   m0,
+                   n0,
+                   (dcomplex*) beta,
+                   (dcomplex*) c, rs_c, cs_c,
+                   NULL, NULL
+                 );
+
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
+    }
+
     /* Map BLAS chars to their corresponding BLIS enumerated type value. */
     bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
     bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
@@ -750,23 +917,47 @@ void zgemm_
     bli_obj_set_conjtrans( blis_transa, &ao );
     bli_obj_set_conjtrans( blis_transb, &bo );
 
-    // default instance peformance tuning is done in zgemm.
+    // default instance performance tuning is done in zgemm.
     // Single instance tuning is done based on env set.
     //dim_t single_instance = bli_env_get_var( "BLIS_SINGLE_INSTANCE", -1 );
 
     //dim_t nt = bli_thread_get_num_threads(); // get number of threads
-    bool nt = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
+    bool is_parallel = bli_thread_get_is_parallel(); // Check if parallel zgemm is invoked.
+
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
+    {
+
+        // Will call parallelized zgemm code - sup & native
+        PASTEMAC(gemm, BLIS_OAPI_EX_SUF)
+        (
+            &alphao,
+            &ao,
+            &bo,
+            &betao,
+            &co,
+            NULL,
+            NULL
+        );
+
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
+    }
 
     /*
     Invoking the API for input sizes with k=1.
     - For single thread, the API has no constraints before invoking.
     - For multiple threads, the constraint is that m and n should individually be less than 128.
     */
-    if((k0 == 1) && ((nt == 0) || ((nt == 1) && (m0 < 128) && (n0 < 128)))
+    if((k0 == 1) && ((!is_parallel) || ((is_parallel) && (m0 < 128) && (n0 < 128)))
         && bli_is_notrans(blis_transa)
         && bli_is_notrans(blis_transb))
     {
-        bli_zgemm_ref_k1_nn( m0, n0, k0,
+        bli_zgemm_4x6_avx2_k1_nn( m0, n0, k0,
                             (dcomplex*)alpha,
                             (dcomplex*)a, *lda,
                             (dcomplex*)b, *ldb,
@@ -776,7 +967,6 @@ void zgemm_
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
         /* Finalize BLIS */
         bli_finalize_auto();
-
         return;
     }
 
@@ -796,7 +986,9 @@ void zgemm_
                 c, rs_c,
                 ((void *)0));
             AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-
+            AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+            /* Finalize BLIS. */
+            bli_finalize_auto();
             return;
         }
     }
@@ -815,14 +1007,17 @@ void zgemm_
                 c, cs_c,
                 ((void *)0));
             AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+            AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+            /* Finalize BLIS. */
+            bli_finalize_auto();
             return;
         }
     }
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX
 
-    if (((nt == 0) && ((m0 <= 512) && (n0 <= 512) && (k0 <= 512))) ||
-        ((nt == 1) && (((m0 <= 32) || (n0 <= 32) || (k0 <= 32)) && ((m0 + n0 + k0) <= 100))))
+    if (((!is_parallel) && (((m0*k0) <= 16384) || ((n0*k0) <= 16384))) ||
+        ((is_parallel) && (((m0 <= 32) || (n0 <= 32) || (k0 <= 32)) && ((m0 + n0 + k0) <= 100))))
     {
         err_t status = BLIS_NOT_YET_IMPLEMENTED;
         if (bli_is_notrans(blis_transa))
@@ -858,35 +1053,46 @@ void zgemm_
         }
     }
 #endif
-
-    // disabling sup path for single thread in zgemm until further tuning.
-    if (nt == 1)
+ 
+    err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
+    if (status == BLIS_SUCCESS)
     {
-        err_t status = bli_gemmsup(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
-        if (status == BLIS_SUCCESS)
-        {
-            AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-            AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-            return;
-        }
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
     }
 
     // fall back on native path when zgemm is not handled in sup path.
     bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
-    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
-    return;
-
 
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
     /* Finalize BLIS. */
     bli_finalize_auto();
 }// end of zgemm_
-
-
+#ifdef BLIS_ENABLE_BLAS
+void zgemm_ // Thin wrapper, defined only when BLIS_ENABLE_BLAS is set; all work is done by zgemm_blis_impl.
+     (
+       const f77_char* transa,
+       const f77_char* transb,
+       const f77_int*  m,
+       const f77_int*  n,
+       const f77_int*  k,
+       const dcomplex*    alpha,
+       const dcomplex*    a, const f77_int* lda,
+       const dcomplex*    b, const f77_int* ldb,
+       const dcomplex*    beta,
+             dcomplex*    c, const f77_int* ldc
+     )
+{
+    zgemm_blis_impl(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); // Forward every argument unchanged, in the same order.
+}
+#endif // BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS_SC( gemm, gemm )
 
-void dzgemm_
+void dzgemm_blis_impl
      (
        const f77_char* transa,
        const f77_char* transb,
@@ -938,6 +1144,32 @@ void dzgemm_
         return;
     }
 
+    /* If alpha is zero scale C by beta and return early. */
+    if( PASTEMAC(z,eq0)( *alpha ))
+    {
+        bli_convert_blas_dim1(*m, m0);
+        bli_convert_blas_dim1(*n, n0);
+        const inc_t rs_c = 1;
+        const inc_t cs_c = *ldc;
+
+        PASTEMAC2(z,scalm,_ex)( BLIS_NO_CONJUGATE,
+                   0,
+                   BLIS_NONUNIT_DIAG,
+                   BLIS_DENSE,
+                   m0,
+                   n0,
+                   (dcomplex*) beta,
+                   (dcomplex*) c, rs_c, cs_c,
+                   NULL, NULL
+                 );
+
+        AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        /* Finalize BLIS. */
+        bli_finalize_auto();
+        return;
+    }
+
     /* Map BLAS chars to their corresponding BLIS enumerated type value. */
     bli_param_map_netlib_to_blis_trans( *transa, &blis_transa );
     bli_param_map_netlib_to_blis_trans( *transb, &blis_transb );
@@ -980,13 +1212,29 @@ void dzgemm_
     bli_obj_set_conjtrans( blis_transa, &ao );
     bli_obj_set_conjtrans( blis_transb, &bo );
 
-        // fall back on native path when zgemm is not handled in sup path.
+    // fall back on native path when zgemm is not handled in sup path.
     bli_gemmnat(&alphao, &ao, &bo, &betao, &co, NULL, NULL);
 
-
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_GEMM_STATS(AOCL_DTL_LEVEL_TRACE_1, *m, *n, *k);
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
     /* Finalize BLIS. */
     bli_finalize_auto();
 }// end of dzgemm_
-
+#ifdef BLIS_ENABLE_BLAS
+void dzgemm_ // Thin wrapper, defined only when BLIS_ENABLE_BLAS is set; all work is done by dzgemm_blis_impl.
+     (
+       const f77_char* transa,
+       const f77_char* transb,
+       const f77_int*  m,
+       const f77_int*  n,
+       const f77_int*  k,
+       const dcomplex*    alpha,
+       const double*    a, const f77_int* lda, // a is declared double here; alpha, b, beta and c are dcomplex.
+       const dcomplex*    b, const f77_int* ldb,
+       const dcomplex*    beta,
+             dcomplex*    c, const f77_int* ldc
+     )
+{
+    dzgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); // Forward every argument unchanged, in the same order.
+}
 #endif
diff --git a/frame/compat/bla_gemm_batch.c b/frame/compat/bla_gemm_batch.c
index 12a91d2bb7..0c41c0a0ce 100644
--- a/frame/compat/bla_gemm_batch.c
+++ b/frame/compat/bla_gemm_batch.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -44,7 +44,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa_array, \
        const f77_char* transb_array, \
@@ -134,14 +134,36 @@ void PASTEF77(ch,blasname) \
     } \
 \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa_array, \
+       const f77_char* transb_array, \
+       const f77_int*  m_array, \
+       const f77_int*  n_array, \
+       const f77_int*  k_array, \
+       const ftype*    alpha_array, \
+       const ftype**   a_array, const f77_int* lda_array, \
+       const ftype**   b_array, const f77_int* ldb_array, \
+       const ftype*    beta_array, \
+             ftype**   c_array, const f77_int* ldc_array, \
+       const f77_int*  group_count, \
+       const f77_int*  group_size \
+     ) \
+{ \
+	PASTEF77S(ch,blasname)( transa_array, transb_array, m_array, n_array, k_array, \
+				alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, \
+				c_array, ldc_array, group_count, group_size ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa_array, \
        const f77_char* transb_array, \
@@ -246,11 +268,31 @@ void PASTEF77(ch,blasname) \
 \
     /* Finalize BLIS. */  \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa_array, \
+       const f77_char* transb_array, \
+       const f77_int*  m_array, \
+       const f77_int*  n_array, \
+       const f77_int*  k_array, \
+       const ftype*    alpha_array, \
+       const ftype**   a_array, const f77_int* lda_array, \
+       const ftype**   b_array, const f77_int* ldb_array, \
+       const ftype*    beta_array, \
+             ftype**   c_array, const f77_int* ldc_array, \
+       const f77_int*  group_count, \
+       const f77_int*  group_size \
+     ) \
+{ \
+	PASTEF77S(ch,blasname)( transa_array, transb_array, m_array, n_array, k_array, \
+				alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, \
+				c_array, ldc_array, group_count, group_size ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( gemm_batch, gemm )
-#endif
 
diff --git a/frame/compat/bla_gemm_batch.h b/frame/compat/bla_gemm_batch.h
index f997f4b8ee..1aa3697037 100644
--- a/frame/compat/bla_gemm_batch.h
+++ b/frame/compat/bla_gemm_batch.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. 
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. 
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +39,24 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa_array, \
+       const f77_char* transb_array, \
+       const f77_int*  m_array, \
+       const f77_int*  n_array, \
+       const f77_int*  k_array, \
+       const ftype*    alpha_array, \
+       const ftype**   a_array, const f77_int* lda_array, \
+       const ftype**   b_array, const f77_int* ldb_array, \
+       const ftype*    beta_array, \
+             ftype**   c_array, const f77_int* ldc_array, \
+       const f77_int*  group_count, \
+       const f77_int*  group_size \
+     );\
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa_array, \
        const f77_char* transb_array, \
@@ -55,7 +72,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
        const f77_int*  group_size \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( gemm_batch )
-#endif
 
diff --git a/frame/compat/bla_gemmt.c b/frame/compat/bla_gemmt.c
index 7abad40acf..24a6d1324e 100644
--- a/frame/compat/bla_gemmt.c
+++ b/frame/compat/bla_gemmt.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -44,7 +44,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -134,14 +134,32 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */				 \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -247,10 +265,26 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
 	/* Finalize BLIS. */				 \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( gemmt, gemmt )
-#endif
diff --git a/frame/compat/bla_gemmt.h b/frame/compat/bla_gemmt.h
index 8043d68291..04d99d09ed 100644
--- a/frame/compat/bla_gemmt.h
+++ b/frame/compat/bla_gemmt.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +39,22 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_char* transb, \
+       const f77_int*  n, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -53,6 +68,4 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( gemmt )
-#endif
diff --git a/frame/compat/bla_gemv.c b/frame/compat/bla_gemv.c
index 9dba1b43c4..9bdc639d84 100644
--- a/frame/compat/bla_gemv.c
+++ b/frame/compat/bla_gemv.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,14 +35,10 @@
 
 #include "blis.h"
 
-
-//
-// Define BLAS-to-BLIS interfaces.
-//
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_int*  m, \
@@ -143,9 +139,24 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
-
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname) \
+  ( transa, m, n, alpha, a, lda, x, incx, beta, y, incy ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( gemv, gemv )
-#endif
diff --git a/frame/compat/bla_gemv.h b/frame/compat/bla_gemv.h
index 22c8bf1c07..ffce9f41d2 100644
--- a/frame/compat/bla_gemv.h
+++ b/frame/compat/bla_gemv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,21 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_int*  m, \
@@ -51,7 +66,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    y, const f77_int* incy  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( gemv )
-#endif
 
diff --git a/frame/compat/bla_gemv_amd.c b/frame/compat/bla_gemv_amd.c
index 354f45fe1b..c2d743e80d 100644
--- a/frame/compat/bla_gemv_amd.c
+++ b/frame/compat/bla_gemv_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* transa, \
        const f77_int*  m, \
@@ -143,11 +143,26 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1); \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
-
+}\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     ) \
+{ \
+  PASTEF77S(ch,blasname) \
+   ( transa, m, n, alpha, a, lda, x, incx, beta, y, incy ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
-void dgemv_
+void dgemv_blis_impl
      (
        const f77_char* transa,
        const f77_int*  m,
@@ -268,9 +283,9 @@ void dgemv_
     rs_a = 1;
     cs_a = *lda;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         /* Call BLIS interface. */
         PASTEMAC2(d,gemv,BLIS_TAPI_EX_SUF)
@@ -331,8 +346,24 @@ void dgemv_
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 }
-
-void sgemv_
+#ifdef BLIS_ENABLE_BLAS /* dgemv_ wrapper: compiled only when BLIS_ENABLE_BLAS is defined. */
+void dgemv_
+     (
+       const f77_char* transa,
+       const f77_int*  m,
+       const f77_int*  n,
+       const double*    alpha,
+       const double*    a, const f77_int* lda,
+       const double*    x, const f77_int* incx,
+       const double*    beta,
+             double*    y, const f77_int* incy
+     )
+{ /* Forward every argument, unchanged, to dgemv_blis_impl. */
+  dgemv_blis_impl( transa, m, n, alpha, a, lda,
+                        x, incx, beta, y, incy );
+}
+#endif /* BLIS_ENABLE_BLAS */
+void sgemv_blis_impl
      (
        const f77_char* transa,
        const f77_int*  m,
@@ -451,9 +482,9 @@ void sgemv_
     rs_a = 1;
     cs_a = *lda;
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
       /* Call BLIS interface. */
       PASTEMAC2(s,gemv,BLIS_TAPI_EX_SUF)
@@ -510,9 +541,24 @@ void sgemv_
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 }
-
-
-void cgemv_
+#ifdef BLIS_ENABLE_BLAS /* sgemv_ wrapper: compiled only when BLIS_ENABLE_BLAS is defined. */
+void sgemv_
+     (
+       const f77_char* transa,
+       const f77_int*  m,
+       const f77_int*  n,
+       const float*    alpha,
+       const float*    a, const f77_int* lda,
+       const float*    x, const f77_int* incx,
+       const float*    beta,
+             float*    y, const f77_int* incy
+     )
+{ /* Forward every argument, unchanged, to sgemv_blis_impl. */
+  sgemv_blis_impl( transa, m, n, alpha, a, lda, 
+                        x, incx, beta, y, incy ); 
+}
+#endif /* BLIS_ENABLE_BLAS */
+void cgemv_blis_impl
      (
        const f77_char* transa,
        const f77_int*  m,
@@ -627,7 +673,7 @@ void cgemv_
     {
         conj_t conja = bli_extract_conj(blis_transa);
         scomplex rho;
-        if (bli_cpuid_is_avx_supported() == TRUE)
+        if (bli_cpuid_is_avx2fma3_supported() == TRUE)
         {
             bli_cdotv_zen_int5
             (
@@ -676,7 +722,7 @@ void cgemv_
         return;
     }
 
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         /* Call BLIS interface. */
         PASTEMAC2(c,gemv,BLIS_TAPI_EX_SUF)
@@ -733,9 +779,24 @@ void cgemv_
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 }
-
-
-void zgemv_
+#ifdef BLIS_ENABLE_BLAS /* cgemv_ wrapper: compiled only when BLIS_ENABLE_BLAS is defined. */
+void cgemv_
+     (
+       const f77_char* transa,
+       const f77_int*  m,
+       const f77_int*  n,
+       const scomplex* alpha,
+       const scomplex* a, const f77_int* lda,
+       const scomplex* x, const f77_int* incx,
+       const scomplex* beta,
+             scomplex* y, const f77_int* incy
+     )
+{ /* Forward every argument, unchanged, to cgemv_blis_impl. */
+  cgemv_blis_impl( transa, m, n, alpha, a, lda, 
+                        x, incx, beta, y, incy ); 
+}
+#endif /* BLIS_ENABLE_BLAS */
+void zgemv_blis_impl
      (
        const f77_char* transa,
        const f77_int*  m,
@@ -851,7 +912,7 @@ void zgemv_
         conj_t conja = bli_extract_conj(blis_transa);
         dcomplex rho;
 
-        if (bli_cpuid_is_avx_supported() == TRUE)
+        if (bli_cpuid_is_avx2fma3_supported() == TRUE)
         {
             bli_zdotv_zen_int5
             (
@@ -900,7 +961,7 @@ void zgemv_
         return;
     }
 
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         /* Call BLIS interface. */
         PASTEMAC2(z,gemv,BLIS_TAPI_EX_SUF)
@@ -957,7 +1018,22 @@ void zgemv_
 
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
 }
-
+#ifdef BLIS_ENABLE_BLAS /* zgemv_ wrapper: compiled only when BLIS_ENABLE_BLAS is defined. */
+void zgemv_
+     (
+       const f77_char* transa,
+       const f77_int*  m,
+       const f77_int*  n,
+       const dcomplex* alpha,
+       const dcomplex* a, const f77_int* lda,
+       const dcomplex* x, const f77_int* incx,
+       const dcomplex* beta,
+             dcomplex* y, const f77_int* incy
+     )
+{ /* Forward every argument, unchanged, to zgemv_blis_impl. */
+  zgemv_blis_impl( transa, m, n, alpha, a, lda, 
+                        x, incx, beta, y, incy ); 
+}
 
 
 #endif
diff --git a/frame/compat/bla_ger.c b/frame/compat/bla_ger.c
index b7613842ae..7051d06979 100644
--- a/frame/compat/bla_ger.c
+++ b/frame/compat/bla_ger.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjy, blasname, blisname ) \
 \
-void PASTEF772(ch,blasname,chc) \
+void PASTEF772S(ch,blasname,chc) \
      ( \
        const f77_int* m, \
        const f77_int* n, \
@@ -110,9 +110,23 @@ void PASTEF772(ch,blasname,chc) \
 \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF772(ch,blasname,chc) \
+     ( \
+       const f77_int* m, \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy, \
+             ftype*   a, const f77_int* lda  \
+     ) \
+{ \
+    PASTEF772S(ch,blasname,chc) \
+     ( m, n, alpha, x, incx, y, incy, a, lda ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCDOT_BLAS( ger, ger )
-#endif
 
diff --git a/frame/compat/bla_ger.h b/frame/compat/bla_ger.h
index a31548f610..d908e5e781 100644
--- a/frame/compat/bla_ger.h
+++ b/frame/compat/bla_ger.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,19 @@
 #undef  GENTPROTDOT
 #define GENTPROTDOT( ftype, chxy, chc, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
+     ( \
+       const f77_int* m, \
+       const f77_int* n, \
+       const ftype*   alpha, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy, \
+             ftype*   a, const f77_int* lda  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF772S(chxy,blasname,chc) \
      ( \
        const f77_int* m, \
        const f77_int* n, \
@@ -48,8 +61,6 @@ BLIS_EXPORT_BLAS void PASTEF772(chxy,blasname,chc) \
        const ftype*   y, const f77_int* incy, \
              ftype*   a, const f77_int* lda  \
      );
-
-#ifdef BLIS_ENABLE_BLAS
+     
 INSERT_GENTPROTDOT_BLAS( ger )
-#endif
 
diff --git a/frame/compat/bla_hemm.c b/frame/compat/bla_hemm.c
index 0e003012d2..c0af5fe0ba 100644
--- a/frame/compat/bla_hemm.c
+++ b/frame/compat/bla_hemm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -110,6 +110,25 @@ void PASTEF77(ch,blasname) \
     cs_b = *ldb; \
     rs_c = 1; \
     cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch, scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  BLIS_DENSE, \
+								  m0, \
+								  n0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
     /* Call BLIS interface. */ \
     PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -132,14 +151,31 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -201,6 +237,25 @@ void PASTEF77(ch,blasname) \
     const inc_t cs_b = *ldb; \
     const inc_t rs_c = 1; \
     const inc_t cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch, scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  BLIS_DENSE, \
+								  m0, \
+								  n0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
     const num_t   dt     = PASTEMAC(ch,type); \
 \
@@ -248,11 +303,26 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCCO_BLAS( hemm, hemm )
-#endif
 
diff --git a/frame/compat/bla_hemm.h b/frame/compat/bla_hemm.h
index 711877edee..8e07cbb04a 100644
--- a/frame/compat/bla_hemm.h
+++ b/frame/compat/bla_hemm.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,21 @@
 #undef  GENTPROTCO
 #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -52,7 +67,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTCO_BLAS( hemm )
-#endif
 
diff --git a/frame/compat/bla_hemv.c b/frame/compat/bla_hemv.c
index a722f3095d..8677004bca 100644
--- a/frame/compat/bla_hemv.c
+++ b/frame/compat/bla_hemv.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -113,9 +113,24 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+     ( uploa, m, alpha, a, lda, x, incx, beta, y, incy ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCCO_BLAS( hemv, hemv )
-#endif
 
diff --git a/frame/compat/bla_hemv.h b/frame/compat/bla_hemv.h
index 4e82301146..1ca91b3d32 100644
--- a/frame/compat/bla_hemv.h
+++ b/frame/compat/bla_hemv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,20 @@
 #undef  GENTPROTCO
 #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -49,8 +63,6 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
        const ftype*    beta, \
              ftype*    y, const f77_int* incy  \
      );
-
-#ifdef BLIS_ENABLE_BLAS
+     
 INSERT_GENTPROTCO_BLAS( hemv )
-#endif
 
diff --git a/frame/compat/bla_her.c b/frame/compat/bla_her.c
index abe0f1e372..6eda421645 100755
--- a/frame/compat/bla_her.c
+++ b/frame/compat/bla_her.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -103,9 +103,22 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype_r*  alpha, \
+       const ftype*    x, const f77_int* incx, \
+             ftype*    a, const f77_int* lda  \
+     ) \
+{\
+    PASTEF77S(ch,blasname) \
+    ( uploa, m, alpha, x, incx, a, lda ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCCO_BLAS( her, her )
-#endif
 
diff --git a/frame/compat/bla_her.h b/frame/compat/bla_her.h
index b9ae30d903..1f70fa5a89 100644
--- a/frame/compat/bla_her.h
+++ b/frame/compat/bla_her.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,18 @@
 #undef  GENTPROTCO
 #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype_r*  alpha, \
+       const ftype*    x, const f77_int* incx, \
+             ftype*    a, const f77_int* lda  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -47,8 +59,6 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
        const ftype*    x, const f77_int* incx, \
              ftype*    a, const f77_int* lda  \
      );
-
-#ifdef BLIS_ENABLE_BLAS
+     
 INSERT_GENTPROTCO_BLAS( her )
-#endif
 
diff --git a/frame/compat/bla_her2.c b/frame/compat/bla_her2.c
index ce65be0cb5..ebf9211588 100644
--- a/frame/compat/bla_her2.c
+++ b/frame/compat/bla_her2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -111,9 +111,23 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    y, const f77_int* incy, \
+             ftype*    a, const f77_int* lda  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+     ( uploa, m, alpha, x, incx, y, incy, a, lda ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCCO_BLAS( her2, her2 )
-#endif
 
diff --git a/frame/compat/bla_her2.h b/frame/compat/bla_her2.h
index 7cf0bb867c..1c11f26d3f 100644
--- a/frame/compat/bla_her2.h
+++ b/frame/compat/bla_her2.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,19 @@
 #undef  GENTPROTCO
 #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    y, const f77_int* incy, \
+             ftype*    a, const f77_int* lda  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -48,8 +61,6 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
        const ftype*    y, const f77_int* incy, \
              ftype*    a, const f77_int* lda  \
      );
-
-#ifdef BLIS_ENABLE_BLAS
+     
 INSERT_GENTPROTCO_BLAS( her2 )
-#endif
 
diff --git a/frame/compat/bla_her2k.c b/frame/compat/bla_her2k.c
index e21a2cda41..78456ee77f 100755
--- a/frame/compat/bla_her2k.c
+++ b/frame/compat/bla_her2k.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -116,6 +116,36 @@ void PASTEF77(ch,blasname) \
 	cs_b = *ldb; \
 	rs_c = 1; \
 	cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		ftype beta_complex; \
+		beta_complex.real = *beta; \
+		beta_complex.imag = 0.0; \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) &beta_complex, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		/* alpha is zero on this path, so no rank-2k product is formed and C
+		   has only been scaled by the real-valued beta. Scaling by a real
+		   beta does not clear imaginary parts already present on the
+		   diagonal of the caller's C, whereas a Hermitian matrix must have
+		   a real diagonal. Overwrite those imaginary parts with zero; *alpha
+		   is known to be zero here, so it is reused as the value to store.
+		*/ \
+		PASTEMAC2(ch,setid,_ex)( 0, m0, m0, (void*)alpha, c, rs_c, cs_c, NULL, NULL ); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -137,14 +167,31 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype_r*  beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -212,6 +259,36 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_b = *ldb; \
 	const inc_t rs_c = 1; \
 	const inc_t cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		ftype beta_complex; \
+		beta_complex.real = *beta; \
+		beta_complex.imag = 0.0; \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) &beta_complex, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		/* alpha is zero on this path, so no rank-2k product is formed and C
+		   has only been scaled by the real-valued beta. Scaling by a real
+		   beta does not clear imaginary parts already present on the
+		   diagonal of the caller's C, whereas a Hermitian matrix must have
+		   a real diagonal. Overwrite those imaginary parts with zero; *alpha
+		   is known to be zero here, so it is reused as the value to store.
+		*/ \
+		PASTEMAC2(ch,setid,_ex)( 0, m0, m0, (void*)alpha, c, rs_c, cs_c, NULL, NULL ); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	const num_t   dt_r   = PASTEMAC(chr,type); \
 	const num_t   dt     = PASTEMAC(ch,type); \
@@ -258,11 +335,26 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype_r*  beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCCO_BLAS( her2k, her2k )
-#endif
 
diff --git a/frame/compat/bla_her2k.h b/frame/compat/bla_her2k.h
index c771f78d4c..6a6f9ad5c7 100644
--- a/frame/compat/bla_her2k.h
+++ b/frame/compat/bla_her2k.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,21 @@
 #undef  GENTPROTCO
 #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype_r*  beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -52,7 +67,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTCO_BLAS( her2k )
-#endif
 
diff --git a/frame/compat/bla_herk.c b/frame/compat/bla_herk.c
index 36188e6a66..9678ec4845 100755
--- a/frame/compat/bla_herk.c
+++ b/frame/compat/bla_herk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -112,6 +112,36 @@ void PASTEF77(ch,blasname) \
 	cs_a = *lda; \
 	rs_c = 1; \
 	cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(chr,eq0)( *alpha ) ) \
+	{ \
+		ftype beta_complex; \
+		beta_complex.real = *beta; \
+		beta_complex.imag = 0.0; \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) &beta_complex, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		/* alpha is zero on this path, so no rank-k product is formed and C
+		   has only been scaled by the real-valued beta. Scaling by a real
+		   beta does not clear imaginary parts already present on the
+		   diagonal of the caller's C, whereas a Hermitian matrix must have
+		   a real diagonal. Overwrite those imaginary parts with zero; the
+		   real *alpha is known to be zero here, so it is reused as the value.
+		*/ \
+		PASTEMAC2(ch,setid,_ex)( 0, m0, m0, (void*)alpha, c, rs_c, cs_c, NULL, NULL ); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -131,14 +161,30 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype_r*  alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype_r*  beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNCCO
 #define GENTFUNCCO( ftype, ftype_r, ch, chr, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -206,6 +252,36 @@ void PASTEF77(ch,blasname) \
 \
 	const num_t   dt_r   = PASTEMAC(chr,type); \
 	const num_t   dt     = PASTEMAC(ch,type); \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(chr,eq0)( *alpha ) ) \
+	{ \
+		ftype beta_complex; \
+		beta_complex.real = *beta; \
+		beta_complex.imag = 0.0; \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) &beta_complex, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		/* alpha is zero on this path, so no rank-k product is formed and C
+		   has only been scaled by the real-valued beta. Scaling by a real
+		   beta does not clear imaginary parts already present on the
+		   diagonal of the caller's C, whereas a Hermitian matrix must have
+		   a real diagonal. Overwrite those imaginary parts with zero; the
+		   real *alpha is known to be zero here, so it is reused as the value.
+		*/ \
+		PASTEMAC2(ch,setid,_ex)( 0, m0, m0, (void*)alpha, c, rs_c, cs_c, NULL, NULL ); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	const struc_t strucc = BLIS_HERMITIAN; \
 \
@@ -242,11 +318,25 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype_r*  alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype_r*  beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCCO_BLAS( herk, herk )
-#endif
 
diff --git a/frame/compat/bla_herk.h b/frame/compat/bla_herk.h
index e649a74abb..6085e769ce 100644
--- a/frame/compat/bla_herk.h
+++ b/frame/compat/bla_herk.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,20 @@
 #undef  GENTPROTCO
 #define GENTPROTCO( ftype, ftype_r, ch, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype_r*  alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype_r*  beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -51,7 +65,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTCO_BLAS( herk )
-#endif
 
diff --git a/frame/compat/bla_imatcopy.c b/frame/compat/bla_imatcopy.c
index dd07bb5314..94d2ae407b 100644
--- a/frame/compat/bla_imatcopy.c
+++ b/frame/compat/bla_imatcopy.c
@@ -641,4 +641,4 @@ static dim_t bli_ziMatCopy_cr(dim_t rows,dim_t cols,const dcomplex alpha,dcomple
 return(0);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_imatcopy.h b/frame/compat/bla_imatcopy.h
index 4b29a212c2..f2da1e39f3 100644
--- a/frame/compat/bla_imatcopy.h
+++ b/frame/compat/bla_imatcopy.h
@@ -46,4 +46,4 @@ BLIS_EXPORT_BLAS void cimatcopy_ (f77_char* trans, f77_int* rows, f77_int* cols,
 
 BLIS_EXPORT_BLAS void zimatcopy_ (f77_char* trans, f77_int* rows, f77_int* cols, const dcomplex* alpha,dcomplex* aptr, f77_int* lda, f77_int* ldb);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_nrm2.c b/frame/compat/bla_nrm2.c
index 576d9eda8c..db866a83ca 100755
--- a/frame/compat/bla_nrm2.c
+++ b/frame/compat/bla_nrm2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-ftype_r PASTEF772(chr,chx,blasname) \
+ftype_r PASTEF772S(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
@@ -80,9 +80,18 @@ ftype_r PASTEF772(chr,chx,blasname) \
 	bli_finalize_auto(); \
 \
 	return norm; \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+ftype_r PASTEF772(chr,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  return PASTEF772S(chr,chx,blasname)( n, x, incx );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCR2_BLAS( nrm2, normfv )
-#endif
 
diff --git a/frame/compat/bla_nrm2.h b/frame/compat/bla_nrm2.h
index a8bc25ef48..38524c9487 100644
--- a/frame/compat/bla_nrm2.h
+++ b/frame/compat/bla_nrm2.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,13 +40,19 @@
 #undef  GENTPROTR2
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS ftype_r PASTEF772(chr,chx,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS ftype_r PASTEF772S(chr,chx,blasname) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTR2_BLAS( nrm2 )
-#endif
 
diff --git a/frame/compat/bla_omatadd.c b/frame/compat/bla_omatadd.c
index 2ae7ee5ef7..b34f4965d3 100644
--- a/frame/compat/bla_omatadd.c
+++ b/frame/compat/bla_omatadd.c
@@ -507,4 +507,4 @@ static dim_t bli_zoMatAdd_cn(dim_t rows,dim_t cols,const dcomplex alpha,dcomplex
  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2);
  return(0);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_omatcopy.c b/frame/compat/bla_omatcopy.c
index 339ecdb25a..80a9650565 100644
--- a/frame/compat/bla_omatcopy.c
+++ b/frame/compat/bla_omatcopy.c
@@ -934,4 +934,4 @@ static dim_t bli_zoMatCopy_cc(dim_t rows,dim_t cols,const dcomplex alpha,const d
  return(0);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_omatcopy.h b/frame/compat/bla_omatcopy.h
index e3c5485003..fc2161d638 100644
--- a/frame/compat/bla_omatcopy.h
+++ b/frame/compat/bla_omatcopy.h
@@ -46,4 +46,4 @@ BLIS_EXPORT_BLAS void comatcopy_ (f77_char* trans, f77_int* rows, f77_int* cols,
 
 BLIS_EXPORT_BLAS void zomatcopy_ (f77_char* trans, f77_int* rows, f77_int* cols, const dcomplex* alpha, const dcomplex* aptr, f77_int* lda, dcomplex* bptr, f77_int* ldb);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_omatcopy2.c b/frame/compat/bla_omatcopy2.c
index 4aaf3bf5b7..d5ab82531f 100644
--- a/frame/compat/bla_omatcopy2.c
+++ b/frame/compat/bla_omatcopy2.c
@@ -939,4 +939,4 @@ static dim_t bli_zoMatCopy2_cc(dim_t rows,dim_t cols,const dcomplex alpha,const
  return(0);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_omatcopy2.h b/frame/compat/bla_omatcopy2.h
index a35f0aefb0..f4dc9621f4 100644
--- a/frame/compat/bla_omatcopy2.h
+++ b/frame/compat/bla_omatcopy2.h
@@ -47,4 +47,4 @@ BLIS_EXPORT_BLAS void comatcopy2_ (f77_char* trans, f77_int* rows, f77_int* cols
 
 BLIS_EXPORT_BLAS void zomatcopy2_ (f77_char* trans, f77_int* rows, f77_int* cols, const dcomplex* alpha, const dcomplex* aptr, f77_int* lda,f77_int* stridea, dcomplex* bptr, f77_int* ldb,f77_int* strideb);
 
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/bla_scal.c b/frame/compat/bla_scal.c
index b9651577eb..1c854a6e57 100644
--- a/frame/compat/bla_scal.c
+++ b/frame/compat/bla_scal.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCSCAL
 #define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
 \
-void PASTEF772(chx,cha,blasname) \
+void PASTEF772S(chx,cha,blasname) \
      ( \
        const f77_int* n, \
        const ftype_a* alpha, \
@@ -90,8 +90,17 @@ void PASTEF772(chx,cha,blasname) \
   AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+}\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF772(chx,cha,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_a* alpha, \
+       ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  PASTEF772S(chx,cha,blasname)( n, alpha, x, incx ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCSCAL_BLAS( scal, scalv )
-#endif
diff --git a/frame/compat/bla_scal.h b/frame/compat/bla_scal.h
index c8e898b6ba..8e891c8218 100644
--- a/frame/compat/bla_scal.h
+++ b/frame/compat/bla_scal.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,14 +40,21 @@
 #undef  GENTPROTSCAL
 #define GENTPROTSCAL( ftype_a, ftype_x, cha, chx, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF772(chx,cha,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_a* alpha, \
+       ftype_x* x, const f77_int* incx  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF772S(chx,cha,blasname) \
      ( \
        const f77_int* n, \
        const ftype_a* alpha, \
        ftype_x* x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTSCAL_BLAS( scal )
-#endif
 
diff --git a/frame/compat/bla_scal_amd.c b/frame/compat/bla_scal_amd.c
index 178776a149..041c1b6a87 100644
--- a/frame/compat/bla_scal_amd.c
+++ b/frame/compat/bla_scal_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,6 +35,22 @@
 
 #include "blis.h"
 
+/*
+  Early return conditions
+  ------------------------
+
+  1. When n <= 0 where n is the length of the vector passed
+  2. When incx <= 0 where incx is the storage spacing between elements of
+     the vector passed
+  3. When alpha == 1 where alpha is the scalar value by which the vector is
+     to be scaled
+
+  NaN propagation expectation
+  --------------------------
+
+  1. When alpha == NaN - Propagate the NaN to the vector
+  2. When alpha == 0 - Perform the SCALV operation completely and don't use setv.
+*/
 
 //
 // Define BLAS-to-BLIS interfaces.
@@ -42,7 +58,7 @@
 #undef  GENTFUNCSCAL
 #define GENTFUNCSCAL( ftype_x, ftype_a, chx, cha, blasname, blisname ) \
 \
-void PASTEF772(chx,cha,blasname) \
+void PASTEF772S(chx,cha,blasname) \
      ( \
        const f77_int* n, \
        const ftype_a* alpha, \
@@ -90,11 +106,20 @@ void PASTEF772(chx,cha,blasname) \
   AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
-
-#ifdef BLIS_ENABLE_BLAS
+}\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF772(chx,cha,blasname) \
+     ( \
+       const f77_int* n, \
+       const ftype_a* alpha, \
+       ftype_x* x, const f77_int* incx  \
+     ) \
+{ \
+  PASTEF772S(chx,cha,blasname)( n, alpha, x, incx ); \
+} \
+)
 
-void sscal_
+void sscal_blis_impl
      (
        const f77_int* n,
        const float* alpha,
@@ -109,10 +134,11 @@ void sscal_
     /* Initialize BLIS. */
     //bli_init_auto();
 
-	if (*n == 0 || alpha == NULL) {
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-		return;
-	}
+    if ((*n) <= 0 || alpha == NULL || bli_seq1(*alpha))
+    {
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+      return;
+    }
 
     /* Convert/typecast negative values of n to zero. */
     if ( *n < 0 ) n0 = ( dim_t )0;
@@ -145,36 +171,75 @@ void sscal_
         incx0 = ( inc_t )(*incx);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
-	    bli_sscalv_zen_int10
-		    (
-		     BLIS_NO_CONJUGATE,
-		     n0,
-		     (float *)alpha,
-		     x0, incx0,
-		     NULL
-		    );
+    /*
+      According to the BLAS definition, return early when incx <= 0
+    */
+    if (incx0 <= 0)
+    {
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        return;
     }
-    else{
-	    PASTEMAC2(s,scalv,BLIS_TAPI_EX_SUF) \
-		    ( \
-		      BLIS_NO_CONJUGATE,\
-		      n0, \
-		      (float *)alpha,\
-		      x0, incx0,\
-		      NULL, \
-		      NULL  \
-		    );\
+
+    cntx_t *cntx = NULL;
+
+    // Query the architecture ID
+    arch_t id = bli_arch_query_id();
+
+    /*
+      Function pointer declaration for the function
+      that will be used by this API
+    */
+    sscalv_ker_ft scalv_ker_ptr; // DSCALV
+
+    // Pick the kernel based on the architecture ID
+    switch (id)
+    {
+      case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+        scalv_ker_ptr = bli_sscalv_zen_int_avx512;
+
+        break;
+#endif
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+        scalv_ker_ptr = bli_sscalv_zen_int10;
+
+        break;
+      default:
+
+        // For non-Zen architectures, query the context
+        cntx = bli_gks_query_cntx();
+
+        // Query the context for the kernel function pointers for sscalv
+        scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_FLOAT, BLIS_SCALV_KER, cntx);
     }
 
+    scalv_ker_ptr
+    (
+      BLIS_NO_CONJUGATE,
+      n0,
+      (float *)alpha,
+      x0, incx0,
+      cntx
+    );
+
     /* Finalize BLIS. */
-//    bli_finalize_auto();
+    //    bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
 }
-
-void dscal_
+#ifdef BLIS_ENABLE_BLAS /* sscal_ is compiled only when BLIS_ENABLE_BLAS is defined */
+void sscal_
+     (
+       const f77_int* n,
+       const float* alpha,
+       float*   x, const f77_int* incx
+     )
+{
+  sscal_blis_impl( n, alpha, x, incx ); /* forwards every argument unchanged */
+}
+#endif /* BLIS_ENABLE_BLAS */
+void dscal_blis_impl
      (
        const f77_int* n,
        const double* alpha,
@@ -183,24 +248,31 @@ void dscal_
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
     AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'D', (void *)alpha, *n, *incx );
-    dim_t  n0;
+    dim_t  n_elem;
     double* x0;
     inc_t  incx0;
 
     /* Initialize BLIS  */
     //bli_init_auto();
 
-	if (*n == 0 || alpha == NULL) {
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
-		return;
-	}
-
     /* Convert typecast negative values of n to zero. */
-    if ( *n < 0 ) n0 = ( dim_t )0;
-    else              n0 = ( dim_t )(*n);
+    if ( *n < 0 ) n_elem = ( dim_t )0;
+    else          n_elem = ( dim_t )(*n);
+
+    /*
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
+    */
+    if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha) || (*incx) <= 0)
+    {
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+        return;
+    }
 
     /* If the input increments are negative, adjust the pointers so we can
-       use positive increments instead. */
+       use positive increments instead.
+       * This check is redundant and can be safely removed
+       */
     if ( *incx < 0 )
     {
         /* The semantics of negative stride in BLAS are that the vector
@@ -216,7 +288,7 @@ void dscal_
         pass in the address to the (n-1)th (i.e., the bottom-most or
         right-most) element along with a negative stride. */
 
-        x0    = (x) + (n0-1)*(-*incx);
+        x0    = (x) + (n_elem-1)*(-*incx);
         incx0 = ( inc_t )(*incx);
 
     }
@@ -226,35 +298,432 @@ void dscal_
         incx0 = ( inc_t )(*incx);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE){
-	    bli_dscalv_zen_int10
-		    (
-		     BLIS_NO_CONJUGATE,
-		     n0,
-		     (double*) alpha,
-		     x0, incx0,
-		     NULL
-		    );
+     // Definition of function pointer
+    dscalv_ker_ft scalv_ker_ptr;
+
+    cntx_t *cntx = NULL;
+
+    // Query the architecture ID
+    arch_t arch_id_local = bli_arch_query_id();
+
+    // Pick the kernel based on the architecture ID
+    switch (arch_id_local)
+    {
+      case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+        scalv_ker_ptr = bli_dscalv_zen_int_avx512;
+
+        break;
+#endif
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+
+          // AVX2 Kernel
+          scalv_ker_ptr = bli_dscalv_zen_int10;
+          break;
+
+      default:
+
+          // Query the context
+          cntx = bli_gks_query_cntx();
+
+          // Query the function pointer using the context
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DOUBLE, BLIS_SCALV_KER, cntx);
     }
-    else{
-	    PASTEMAC2(d,scalv,BLIS_TAPI_EX_SUF) \
-		    ( \
-		      BLIS_NO_CONJUGATE,\
-		      n0, \
-		      (double *)alpha,\
-		      x0, incx0,\
-		      NULL, \
-		      NULL  \
-		    );\
+
+#ifdef BLIS_ENABLE_OPENMP
+    /*
+      Initializing the number of thread to one
+      to avoid compiler warnings
+    */
+    dim_t nt = 1;
+
+    /*
+      For the given problem size and architecture, the function
+      returns the optimum number of threads with AOCL dynamic enabled
+      else it returns the number of threads requested by the user.
+    */
+    bli_nthreads_l1
+    (
+      BLIS_SCALV_KER,
+      BLIS_DOUBLE,
+      BLIS_DOUBLE,
+      arch_id_local,
+      n_elem,
+      &nt
+    );
+
+    /*
+      If the number of optimum threads is 1, the OpenMP overhead
+      is avoided by calling the function directly
+    */
+    if (nt == 1)
+    {
+#endif
+        scalv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          n_elem,
+          (double *)alpha,
+          x0, incx0,
+          cntx
+        );
+
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+
+        return;
+#ifdef BLIS_ENABLE_OPENMP
     }
 
+    _Pragma("omp parallel num_threads(nt)")
+    {
+        dim_t start, length;
+
+        // Get the thread ID
+        dim_t thread_id = omp_get_thread_num();
+
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
+        bli_thread_vector_partition
+        (
+          n_elem,
+          nt_use,
+          &start, &length,
+          thread_id
+        );
+
+        // Adjust the local pointer for computation
+        double *x_thread_local = x0 + (start * incx0);
+
+        // Invoke the function based on the kernel function pointer
+        scalv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          length,
+          (double *)alpha,
+          x_thread_local, incx0,
+          cntx
+        );
+    }
+#endif
+
+
     /* Finalize BLIS. */
-//    bli_finalize_auto();
+    // bli_finalize_auto();
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+}
+#ifdef BLIS_ENABLE_BLAS /* dscal_ is compiled only when BLIS_ENABLE_BLAS is defined */
+void dscal_
+     (
+       const f77_int* n,
+       const double* alpha,
+       double*   x, const f77_int* incx
+     )
+{
+  dscal_blis_impl( n, alpha, x, incx ); /* forwards every argument unchanged */
+}
+#endif /* BLIS_ENABLE_BLAS */
+/* ZDSCAL implementation: scales the dcomplex vector x (n elements, stride
+   incx) by the real scalar *alpha, using an architecture-selected kernel
+   and, when built with OpenMP, possibly several threads. */
+void zdscal_blis_impl
+     ( const f77_int* n, const double* alpha,
+       dcomplex*   x, const f77_int* incx )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
+    AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *) alpha, *n, *incx );
+    dim_t  n_elem;
+    dcomplex* x0;
+    inc_t  incx0;
+    /* Initialize BLIS. */
+    //bli_init_auto();
+
+    /* Convert/typecast negative values of n to zero. */
+    if ( *n < 0 ) n_elem = ( dim_t )0;
+    else          n_elem = ( dim_t )(*n);
+
+    /*
+      Return early when n <= 0 or incx <= 0 or alpha == 1.0 - BLAS exception
+      Return early when alpha pointer is NULL - BLIS exception
+    */
+    if ((*n) <= 0 || alpha == NULL || bli_deq1(*alpha) || (*incx) <= 0)
+    {
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+      return;
+    }
+
+    /* incx > 0 is guaranteed by the guard above, so the negative-increment
+       branch below is never taken; it is retained as a defensive fallback. */
+    if ( *incx < 0 )
+    {
+        /* The semantics of negative stride in BLAS are that the vector
+        operand be traversed in reverse order. (Another way to think
+        of this is that negative strides effectively reverse the order
+        of the vector, but without any explicit data movements.) This
+        is also how BLIS interprets negative strides. The differences
+        is that with BLAS, the caller *always* passes in the 0th (i.e.,
+        top-most or left-most) element of the vector, even when the
+        stride is negative. By contrast, in BLIS, negative strides are
+        used *relative* to the vector address as it is given. Thus, in
+        BLIS, if this backwards traversal is desired, the caller *must*
+        pass in the address to the (n-1)th (i.e., the bottom-most or
+        right-most) element along with a negative stride. */
+
+        x0    = (x) + (n_elem-1)*(-*incx);
+        incx0 = ( inc_t )(*incx);
+    }
+    else
+    {
+        x0    = (x);
+        incx0 = ( inc_t )(*incx);
+    }
+
+    /* Promote the real alpha to a dcomplex with a zero imaginary part. */
+    dcomplex  alpha_cast;
+    alpha_cast.real = *alpha;
+    alpha_cast.imag = 0.0;
+
+    // Definition of function pointer
+    zscalv_ker_ft scalv_ker_ptr;
+
+    cntx_t *cntx = NULL;
+
+    // Query the architecture ID
+    arch_t arch_id_local = bli_arch_query_id();
+
+    // Pick the kernel based on the architecture ID
+    switch (arch_id_local)
+    {
+      case BLIS_ARCH_ZEN4:
+      case BLIS_ARCH_ZEN:
+      case BLIS_ARCH_ZEN2:
+      case BLIS_ARCH_ZEN3:
+
+          // AVX2 Kernel
+          scalv_ker_ptr = bli_zdscalv_zen_int10;
+          break;
+
+      default:
+
+          // Query the context
+          cntx = bli_gks_query_cntx();
+
+          // Query the function pointer using the context
+          scalv_ker_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
+    }
+
+#ifdef BLIS_ENABLE_OPENMP
+
+    /*
+      Initializing the number of thread to one
+      to avoid compiler warnings
+    */
+    dim_t nt = 1;
+
+    /*
+      For the given problem size and architecture, the function
+      returns the optimum number of threads with AOCL dynamic enabled
+      else it returns the number of threads requested by the user.
+    */
+    bli_nthreads_l1
+    (
+      BLIS_SCALV_KER,
+      BLIS_DCOMPLEX,
+      BLIS_DOUBLE,
+      arch_id_local,
+      n_elem,
+      &nt
+    );
+
+    /*
+      If the number of optimum threads is 1, the OpenMP overhead
+      is avoided by calling the function directly
+    */
+    if (nt == 1)
+    {
+#endif
+        scalv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          n_elem,
+          (dcomplex *)&alpha_cast,
+          x0, incx0,
+          cntx
+        );
+
+        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+
+        return;
+#ifdef BLIS_ENABLE_OPENMP
+    }
+
+    _Pragma("omp parallel num_threads(nt)")
+    {
+        dim_t start, length;
+
+        // Get the thread ID
+        dim_t thread_id = omp_get_thread_num();
+
+        // Get the actual number of threads spawned
+        dim_t nt_use = omp_get_num_threads();
+
+        /*
+          Calculate the compute range for the current thread
+          based on the actual number of threads spawned
+        */
+        bli_thread_vector_partition
+        (
+          n_elem,
+          nt_use,
+          &start, &length,
+          thread_id
+        );
+
+        // Adjust the local pointer for computation
+        dcomplex *x_thread_local = x0 + (start * incx0);
+
+        // Invoke the function based on the kernel function pointer
+        scalv_ker_ptr
+        (
+          BLIS_NO_CONJUGATE,
+          length,
+          (dcomplex *)&alpha_cast,
+          x_thread_local, incx0,
+          cntx
+        );
+    }
+#endif
+
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+}
+#ifdef BLIS_ENABLE_BLAS /* zdscal_ is compiled only when BLIS_ENABLE_BLAS is defined */
+void zdscal_
+     (
+       const f77_int* n,
+       const double* alpha,
+       dcomplex*   x, const f77_int* incx
+     )
+{
+    zdscal_blis_impl( n, alpha, x, incx ); /* forwards every argument unchanged */
+}
+#endif /* BLIS_ENABLE_BLAS */
 
-INSERT_GENTFUNCSCAL_BLAS_CZ( scal, scalv )
+/* ZSCAL implementation: scales the dcomplex vector x (n elements, stride
+   incx) by the dcomplex scalar *alpha using a kernel selected from the
+   architecture id (or from the queried context for other architectures). */
+void zscal_blis_impl
+     ( const f77_int* n, const dcomplex* alpha,
+       dcomplex*   x, const f77_int* incx )
+{
+  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_1)
+  AOCL_DTL_LOG_SCAL_INPUTS(AOCL_DTL_LEVEL_TRACE_1, 'Z', (void *)alpha, *n, *incx);
+  dim_t n0;
+  dcomplex *x0;
+  inc_t incx0;
 
+  // When n is not positive or the alpha pointer passed is null, return early
+  if ((*n <= 0) || (alpha == NULL))
+  {
+      AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+      return;
+  }
+
+  /* n is strictly positive here, so it can be converted directly. */
+  n0 = (dim_t)(*n);
+
+  /* If the input increments are negative, adjust the pointers so we can
+    use positive increments instead. */
+  if (*incx < 0)
+  {
+    /* The semantics of negative stride in BLAS are that the vector
+    operand be traversed in reverse order. (Another way to think
+    of this is that negative strides effectively reverse the order
+    of the vector, but without any explicit data movements.) This
+    is also how BLIS interprets negative strides. The differences
+    is that with BLAS, the caller *always* passes in the 0th (i.e.,
+    top-most or left-most) element of the vector, even when the
+    stride is negative. By contrast, in BLIS, negative strides are
+    used *relative* to the vector address as it is given. Thus, in
+    BLIS, if this backwards traversal is desired, the caller *must*
+    pass in the address to the (n-1)th (i.e., the bottom-most or
+    right-most) element along with a negative stride. */
+
+    x0 = (x) + (n0 - 1) * (-*incx);
+    incx0 = (inc_t)(*incx);
+  }
+  else
+  {
+    x0 = (x);
+    incx0 = (inc_t)(*incx);
+  }
+
+  /* If the incx is zero, return early (closing the trace opened on entry). */
+  if (bli_zero_dim1(incx0))
+  {
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1);
+    return;
+  }
+
+  // Definition of function pointer
+  zscalv_ker_ft scalv_fun_ptr;
+
+  cntx_t* cntx = NULL;
+
+  // Query the architecture ID
+  arch_t id = bli_arch_query_id();
+
+  // Pick the kernel based on the architecture ID
+  switch (id)
+  {
+  case BLIS_ARCH_ZEN4:
+  case BLIS_ARCH_ZEN:
+  case BLIS_ARCH_ZEN2:
+  case BLIS_ARCH_ZEN3:
+
+    // AVX2 Kernel
+    scalv_fun_ptr = bli_zscalv_zen_int;
+    break;
+
+  default:
+
+    // Query the context
+    cntx = bli_gks_query_cntx();
+
+    // Query the function pointer using the context
+    scalv_fun_ptr = bli_cntx_get_l1v_ker_dt(BLIS_DCOMPLEX, BLIS_SCALV_KER, cntx);
+  }
+
+  /* The expectation is that the early return for alpha having real part 1
+  and imaginary part 0 is handled inside the compute kernel called */
+
+  // Call the function based on the function pointer assigned above
+  scalv_fun_ptr
+  (
+    BLIS_NO_CONJUGATE,
+    n0,
+    (dcomplex*) alpha,
+    x0, incx0,
+    cntx
+  );
+
+  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+}
+#ifdef BLIS_ENABLE_BLAS /* zscal_ is compiled only when BLIS_ENABLE_BLAS is defined */
+void zscal_
+     (
+       const f77_int* n,
+       const dcomplex* alpha,
+       dcomplex*   x, const f77_int* incx
+     )
+{
+    zscal_blis_impl(n, alpha, x, incx); /* forwards every argument unchanged */
+}
 #endif
+
+INSERT_GENTFUNCSCAL_BLAS_C( scal, scalv )
+
diff --git a/frame/compat/bla_swap.c b/frame/compat/bla_swap.c
index d653426478..e4c4142f06 100644
--- a/frame/compat/bla_swap.c
+++ b/frame/compat/bla_swap.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        ftype*   x, const f77_int* incx, \
@@ -80,8 +80,18 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       ftype*   x, const f77_int* incx, \
+       ftype*   y, const f77_int* incy  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) (n, x, incx, y, incy); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( swap, swapv )
-#endif
diff --git a/frame/compat/bla_swap.h b/frame/compat/bla_swap.h
index 54c0613a92..669c00955d 100644
--- a/frame/compat/bla_swap.h
+++ b/frame/compat/bla_swap.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,14 +40,21 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_int* n, \
+       ftype*   x, const f77_int* incx, \
+       ftype*   y, const f77_int* incy \
+     ); \
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        ftype*   x, const f77_int* incx, \
        ftype*   y, const f77_int* incy \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( swap )
-#endif
 
diff --git a/frame/compat/bla_swap_amd.c b/frame/compat/bla_swap_amd.c
index 617c78a4aa..d85d50b915 100644
--- a/frame/compat/bla_swap_amd.c
+++ b/frame/compat/bla_swap_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_int* n, \
        ftype*   x, const f77_int* incx, \
@@ -80,11 +80,21 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
-
-#ifdef BLIS_ENABLE_BLAS
+} \
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+	( \
+		const f77_int* n, \
+		ftype*   x, const f77_int* incx, \
+		ftype*   y, const f77_int* incy  \
+		) \
+{ \
+   PASTEF77S(ch,blasname)( n, x, incx, y, incy ); \
+}\
+)
 
-void sswap_
+void sswap_blis_impl
      (
        const f77_int* n,
        float*   x, const f77_int* incx,
@@ -145,9 +155,9 @@ void sswap_
         incy0 = ( inc_t )(*incy);
     }
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
         /* Call BLIS kernel */
 	    bli_sswapv_zen_int8
 		    (
@@ -172,8 +182,18 @@ void sswap_
 //    bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
 }
-
-void dswap_
+#ifdef BLIS_ENABLE_BLAS /* sswap_ is compiled only when BLIS_ENABLE_BLAS is defined */
+void sswap_
+     (
+       const f77_int* n,
+       float*   x, const f77_int* incx,
+       float*   y, const f77_int* incy
+     )
+{
+    sswap_blis_impl( n, x, incx, y, incy ); /* forwards every argument unchanged */
+}
+#endif /* BLIS_ENABLE_BLAS */
+void dswap_blis_impl
      (
        const f77_int* n,
        double*   x, const f77_int* incx,
@@ -235,9 +255,9 @@ void dswap_
     }
 
 
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE) {
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE) {
 	    bli_dswapv_zen_int8
 		    (
 		     n0,
@@ -261,8 +281,17 @@ void dswap_
 //    bli_finalize_auto();
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
 }
+#ifdef BLIS_ENABLE_BLAS /* dswap_ is compiled only when BLIS_ENABLE_BLAS is defined */
+void dswap_
+     (
+       const f77_int* n,
+       double*   x, const f77_int* incx,
+       double*   y, const f77_int* incy
+     )
+{
+    dswap_blis_impl( n, x, incx, y, incy ); /* thin wrapper: forward every argument unchanged */
+}
+#endif /* BLIS_ENABLE_BLAS */
 
 INSERT_GENTFUNC_BLAS_CZ( swap, swapv )
 
-
-#endif
diff --git a/frame/compat/bla_symm.c b/frame/compat/bla_symm.c
index 85aebb435f..f171c495fe 100755
--- a/frame/compat/bla_symm.c
+++ b/frame/compat/bla_symm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -109,6 +109,25 @@ void PASTEF77(ch,blasname) \
 	cs_b = *ldb; \
 	rs_c = 1; \
 	cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  BLIS_DENSE, \
+								  m0, \
+								  n0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -131,14 +150,31 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -199,6 +235,25 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_b = *ldb; \
 	const inc_t rs_c = 1; \
 	const inc_t cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  BLIS_DENSE, \
+								  m0, \
+								  n0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	const num_t   dt     = PASTEMAC(ch,type); \
 \
@@ -246,11 +301,26 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( side, uploa, m, n, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( symm, symm )
-#endif
 
diff --git a/frame/compat/bla_symm.h b/frame/compat/bla_symm.h
index b186e4b436..da87389471 100644
--- a/frame/compat/bla_symm.h
+++ b/frame/compat/bla_symm.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,21 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -52,7 +67,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( symm )
-#endif
 
diff --git a/frame/compat/bla_symv.c b/frame/compat/bla_symv.c
index c105be329e..0ecc121646 100755
--- a/frame/compat/bla_symv.c
+++ b/frame/compat/bla_symv.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCRO
 #define GENTFUNCRO( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -112,9 +112,24 @@ void PASTEF77(ch,blasname) \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	/* Finalize BLIS. */ \
 	bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+     ( uploa, m, alpha, a, lda, x, incx, beta, y, incy ); \
+}\
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCRO_BLAS( symv, symv )
-#endif
 
diff --git a/frame/compat/bla_symv.h b/frame/compat/bla_symv.h
index 9d1662fadf..6c868f06f9 100644
--- a/frame/compat/bla_symv.h
+++ b/frame/compat/bla_symv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,20 @@
 #undef  GENTPROTRO
 #define GENTPROTRO( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    beta, \
+             ftype*    y, const f77_int* incy  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -50,7 +64,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    y, const f77_int* incy  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTRO_BLAS( symv )
-#endif
 
diff --git a/frame/compat/bla_syr.c b/frame/compat/bla_syr.c
index 55251ea254..166870fa1a 100644
--- a/frame/compat/bla_syr.c
+++ b/frame/compat/bla_syr.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCRO
 #define GENTFUNCRO( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -104,9 +104,22 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    x, const f77_int* incx, \
+             ftype*    a, const f77_int* lda  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+     ( uploa, m, alpha, x, incx, a, lda ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCRO_BLAS( syr, syr )
-#endif
 
diff --git a/frame/compat/bla_syr.h b/frame/compat/bla_syr.h
index 0d2a1e0314..799df552d6 100644
--- a/frame/compat/bla_syr.h
+++ b/frame/compat/bla_syr.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,18 @@
 #undef  GENTPROTRO
 #define GENTPROTRO( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    x, const f77_int* incx, \
+             ftype*    a, const f77_int* lda  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -48,7 +60,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    a, const f77_int* lda  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTRO_BLAS( syr )
-#endif
 
diff --git a/frame/compat/bla_syr2.c b/frame/compat/bla_syr2.c
index 047dc64f9c..ece34f1404 100644
--- a/frame/compat/bla_syr2.c
+++ b/frame/compat/bla_syr2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNCRO
 #define GENTFUNCRO( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -112,8 +112,22 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    y, const f77_int* incy, \
+             ftype*    a, const f77_int* lda  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+     ( uploa, m, alpha, x, incx, y, incy, a, lda ); \
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNCRO_BLAS( syr2, syr2 )
-#endif
diff --git a/frame/compat/bla_syr2.h b/frame/compat/bla_syr2.h
index b458767941..730dd193ed 100644
--- a/frame/compat/bla_syr2.h
+++ b/frame/compat/bla_syr2.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,19 @@
 #undef  GENTPROTRO
 #define GENTPROTRO( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_int*  m, \
+       const ftype*    alpha, \
+       const ftype*    x, const f77_int* incx, \
+       const ftype*    y, const f77_int* incy, \
+             ftype*    a, const f77_int* lda  \
+     );\
+)\
+\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_int*  m, \
@@ -49,7 +62,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    a, const f77_int* lda  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROTRO_BLAS( syr2 )
-#endif
 
diff --git a/frame/compat/bla_syr2k.c b/frame/compat/bla_syr2k.c
index 6a4f31b969..0cf6367537 100644
--- a/frame/compat/bla_syr2k.c
+++ b/frame/compat/bla_syr2k.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -118,6 +118,26 @@ void PASTEF77(ch,blasname) \
 	cs_b = *ldb; \
 	rs_c = 1; \
 	cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, \
+								  NULL \
+								); \
+		/* Finalize BLIS. */ \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -139,14 +159,31 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -217,6 +254,26 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_b = *ldb; \
 	const inc_t rs_c = 1; \
 	const inc_t cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, \
+								  NULL \
+								); \
+		/* Finalize BLIS. */ \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	const num_t   dt     = PASTEMAC(ch,type); \
 \
@@ -262,11 +319,26 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, b, ldb, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( syr2k, syr2k )
-#endif
 
diff --git a/frame/compat/bla_syr2k.h b/frame/compat/bla_syr2k.h
index 91d9a3acf8..b710c6177b 100644
--- a/frame/compat/bla_syr2k.h
+++ b/frame/compat/bla_syr2k.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,21 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    b, const f77_int* ldb, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -52,7 +67,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( syr2k )
-#endif
 
diff --git a/frame/compat/bla_syrk.c b/frame/compat/bla_syrk.c
index 376b23aec9..dc93422146 100644
--- a/frame/compat/bla_syrk.c
+++ b/frame/compat/bla_syrk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -114,6 +114,25 @@ void PASTEF77(ch,blasname) \
 	cs_a = *lda; \
 	rs_c = 1; \
 	cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		/* Finalize BLIS. */ \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -133,14 +152,30 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -207,6 +242,25 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 	const inc_t rs_c = 1; \
 	const inc_t cs_c = *ldc; \
+\
+	/* If alpha is zero, scale C by beta and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,scalm,_ex)( BLIS_NO_CONJUGATE, \
+								  0, \
+								  BLIS_NONUNIT_DIAG, \
+								  blis_uploc, \
+								  m0, \
+								  m0, \
+								  (ftype*) beta, \
+								  (ftype*) c, rs_c, cs_c, \
+								  NULL, NULL \
+								); \
+		/* Finalize BLIS. */ \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	const num_t   dt     = PASTEMAC(ch,type); \
 \
@@ -245,11 +299,25 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( uploc, transa, m, k, alpha, a, lda, beta, c, ldc ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( syrk, syrk )
-#endif
 
diff --git a/frame/compat/bla_syrk.h b/frame/compat/bla_syrk.h
index b6ca938a6f..06028457e4 100644
--- a/frame/compat/bla_syrk.h
+++ b/frame/compat/bla_syrk.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,20 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploc, \
+       const f77_char* transa, \
+       const f77_int*  m, \
+       const f77_int*  k, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+       const ftype*    beta, \
+             ftype*    c, const f77_int* ldc  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploc, \
        const f77_char* transa, \
@@ -51,7 +65,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    c, const f77_int* ldc  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( syrk )
-#endif
 
diff --git a/frame/compat/bla_trmm.c b/frame/compat/bla_trmm.c
index c319b3ab51..a687850332 100644
--- a/frame/compat/bla_trmm.c
+++ b/frame/compat/bla_trmm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,7 +35,6 @@
 
 #include "blis.h"
 
-
 //
 // Define BLAS-to-BLIS interfaces.
 //
@@ -45,7 +44,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -111,6 +110,24 @@ void PASTEF77(ch,blasname) \
 	cs_a = *lda; \
 	rs_b = 1; \
 	cs_b = *ldb; \
+\
+	/* If alpha is zero, set B to zero and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,setm,_ex)( BLIS_NO_CONJUGATE, \
+								0, \
+								BLIS_NONUNIT_DIAG, \
+								BLIS_DENSE, \
+								m0, n0, \
+								(ftype*) alpha, \
+								(ftype*) b, rs_b, cs_b, \
+								NULL, NULL \
+							 ); \
+		/* Finalize BLIS. */ \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	/* Call BLIS interface. */ \
 	PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -131,14 +148,31 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -202,6 +236,24 @@ void PASTEF77(ch,blasname) \
 	const inc_t cs_a = *lda; \
 	const inc_t rs_b = 1; \
 	const inc_t cs_b = *ldb; \
+\
+	/* If alpha is zero, set B to zero and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,setm,_ex)( BLIS_NO_CONJUGATE, \
+								0, \
+								BLIS_NONUNIT_DIAG, \
+								BLIS_DENSE, \
+								m0, n0, \
+								(ftype*) alpha, \
+								(ftype*) b, rs_b, cs_b, \
+								NULL, NULL \
+							 ); \
+		/* Finalize BLIS. */ \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
 	const num_t   dt     = PASTEMAC(ch,type); \
 \
@@ -239,11 +291,26 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ) \
+{ \
+	PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( trmm, trmm )
-#endif
 
diff --git a/frame/compat/bla_trmm.h b/frame/compat/bla_trmm.h
index 4f0c20b1b2..be7eb56347 100644
--- a/frame/compat/bla_trmm.h
+++ b/frame/compat/bla_trmm.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,14 +33,27 @@
 
 */
 
-
 //
 // Prototype BLAS-to-BLIS interfaces.
 //
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -52,7 +66,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    b, const f77_int* ldb  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( trmm )
-#endif
 
diff --git a/frame/compat/bla_trmv.c b/frame/compat/bla_trmv.c
index 9c98ad787a..18f8901d84 100644
--- a/frame/compat/bla_trmv.c
+++ b/frame/compat/bla_trmv.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_char* transa, \
@@ -116,9 +116,23 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    x, const f77_int* incx  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+    ( uploa, transa, diaga, m, a, lda, x, incx );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( trmv, trmv )
-#endif
 
diff --git a/frame/compat/bla_trmv.h b/frame/compat/bla_trmv.h
index 4096ffe793..69a8565ccc 100644
--- a/frame/compat/bla_trmv.h
+++ b/frame/compat/bla_trmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,19 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    x, const f77_int* incx  \
+     );\
+\
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_char* transa, \
@@ -49,7 +62,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( trmv )
-#endif
 
diff --git a/frame/compat/bla_trsm.c b/frame/compat/bla_trsm.c
index e99805d8dd..026fc520ae 100644
--- a/frame/compat/bla_trsm.c
+++ b/frame/compat/bla_trsm.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -130,14 +130,31 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \
+} \
+)
 
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -393,10 +410,25 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)  \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( trsm, trsm )
-#endif
diff --git a/frame/compat/bla_trsm.h b/frame/compat/bla_trsm.h
index 5694db52a8..d74340f359 100644
--- a/frame/compat/bla_trsm.h
+++ b/frame/compat/bla_trsm.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,7 +40,21 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ); \
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -52,7 +67,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    b, const f77_int* ldb  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( trsm )
-#endif
 
diff --git a/frame/compat/bla_trsm_amd.c b/frame/compat/bla_trsm_amd.c
index 13330a5d08..a4585235a1 100644
--- a/frame/compat/bla_trsm_amd.c
+++ b/frame/compat/bla_trsm_amd.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -45,7 +45,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -110,6 +110,24 @@ void PASTEF77(ch,blasname) \
     cs_a = *lda; \
     rs_b = 1; \
     cs_b = *ldb; \
+\
+	/* If alpha is zero, set B to zero and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,setm,_ex)( BLIS_NO_CONJUGATE, \
+								0, \
+								BLIS_NONUNIT_DIAG, \
+								BLIS_DENSE, \
+								m0, n0, \
+								(ftype*) alpha, \
+								(ftype*) b, rs_b, cs_b, \
+								NULL, NULL \
+							  ); \
+		 AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
     /* Call BLIS interface. */ \
     PASTEMAC2(ch,blisname,BLIS_TAPI_EX_SUF) \
@@ -130,14 +148,30 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO) \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
-
+} \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \
+} \
+)
 #else
 
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* side, \
        const f77_char* uploa, \
@@ -204,6 +238,24 @@ void PASTEF77(ch,blasname) \
     const inc_t rs_b = 1; \
     const inc_t cs_b = *ldb; \
     const num_t dt = PASTEMAC(ch,type); \
+\
+	/* If alpha is zero, set B to zero and return early */ \
+	if( PASTEMAC(ch,eq0)( *alpha ) ) \
+	{ \
+		PASTEMAC2(ch,setm,_ex)( BLIS_NO_CONJUGATE, \
+								0, \
+								BLIS_NONUNIT_DIAG, \
+								BLIS_DENSE, \
+								m0, n0, \
+								(ftype*) alpha, \
+								(ftype*) b, rs_b, cs_b, \
+								NULL, NULL \
+							  ); \
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
+		/* Finalize BLIS. */ \
+		bli_finalize_auto(); \
+		return; \
+	} \
 \
     /* ----------------------------------------------------------- */ \
     /*    TRSM API: AX = B, where X = B                            */ \
@@ -393,13 +445,30 @@ void PASTEF77(ch,blasname) \
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO)  \
     /* Finalize BLIS. */ \
     bli_finalize_auto(); \
-}
+} \
+/* BLAS-named entry point: forwards every argument unchanged to PASTEF77S(ch,blasname); emitted only when the BLAS layer is enabled. */ \
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* side, \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const f77_int*  n, \
+       const ftype*    alpha, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    b, const f77_int* ldb  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb ); \
+} \
+)
 
 #endif
 
-#ifdef BLIS_ENABLE_BLAS
 
-void strsm_
+void strsm_blis_impl
 (
     const f77_char* side,
     const f77_char* uploa,
@@ -468,6 +534,24 @@ void strsm_
     const inc_t cs_b = *ldb;
     const num_t dt = BLIS_FLOAT;
 
+	/* If alpha is zero, set B to zero and return early */
+	if( PASTEMAC(s,eq0)( *alpha ) )
+	{
+		PASTEMAC2(s,setm,_ex)( BLIS_NO_CONJUGATE,
+								0,
+								BLIS_NONUNIT_DIAG,
+								BLIS_DENSE,
+								m0, n0,
+								(float*) alpha,
+								(float*) b, rs_b, cs_b,
+								NULL, NULL
+							  );
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+		/* Finalize BLIS. */
+		bli_finalize_auto();
+		return;
+	}
+
     if( n0 == 1 )
     {
         if( blis_side == BLIS_LEFT )
@@ -622,28 +706,29 @@ void strsm_
     bli_obj_set_struc( struca, &ao );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
 	    /* bli_strsm_small is performing better existing native
 	     * implementations for [m,n]<=1000 for single thread.
-	     * In case of multithread when [m,n]<=128 sinlge thread implemenation
+	     * In case of multithread when [m+n]<320 single thread implementation
 	     * is doing better than native multithread */
-	    bool nt = bli_thread_get_is_parallel();
-	    if((nt==0 && m0<=1000 && n0<=1000) ||
-			    (nt && (m0+n0)<320) )
+	    bool is_parallel = bli_thread_get_is_parallel();
+	    if((!is_parallel && m0<=1000 && n0<=1000) ||
+               (is_parallel && (m0+n0)<320))
 	    {
 		    err_t status;
 		    status = bli_trsm_small
-                (
-                 blis_side,
-			     &alphao,
-			     &ao,
-			     &bo,
-			     NULL,
-			     NULL
-			    );
+                             (
+                               blis_side,
+			       &alphao,
+			       &ao,
+			       &bo,
+			       NULL,
+			       NULL,
+			       is_parallel
+			     );
 		    if (status == BLIS_SUCCESS)
 		    {
 			    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
@@ -669,8 +754,24 @@ void strsm_
     /* Finalize BLIS. */
     bli_finalize_auto();
 }
-
-void dtrsm_
+#ifdef BLIS_ENABLE_BLAS
+void strsm_
+(
+    const f77_char* side,
+    const f77_char* uploa,
+    const f77_char* transa,
+    const f77_char* diaga,
+    const f77_int*  m,
+    const f77_int*  n,
+    const float*    alpha,
+    const float*    a, const f77_int* lda,
+    float*    b, const f77_int* ldb
+)
+{
+    strsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb );
+}
+#endif
+void dtrsm_blis_impl
 (
     const f77_char* side,
     const f77_char* uploa,
@@ -739,6 +840,24 @@ void dtrsm_
     const inc_t cs_b = *ldb;
     const num_t dt = BLIS_DOUBLE;
 
+	/* If alpha is zero, set B to zero and return early */
+	if( PASTEMAC(d,eq0)( *alpha ) )
+	{
+		PASTEMAC2(d,setm,_ex)( BLIS_NO_CONJUGATE,
+								0,
+								BLIS_NONUNIT_DIAG,
+								BLIS_DENSE,
+								m0, n0,
+								(double*) alpha,
+								(double*) b, rs_b, cs_b,
+								NULL, NULL
+							  );
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+		/* Finalize BLIS. */
+		bli_finalize_auto();
+		return;
+	}
+
     if( n0 == 1 )
     {
         if( blis_side == BLIS_LEFT )
@@ -892,37 +1011,108 @@ void dtrsm_
     bli_obj_set_conjtrans( blis_transa, &ao );
 
     bli_obj_set_struc( struca, &ao );
-    
+
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
-        /* bli_dtrsm_small is performing better existing native
-         * implementations for [m,n]<=1000 for single thread.
-         * In case of multithread when [m,n]<=128 sinlge thread implemenation
-         * is doing better than native multithread */
-        bool nt = bli_thread_get_is_parallel();
-        if ((nt == 0 && m0 <= 1000 && n0 <= 1000) ||
-            (nt && (m0 + n0) < 320))
+        // typedef for trsm small kernel function pointer
+        typedef err_t (*dtrsm_small_ker_ft)
+            (
+              side_t   side,
+              obj_t*   alpha,
+              obj_t*   a,
+              obj_t*   b,
+              cntx_t*  cntx,
+              cntl_t*  cntl,
+              bool     is_parallel
+            );
+        err_t status = BLIS_NOT_YET_IMPLEMENTED;
+        
+        // trsm small kernel function pointer definition
+        dtrsm_small_ker_ft ker_ft = NULL;
+
+        // Query the architecture ID
+        arch_t id = bli_arch_query_id();
+
+        // dimensions of triangular matrix
+        // for left variants, dim_a is m0,
+        // for right variants, dim_a is n0
+        dim_t dim_a = n0;
+        if (blis_side == BLIS_LEFT)
+            dim_a = m0;
+
+        // size of output matrix(B)
+        dim_t size_b = m0*n0;
+
+        /* bli_dtrsm_small is performing better than existing native
+         * implementations for dim_a<1500 and m0*n0<5e6 for single thread.
+         * In case of multithread when [m+n]<320 single thread implementation
+         * is doing better than small multithread and native multithread */
+        bool is_parallel = bli_thread_get_is_parallel();
+        if ((!is_parallel && ((dim_a < 1500) && (size_b < 5e6)) ) ||
+            (is_parallel && (m0+n0)<320))
         {
-            err_t status;
-            status = bli_trsm_small(
-                blis_side,
-                &alphao,
-                &ao,
-                &bo,
-                NULL,
-                NULL);
-            if (status == BLIS_SUCCESS)
+            switch(id)
             {
-                AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
-                /* Finalize BLIS. */
-                bli_finalize_auto();
-                return;
+                case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+                    /* For sizes where m and n < 50, the AVX2 kernels perform better,
+                     except when n is a multiple of 8. */
+                    if (((n0 % 8 == 0) && (n0 < 50)) || ((m0 > 50) && (n0 > 50)))
+                    {
+                        ker_ft = bli_trsm_small_AVX512;
+                    }
+                    else
+                    {
+                        ker_ft = bli_trsm_small;
+                    }
+                    break;
+#endif // BLIS_KERNELS_ZEN4
+                case BLIS_ARCH_ZEN:
+                case BLIS_ARCH_ZEN2:
+                case BLIS_ARCH_ZEN3:
+                default:
+                    ker_ft = bli_trsm_small;
+                    break;
             }
         }
-    } // bli_cpuid_is_avx_supported
+
+#ifdef BLIS_ENABLE_OPENMP
+        if( (ker_ft == NULL) && (is_parallel) &&
+          ((dim_a < 2500) && (size_b < 5e6)) )
+        {
+            switch(id)
+            {
+                case BLIS_ARCH_ZEN4:
+#if defined(BLIS_KERNELS_ZEN4)
+                    ker_ft = bli_trsm_small_mt_AVX512;
+                    break;
+#endif// BLIS_KERNELS_ZEN4
+                case BLIS_ARCH_ZEN:
+                case BLIS_ARCH_ZEN2:
+                case BLIS_ARCH_ZEN3:
+                default:
+                    ker_ft = bli_trsm_small_mt;
+                    break;
+            }
+        }
+
+#endif// BLIS_ENABLE_OPENMP
+        if(ker_ft)
+        {
+            status = ker_ft(blis_side, &alphao, &ao, &bo, NULL, NULL, is_parallel);
+        }
+        if (status == BLIS_SUCCESS)
+        {
+            AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
+            /* Finalize BLIS. */
+            bli_finalize_auto();
+            return;
+        }
+    } // bli_cpuid_is_avx2fma3_supported
 #endif// END of BLIS_ENABLE_SMALL_MATRIX_TRSM
 
     bli_trsmnat
@@ -939,9 +1129,25 @@ void dtrsm_
     /* Finalize BLIS. */
     bli_finalize_auto();
 }
+#ifdef BLIS_ENABLE_BLAS
+void dtrsm_
+(
+    const f77_char* side,
+    const f77_char* uploa,
+    const f77_char* transa,
+    const f77_char* diaga,
+    const f77_int*  m,
+    const f77_int*  n,
+    const double*    alpha,
+    const double*    a, const f77_int* lda,
+    double*    b, const f77_int* ldb
+)
+{
+    dtrsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb );
+}
+#endif
 
-
-void ztrsm_
+void ztrsm_blis_impl
 (
     const f77_char* side,
     const f77_char* uploa,
@@ -1010,6 +1216,23 @@ void ztrsm_
     const inc_t cs_b = *ldb;
     const num_t dt = BLIS_DCOMPLEX;
 
+	/* If alpha is zero, set B to zero and return early */
+	if( PASTEMAC(z,eq0)( *alpha ) )
+	{
+		PASTEMAC2(z,setm,_ex)( BLIS_NO_CONJUGATE,
+								0,
+								BLIS_NONUNIT_DIAG,
+								BLIS_DENSE,
+								m0, n0,
+								(dcomplex*) alpha,
+								(dcomplex*) b, rs_b, cs_b,
+								NULL, NULL
+							  );
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+		/* Finalize BLIS. */
+		bli_finalize_auto();
+		return;
+	}
 
     if( n0 == 1 )
     {
@@ -1226,18 +1449,18 @@ void ztrsm_
     bli_obj_set_struc( struca, &ao );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         /* bli_ztrsm_small is performing better existing native
         * implementations for [m,n]<=1000 for single thread.
-        * In case of multithread when [m,n]<=128 sinlge thread implemenation
+        * In case of multithread when [m+n]<128 single thread implementation
         * is doing better than native multithread */
-        bool nt = bli_thread_get_is_parallel();
+        bool is_parallel = bli_thread_get_is_parallel();
 
-        if(((nt==0) && (m0<=500) && (n0<=500)) ||
-        (nt && ((m0+n0)<128)))
+        if((!is_parallel && m0<=500 && n0<=500) ||
+           (is_parallel && (m0+n0)<128))
         {
             err_t status;
             status = bli_trsm_small
@@ -1247,7 +1470,8 @@ void ztrsm_
                         &ao,
                         &bo,
                         NULL,
-                        NULL
+                        NULL,
+                        is_parallel
                     );
             if (status == BLIS_SUCCESS)
             {
@@ -1257,8 +1481,8 @@ void ztrsm_
                 return;
             }
         }
-    } // bli_cpuid_is_avx_supported
-#endif
+    } // bli_cpuid_is_avx2fma3_supported
+#endif// END of BLIS_ENABLE_SMALL_MATRIX_TRSM
 
     bli_trsmnat
     (
@@ -1274,9 +1498,25 @@ void ztrsm_
     /* Finalize BLIS. */
     bli_finalize_auto();
 }
+#ifdef BLIS_ENABLE_BLAS
+void ztrsm_
+(
+    const f77_char* side,
+    const f77_char* uploa,
+    const f77_char* transa,
+    const f77_char* diaga,
+    const f77_int*  m,
+    const f77_int*  n,
+    const dcomplex*    alpha,
+    const dcomplex*    a, const f77_int* lda,
+    dcomplex*    b, const f77_int* ldb
+)
+{
+    ztrsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb );
+}
+#endif
 
-
-void ctrsm_
+void ctrsm_blis_impl
 (
     const f77_char* side,
     const f77_char* uploa,
@@ -1345,6 +1585,23 @@ void ctrsm_
     const inc_t cs_b = *ldb;
     const num_t dt = BLIS_SCOMPLEX;
 
+	/* If alpha is zero, set B to zero and return early */
+	if( PASTEMAC(c,eq0)( *alpha ) )
+	{
+		PASTEMAC2(c,setm,_ex)( BLIS_NO_CONJUGATE,
+								0,
+								BLIS_NONUNIT_DIAG,
+								BLIS_DENSE,
+								m0, n0,
+								(scomplex*) alpha,
+								(scomplex*) b, rs_b, cs_b,
+								NULL, NULL
+							  );
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1)
+		/* Finalize BLIS. */
+		bli_finalize_auto();
+		return;
+	}
 
     if( n0 == 1 )
     {
@@ -1560,17 +1817,17 @@ void ctrsm_
     bli_obj_set_struc( struca, &ao );
 
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == TRUE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == TRUE)
     {
         /* bli_ztrsm_small is performing better existing native
         * implementations for [m,n]<=1000 for single thread.
-        * In case of multithread when [m,n]<=128 sinlge thread implemenation
+        * In case of multithread when [m+n]<320 single thread implementation
         * is doing better than native multithread */
-        bool nt = bli_thread_get_is_parallel();
-        if((nt==0 && m0<=1000 && n0<=1000) ||
-        (nt && (m0+n0)<320) )
+        bool is_parallel = bli_thread_get_is_parallel();
+        if((!is_parallel && m0<=1000 && n0<=1000) ||
+           (is_parallel && (m0+n0)<320))
         {
             err_t status;
             status = bli_trsm_small
@@ -1580,7 +1837,8 @@ void ctrsm_
                         &ao,
                         &bo,
                         NULL,
-                        NULL
+                        NULL,
+                        is_parallel
                     );
             if (status == BLIS_SUCCESS)
             {
@@ -1590,7 +1848,7 @@ void ctrsm_
                 return;
             }
         }
-    } // bli_cpuid_is_avx_supported
+    } // bli_cpuid_is_avx2fma3_supported
 #endif
 
     bli_trsmnat
@@ -1607,5 +1865,21 @@ void ctrsm_
     /* Finalize BLIS. */
     bli_finalize_auto();
 }
+#ifdef BLIS_ENABLE_BLAS
+void ctrsm_
+(
+    const f77_char* side,
+    const f77_char* uploa,
+    const f77_char* transa,
+    const f77_char* diaga,
+    const f77_int*  m,
+    const f77_int*  n,
+    const scomplex*    alpha,
+    const scomplex*    a, const f77_int* lda,
+    scomplex*    b, const f77_int* ldb
+)
+{
+    ctrsm_blis_impl ( side, uploa, transa, diaga, m, n, alpha, a, lda, b, ldb );
+}
 
 #endif
diff --git a/frame/compat/bla_trsv.c b/frame/compat/bla_trsv.c
index 8baac6a8ba..22b1b034b9 100644
--- a/frame/compat/bla_trsv.c
+++ b/frame/compat/bla_trsv.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin.
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc.All Rights Reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All Rights Reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,7 +42,7 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype, ch, blasname, blisname ) \
 \
-void PASTEF77(ch,blasname) \
+void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_char* transa, \
@@ -116,9 +116,23 @@ void PASTEF77(ch,blasname) \
 	/* Finalize BLIS. */ \
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_1) \
 	bli_finalize_auto(); \
-}
+}\
+\
+IF_BLIS_ENABLE_BLAS(\
+void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    x, const f77_int* incx  \
+     ) \
+{ \
+    PASTEF77S(ch,blasname) \
+    ( uploa, transa, diaga, m, a, lda, x, incx );\
+} \
+)
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTFUNC_BLAS( trsv, trsv )
-#endif
 
diff --git a/frame/compat/bla_trsv.h b/frame/compat/bla_trsv.h
index 6edb435f10..8bfbbbb293 100644
--- a/frame/compat/bla_trsv.h
+++ b/frame/compat/bla_trsv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -39,7 +40,19 @@
 #undef  GENTPROT
 #define GENTPROT( ftype, ch, blasname ) \
 \
+IF_BLIS_ENABLE_BLAS(\
 BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
+     ( \
+       const f77_char* uploa, \
+       const f77_char* transa, \
+       const f77_char* diaga, \
+       const f77_int*  m, \
+       const ftype*    a, const f77_int* lda, \
+             ftype*    x, const f77_int* incx  \
+     );\
+\
+)\
+BLIS_EXPORT_BLAS void PASTEF77S(ch,blasname) \
      ( \
        const f77_char* uploa, \
        const f77_char* transa, \
@@ -49,7 +62,5 @@ BLIS_EXPORT_BLAS void PASTEF77(ch,blasname) \
              ftype*    x, const f77_int* incx  \
      );
 
-#ifdef BLIS_ENABLE_BLAS
 INSERT_GENTPROT_BLAS( trsv )
-#endif
 
diff --git a/frame/compat/bli_blas.h b/frame/compat/bli_blas.h
index 103d64ab27..750692061a 100644
--- a/frame/compat/bli_blas.h
+++ b/frame/compat/bli_blas.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,14 +33,6 @@
 
 */
 
-// If the CBLAS compatibility layer was enabled while the BLAS layer
-// was not enabled, we must enable it here.
-#ifdef BLIS_ENABLE_CBLAS
-#ifndef BLIS_ENABLE_BLAS
-#define BLIS_ENABLE_BLAS
-#endif
-#endif // BLIS_ENABLE_CBLAS
-
 // By default, if the BLAS compatibility layer is enabled, we define
 // (include) all of the BLAS prototypes. However, if the user is
 // #including "blis.h" and also #including another header that also
@@ -52,6 +44,11 @@
 #undef  BLIS_ENABLE_BLAS_DEFS
 #endif
 
+// Hack to always enable this, as disabling it is broken in UTA BLIS
+// as well as here.
+#define BLIS_ENABLE_BLAS_DEFS
+
+
 // Skip prototyping all of the BLAS if the BLAS test drivers are being
 // compiled.
 #ifdef BLIS_VIA_BLASTEST
@@ -64,6 +61,9 @@
 #undef BLIS_ENABLE_BLAS_DEFS
 #endif
 
+
+
+
 // Begin including all BLAS prototypes.
 #ifdef BLIS_ENABLE_BLAS_DEFS
 
@@ -217,4 +217,4 @@
 #include "f77_amin_sub.h"
 
 
-#endif // BLIS_ENABLE_BLAS
+#endif // BLIS_ENABLE_BLAS_DEFS
diff --git a/frame/compat/blis/thread/b77_thread.c b/frame/compat/blis/thread/b77_thread.c
index c864339a36..fa28b959ba 100644
--- a/frame/compat/blis/thread/b77_thread.c
+++ b/frame/compat/blis/thread/b77_thread.c
@@ -55,7 +55,7 @@ void PASTEF770(bli_thread_set_ways)
 	dim_t ir0 = *ir;
 
 	// Initialize BLIS.
-	bli_init_auto();
+	//bli_init_auto();
 
 	// Convert/typecast negative values to zero.
 	//bli_convert_blas_dim1( *jc, jc0 );
@@ -68,7 +68,7 @@ void PASTEF770(bli_thread_set_ways)
 	bli_thread_set_ways( jc0, pc0, ic0, jr0, ir0 );
 
 	// Finalize BLIS.
-	bli_finalize_auto();
+	//bli_finalize_auto();
 }
 
 void PASTEF770(bli_thread_set_num_threads)
@@ -79,7 +79,7 @@ void PASTEF770(bli_thread_set_num_threads)
 	dim_t nt0 = *nt;
 
 	// Initialize BLIS.
-	bli_init_auto();
+	//bli_init_auto();
 
 	// Convert/typecast negative values to zero.
 	//bli_convert_blas_dim1( *nt, nt0 );
@@ -88,6 +88,6 @@ void PASTEF770(bli_thread_set_num_threads)
 	bli_thread_set_num_threads( nt0 );
 
 	// Finalize BLIS.
-	bli_finalize_auto();
+	//bli_finalize_auto();
 }
 
diff --git a/frame/compat/cblas/f77_sub/f77_amax_sub.c b/frame/compat/cblas/f77_sub/f77_amax_sub.c
index cc26196d79..c394ed4d40 100644
--- a/frame/compat/cblas/f77_sub/f77_amax_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_amax_sub.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -42,18 +43,28 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-void PASTEF773(i,chx,blasname,sub) \
+void PASTEF773S(i,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              f77_int* rval  \
      ) \
 { \
-	*rval = PASTEF772(i,chx,blasname) \
+	*rval = PASTEF772S(i,chx,blasname) \
 	( \
 	  n, \
 	  x, incx \
 	); \
+}\
+\
+void PASTEF773(i,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             f77_int* rval  \
+     ) \
+{ \
+  PASTEF773S(i,chx,blasname,sub) ( n, x, incx, rval );\
 }
 
 #ifdef BLIS_ENABLE_CBLAS
diff --git a/frame/compat/cblas/f77_sub/f77_amax_sub.h b/frame/compat/cblas/f77_sub/f77_amax_sub.h
index 9cd1202d26..35d501ba4a 100644
--- a/frame/compat/cblas/f77_sub/f77_amax_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_amax_sub.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,6 +41,13 @@
 #define GENTPROT( ftype_x, chx, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             f77_int* rval  \
+     );\
+\
+BLIS_EXPORT_BLAS void PASTEF773S(i,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/cblas/f77_sub/f77_amin_sub.c b/frame/compat/cblas/f77_sub/f77_amin_sub.c
index 73e1951839..2eaa231061 100644
--- a/frame/compat/cblas/f77_sub/f77_amin_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_amin_sub.c
@@ -4,8 +4,8 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
-
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -42,18 +42,28 @@
 #undef  GENTFUNC
 #define GENTFUNC( ftype_x, chx, blasname, blisname ) \
 \
-void PASTEF773(i,chx,blasname,sub) \
+void PASTEF773S(i,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              f77_int* rval  \
      ) \
 { \
-  *rval = PASTEF772(i,chx,blasname) \
+  *rval = PASTEF772S(i,chx,blasname) \
   ( \
     n, \
     x, incx \
   ); \
+}\
+\
+void PASTEF773(i,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             f77_int* rval  \
+     ) \
+{ \
+  PASTEF773S(i,chx,blasname,sub) ( n, x, incx, rval );\
 }
 
 #ifdef BLIS_ENABLE_CBLAS
diff --git a/frame/compat/cblas/f77_sub/f77_amin_sub.h b/frame/compat/cblas/f77_sub/f77_amin_sub.h
index 522dcc7938..90b4f25b5f 100644
--- a/frame/compat/cblas/f77_sub/f77_amin_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_amin_sub.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -39,6 +39,13 @@
 #define GENTPROT( ftype_x, chx, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF773(i,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             f77_int* rval  \
+     );\
+\
+BLIS_EXPORT_BLAS void PASTEF773S(i,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/cblas/f77_sub/f77_asum_sub.c b/frame/compat/cblas/f77_sub/f77_asum_sub.c
index f1cb35b0cc..befac150e0 100644
--- a/frame/compat/cblas/f77_sub/f77_asum_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_asum_sub.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -42,18 +43,28 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-void PASTEF773(chr,chx,blasname,sub) \
+void PASTEF773S(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              ftype_r* rval  \
      ) \
 { \
-	*rval = PASTEF772(chr,chx,blasname) \
+	*rval = PASTEF772S(chr,chx,blasname) \
 	( \
 	  n, \
 	  x, incx \
 	); \
+}\
+\
+void PASTEF773(chr,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             ftype_r* rval  \
+     ) \
+{ \
+	PASTEF773S(chr,chx,blasname,sub) ( n, x, incx, rval ); \
 }
 
 #ifdef BLIS_ENABLE_CBLAS
diff --git a/frame/compat/cblas/f77_sub/f77_asum_sub.h b/frame/compat/cblas/f77_sub/f77_asum_sub.h
index 4b8634c166..de3d99bfc9 100644
--- a/frame/compat/cblas/f77_sub/f77_asum_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_asum_sub.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,6 +41,13 @@
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             ftype_r* rval  \
+     );\
+\
+BLIS_EXPORT_BLAS void PASTEF773S(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/cblas/f77_sub/f77_dot_sub.c b/frame/compat/cblas/f77_sub/f77_dot_sub.c
index 0ca80464d3..f497ab97f0 100644
--- a/frame/compat/cblas/f77_sub/f77_dot_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_dot_sub.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -43,7 +44,7 @@
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF773(ch,blasname,chc,sub) \
+void PASTEF773S(ch,blasname,chc,sub) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -51,12 +52,23 @@ void PASTEF773(ch,blasname,chc,sub) \
              ftype*   rval  \
      ) \
 { \
-	*rval = PASTEF772(ch,blasname,chc) \
+	*rval = PASTEF772S(ch,blasname,chc) \
 	( \
 	  n, \
 	  x, incx, \
 	  y, incy \
 	); \
+}\
+\
+void PASTEF773(ch,blasname,chc,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy, \
+             ftype*   rval  \
+     ) \
+{ \
+  PASTEF773S(ch,blasname,chc,sub)( n, x, incx, y, incy, rval); \
 }
 
 INSERT_GENTFUNCDOTR_BLAS( dot, NULL )
@@ -75,7 +87,7 @@ INSERT_GENTFUNCDOTC_BLAS( dot, NULL )
 #undef  GENTFUNCDOT
 #define GENTFUNCDOT( ftype, ch, chc, blis_conjx, blasname, blisname ) \
 \
-void PASTEF773(ch,blasname,chc,sub) \
+void PASTEF773S(ch,blasname,chc,sub) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -90,6 +102,17 @@ void PASTEF773(ch,blasname,chc,sub) \
 	  x, incx, \
 	  y, incy \
 	); \
+}\
+\
+void PASTEF773(ch,blasname,chc,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy, \
+             ftype*   rval  \
+     ) \
+{ \
+  PASTEF773S(ch,blasname,chc,sub)( n, x, incx, y, incy, rval); \
 }
 
 INSERT_GENTFUNCDOTC_BLAS( dot, NULL )
@@ -100,7 +123,7 @@ INSERT_GENTFUNCDOTC_BLAS( dot, NULL )
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in single precision.
-void PASTEF772(sds,dot,sub)
+void PASTEF772S(sds,dot,sub)
      (
        const f77_int* n,
        const float*  sb,
@@ -109,7 +132,7 @@ void PASTEF772(sds,dot,sub)
              float*   rval
      )
 {
-	*rval = PASTEF77(sds,dot)
+	*rval = PASTEF77S(sds,dot)
 	(
 	  n,
 	  sb,
@@ -117,10 +140,21 @@ void PASTEF772(sds,dot,sub)
 	  y, incy
 	);
 }
+void PASTEF772(sds,dot,sub)
+     (
+       const f77_int* n,
+       const float*  sb,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy,
+             float*   rval
+     )
+{
+  PASTEF772S(sds,dot,sub)( n, sb, x, incx, y, incy, rval);
+}
 
 // Input vectors stored in single precision, computed in double precision,
 // with result returned in double precision.
-void PASTEF772(ds,dot,sub)
+void PASTEF772S(ds,dot,sub)
      (
        const f77_int* n,
        const float*   x, const f77_int* incx,
@@ -128,13 +162,23 @@ void PASTEF772(ds,dot,sub)
              double*  rval
      )
 {
-	*rval = PASTEF77(ds,dot)
+	*rval = PASTEF77S(ds,dot)
 	(
 	  n,
 	  x, incx,
 	  y, incy
 	);
 }
+void PASTEF772(ds,dot,sub)
+     (
+       const f77_int* n,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy,
+             double*  rval
+     )
+{
+  PASTEF772S(ds,dot,sub)( n, x, incx, y, incy, rval);
+}
 
 #endif
 
diff --git a/frame/compat/cblas/f77_sub/f77_dot_sub.h b/frame/compat/cblas/f77_sub/f77_dot_sub.h
index 8aab2728bf..54a40a9a02 100644
--- a/frame/compat/cblas/f77_sub/f77_dot_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_dot_sub.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,6 +41,14 @@
 #define GENTPROTDOT( ftype, ch, chc, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF773(ch,blasname,chc,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype*   x, const f77_int* incx, \
+       const ftype*   y, const f77_int* incy, \
+             ftype*   rval  \
+     );\
+\
+BLIS_EXPORT_BLAS void PASTEF773S(ch,blasname,chc,sub) \
      ( \
        const f77_int* n, \
        const ftype*   x, const f77_int* incx, \
@@ -61,7 +70,15 @@ BLIS_EXPORT_BLAS void PASTEF772(sds,dot,sub)
        const float*   y, const f77_int* incy,
              float*   rval
      );
-
+BLIS_EXPORT_BLAS void PASTEF772S(sds,dot,sub)
+     (
+       const f77_int* n,
+       const float*  sb,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy,
+             float*   rval
+     );
+     
 BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub)
      (
        const f77_int* n,
@@ -69,4 +86,12 @@ BLIS_EXPORT_BLAS void PASTEF772(ds,dot,sub)
        const float*   y, const f77_int* incy,
              double*  rval
      );
+BLIS_EXPORT_BLAS void PASTEF772S(ds,dot,sub)
+     (
+       const f77_int* n,
+       const float*   x, const f77_int* incx,
+       const float*   y, const f77_int* incy,
+             double*  rval
+     );
+
 #endif
diff --git a/frame/compat/cblas/f77_sub/f77_nrm2_sub.c b/frame/compat/cblas/f77_sub/f77_nrm2_sub.c
index 54ce1a5b49..72fa07593a 100644
--- a/frame/compat/cblas/f77_sub/f77_nrm2_sub.c
+++ b/frame/compat/cblas/f77_sub/f77_nrm2_sub.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -42,18 +43,28 @@
 #undef  GENTFUNCR2
 #define GENTFUNCR2( ftype_x, ftype_r, chx, chr, blasname, blisname ) \
 \
-void PASTEF773(chr,chx,blasname,sub) \
+void PASTEF773S(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
              ftype_r* rval  \
      ) \
 { \
-	*rval = PASTEF772(chr,chx,blasname) \
+	*rval = PASTEF772S(chr,chx,blasname) \
 	( \
 	  n, \
 	  x, incx \
 	); \
+}\
+\
+void PASTEF773(chr,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             ftype_r* rval  \
+     ) \
+{ \
+	 PASTEF773S(chr,chx,blasname,sub)( n, x, incx, rval );\
 }
 
 #ifdef BLIS_ENABLE_CBLAS
diff --git a/frame/compat/cblas/f77_sub/f77_nrm2_sub.h b/frame/compat/cblas/f77_sub/f77_nrm2_sub.h
index c51a94292b..dbe2809741 100644
--- a/frame/compat/cblas/f77_sub/f77_nrm2_sub.h
+++ b/frame/compat/cblas/f77_sub/f77_nrm2_sub.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,6 +41,13 @@
 #define GENTPROTR2( ftype_x, ftype_r, chx, chr, blasname ) \
 \
 BLIS_EXPORT_BLAS void PASTEF773(chr,chx,blasname,sub) \
+     ( \
+       const f77_int* n, \
+       const ftype_x* x, const f77_int* incx, \
+             ftype_r* rval  \
+     );\
+\
+BLIS_EXPORT_BLAS void PASTEF773S(chr,chx,blasname,sub) \
      ( \
        const f77_int* n, \
        const ftype_x* x, const f77_int* incx, \
diff --git a/frame/compat/cblas/src/cblas.h b/frame/compat/cblas/src/cblas.h
index 8bef3d7f8d..dcccb07baa 100644
--- a/frame/compat/cblas/src/cblas.h
+++ b/frame/compat/cblas/src/cblas.h
@@ -1,6 +1,6 @@
 /*
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -488,12 +488,57 @@ void BLIS_EXPORT_BLAS cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  float alpha, const float *A, f77_int lda,
                  float *B, f77_int ldb);
+/** \addtogroup APIS BLIS Extension API
+ *  @{
+ */
+
+/** \addtogroup INTERFACE CBLAS INTERFACE
+ * \ingroup APIS
+ *  @{
+ */
+
+
+/**
+* sgemmt computes scalar-matrix-matrix product with general matrices. It adds the result to the upper or lower part of scalar-matrix product.
+* It accesses and updates a triangular part of the square result matrix.
+* The operation is defined as
+* C := alpha*Mat(A) * Mat(B) + beta*C,
+* where:
+* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
+* alpha and beta are scalars,
+* A, B and C are matrices:
+* Mat(A) is an nxk matrix,
+* Mat(B) is a kxn matrix,
+* C is an nxn upper or lower triangular matrix.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+* @param[in] Uplo Specifies whether the upper or lower triangular part of the array c is used. CblasUpper or CblasLower
+* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
+* if transa = CblasNoTrans, then Mat(A) = A;
+* if transa = CblasTrans, then Mat(A) =\f$A^T\f$;
+* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$.
+* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
+* if transb = CblasNoTrans, then Mat(B) = B;
+* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
+* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$.
+* @param[in] N Specifies the order of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] alpha Specifies the scalar alpha.
+* @param[in] A  The array is float matrix A.
+* @param[in] lda Specifies the leading dimension of a
+* @param[in] B The array is float matrix B.
+* @param[in] ldb Specifies the leading dimension of b
+* @param[in] beta Specifies the scalar beta.
+* @param[in,out] C The array is float matrix C.
+* @param[in] ldc Specifies the leading dimension of c
+* @return None
+*/
 void BLIS_EXPORT_BLAS cblas_sgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
          enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
          f77_int N, f77_int K, float alpha, const float *A,
                  f77_int lda, const float *B, f77_int ldb,
                  float beta, float *C, f77_int ldc);
-
+/** @}*/
 void BLIS_EXPORT_BLAS cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
                  f77_int K, double alpha, const double *A,
@@ -523,12 +568,51 @@ void BLIS_EXPORT_BLAS cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  double alpha, const double *A, f77_int lda,
                  double *B, f77_int ldb);
+/** \addtogroup INTERFACE CBLAS INTERFACE
+ *  @{
+ */
+
+/**
+* dgemmt computes scalar-matrix-matrix product with general matrices. It adds the result to the upper or lower part of scalar-matrix product.
+* It accesses and updates a triangular part of the square result matrix.
+* The operation is defined as
+* C := alpha*Mat(A) * Mat(B) + beta*C,
+* where:
+* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
+* alpha and beta are scalars,
+* A, B and C are matrices:
+* Mat(A) is an nxk matrix,
+* Mat(B) is a kxn matrix,
+* C is an nxn upper or lower triangular matrix.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+* @param[in] Uplo Specifies whether the upper or lower triangular part of the array c is used. CblasUpper or CblasLower
+* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
+* if transa = CblasNoTrans, then Mat(A) = A;
+* if transa = CblasTrans, then Mat(A) =\f$A^T\f$;
+* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$.
+* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
+* if transb = CblasNoTrans, then Mat(B) = B;
+* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
+* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$.
+* @param[in] N Specifies the order of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] alpha Specifies the scalar alpha.
+* @param[in] A  The array is double matrix A.
+* @param[in] lda Specifies the leading dimension of a
+* @param[in] B The array is double matrix B.
+* @param[in] ldb Specifies the leading dimension of b
+* @param[in] beta Specifies the scalar beta.
+* @param[in,out] C The array is double matrix C.
+* @param[in] ldc Specifies the leading dimension of c
+* @return None
+*/
 void BLIS_EXPORT_BLAS cblas_dgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
          enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
          f77_int N, f77_int K, double alpha, const double *A,
                  f77_int lda, const double *B, f77_int ldb,
                  double beta, double *C, f77_int ldc);
-
+/** @}*/
 void BLIS_EXPORT_BLAS cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
                  f77_int K, const void *alpha, const void *A,
@@ -558,12 +642,51 @@ void BLIS_EXPORT_BLAS cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  const void *alpha, const void *A, f77_int lda,
                  void *B, f77_int ldb);
+/** \addtogroup INTERFACE CBLAS INTERFACE
+ *  @{
+ */
+
+/**
+* cgemmt computes scalar-matrix-matrix product with general matrices. It adds the result to the upper or lower part of scalar-matrix product.
+* It accesses and updates a triangular part of the square result matrix.
+* The operation is defined as
+* C := alpha*Mat(A) * Mat(B) + beta*C,
+* where:
+* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
+* alpha and beta are scalars,
+* A, B and C are matrices:
+* Mat(A) is an nxk matrix,
+* Mat(B) is a kxn matrix,
+* C is an nxn upper or lower triangular matrix.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+* @param[in] Uplo Specifies whether the upper or lower triangular part of the array c is used. CblasUpper or CblasLower
+* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
+* if transa = CblasNoTrans, then Mat(A) = A;
+* if transa = CblasTrans, then Mat(A) =\f$A^T\f$;
+* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$.
+* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
+* if transb = CblasNoTrans, then Mat(B) = B;
+* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
+* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$.
+* @param[in] N Specifies the order of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] alpha Specifies the scalar alpha.
+* @param[in] A  The array is scomplex matrix A.
+* @param[in] lda Specifies the leading dimension of a
+* @param[in] B The array is scomplex matrix B.
+* @param[in] ldb Specifies the leading dimension of b
+* @param[in] beta Specifies the scalar beta.
+* @param[in,out] C The array is scomplex matrix C.
+* @param[in] ldc Specifies the leading dimension of c
+* @return None
+*/
 void BLIS_EXPORT_BLAS cblas_cgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
          enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
          f77_int N, f77_int K, const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc);
-
+/** @}*/
 void BLIS_EXPORT_BLAS cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
                  f77_int K, const void *alpha, const void *A,
@@ -593,12 +716,51 @@ void BLIS_EXPORT_BLAS cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side,
                  enum CBLAS_DIAG Diag, f77_int M, f77_int N,
                  const void *alpha, const void *A, f77_int lda,
                  void *B, f77_int ldb);
+/** \addtogroup INTERFACE CBLAS INTERFACE
+ *  @{
+ */
+
+/**
+* zgemmt computes scalar-matrix-matrix product with general matrices. It adds the result to the upper or lower part of scalar-matrix product.
+* It accesses and updates a triangular part of the square result matrix.
+* The operation is defined as
+* C := alpha*Mat(A) * Mat(B) + beta*C,
+* where:
+* Mat(X) is one of Mat(X) = X, or Mat(X) = \f$X^T\f$, or Mat(X) = \f$X^H\f$,
+* alpha and beta are scalars,
+* A, B and C are matrices:
+* Mat(A) is an nxk matrix,
+* Mat(B) is a kxn matrix,
+* C is an nxn upper or lower triangular matrix.
+*
+* @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+* @param[in] Uplo Specifies whether the upper or lower triangular part of the array c is used. CblasUpper or CblasLower
+* @param[in] TransA Specifies the form of Mat(A) used in the matrix multiplication:
+* if transa = CblasNoTrans, then Mat(A) = A;
+* if transa = CblasTrans, then Mat(A) =\f$A^T\f$;
+* if transa = CblasConjTrans, then Mat(A) = \f$A^H\f$.
+* @param[in] TransB Specifies the form of Mat(B) used in the matrix multiplication:
+* if transb = CblasNoTrans, then Mat(B) = B;
+* if transb = CblasTrans, then Mat(B) = \f$B^T\f$;
+* if transb = CblasConjTrans, then Mat(B) = \f$B^H\f$.
+* @param[in] N Specifies the order of the matrix C.
+* @param[in] K Specifies the number of columns of the matrix Mat(A) and the number of rows of the matrix Mat(B).
+* @param[in] alpha Specifies the scalar alpha.
+* @param[in] A  The array is dcomplex matrix A.
+* @param[in] lda Specifies the leading dimension of a
+* @param[in] B The array is dcomplex matrix B.
+* @param[in] ldb Specifies the leading dimension of b
+* @param[in] beta Specifies the scalar beta.
+* @param[in,out] C The array is dcomplex matrix C.
+* @param[in] ldc Specifies the leading dimension of c
+* @return None
+*/
 void BLIS_EXPORT_BLAS cblas_zgemmt(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo,
          enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB,
          f77_int N, f77_int K, const void *alpha, const void *A,
                  f77_int lda, const void *B, f77_int ldb,
                  const void *beta, void *C, f77_int ldc);
-
+/** @}*/
 
 /*
  * Routines with prefixes C and Z only
@@ -652,6 +814,40 @@ BLIS_EXPORT_BLAS double  cblas_dcabs1( const void *z);
  */
 
 // -- Batch APIs -------
+/** \addtogroup INTERFACE CBLAS INTERFACE
+ *  @{
+ */
+
+/**
+ * cblas_sgemm_batch interface resembles the GEMM interface.
+ * Arguments are arrays of pointers to matrices and parameters.
+ * It batches multiple independent small GEMM operations of fixed or variable sizes into a group
+ * and then spawns multiple threads for different GEMM instances within the group.
+ *
+ * @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+ * @param[in] TransA_array Array of pointers, dimension (group_count), specifies the form of Mat( A ) to be used in the matrix multiplication as follows:
+ *                     Mat( A ) = A
+ *                     Mat( A ) = \f$A^T\f$
+ *                     Mat( A ) = \f$A^H\f$
+ * @param[in] TransB_array Array of pointers, dimension (group_count), specifies the form of Mat( B ) to be used in the matrix multiplication as follows:
+ *                     Mat( B ) = B
+ *                     Mat( B ) = \f$B^T\f$
+ *                     Mat( B ) = \f$B^H\f$
+ * @param[in] M_array Array of pointers, dimension (group_count), each is a number of rows of matrices A and of matrices C.
+ * @param[in] N_array Array of pointers, dimension (group_count), each is a number of columns of matrices B and of matrices C.
+ * @param[in] K_array Array of pointers, dimension (group_count), each is a number of columns of matrices A and number of rows of matrices B.
+ * @param[in] alpha_array Array of pointers, dimension (group_count) each is a scalar alpha for each GEMM.
+ * @param[in] A Array of pointers, dimension (group_count), Each is a matrix A of float datatype.
+ * @param[in] lda_array Array of pointers, dimension (group_count), each f77_int lda_array specifies the first dimension of matrix A.
+ * @param[in] B Array of pointers, dimension (group_count), Each is a matrix B of float datatype.
+ * @param[in] ldb_array Array of pointers, dimension (group_count), each f77_int ldb_array specifies the first dimension of matrix B.
+ * @param[in] beta_array Array of pointers, dimension (group_count) each is a scalar beta for each GEMM.
+ * @param[in,out] C Array of pointers, dimension (group_count), Each is a matrix C of float datatype.
+ * @param[in] ldc_array Array of pointers, dimension (group_count), each f77_int ldc_array specifies the first dimension of matrix C.
+ * @param[in] group_count Specifies the total number of groups. It is typically used to batch GEMMs of varying sizes, where each group batches GEMMs of one fixed size.
+ * @param[in] group_size Array with one entry per group; each entry is the number of GEMMs to be performed in that group (batch).
+ * @return None
+ */
 void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order,
                  enum CBLAS_TRANSPOSE *TransA_array,
                  enum CBLAS_TRANSPOSE *TransB_array,
@@ -660,6 +856,37 @@ void BLIS_EXPORT_BLAS cblas_sgemm_batch(enum CBLAS_ORDER Order,
                  f77_int *lda_array, const float **B, f77_int *ldb_array,
                  const float *beta_array, float **C, f77_int *ldc_array,
                  f77_int group_count, f77_int *group_size);
+
+/**
+ * cblas_dgemm_batch interface resembles the GEMM interface.
+ * Arguments are arrays of pointers to matrices and parameters.
+ * It batches multiple independent small GEMM operations of fixed or variable sizes into a group
+ * and then spawns multiple threads for different GEMM instances within the group.
+ *
+ * @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+ * @param[in] TransA_array Array of pointers, dimension (group_count), specifies the form of Mat( A ) to be used in the matrix multiplication as follows:
+ *                     Mat( A ) = A
+ *                     Mat( A ) = \f$A^T\f$
+ *                     Mat( A ) = \f$A^H\f$
+ * @param[in] TransB_array Array of pointers, dimension (group_count), specifies the form of Mat( B ) to be used in the matrix multiplication as follows:
+ *                     Mat( B ) = B
+ *                     Mat( B ) = \f$B^T\f$
+ *                     Mat( B ) = \f$B^H\f$
+ * @param[in] M_array Array of pointers, dimension (group_count), each is a number of rows of matrices A and of matrices C.
+ * @param[in] N_array Array of pointers, dimension (group_count), each is a number of columns of matrices B and of matrices C.
+ * @param[in] K_array Array of pointers, dimension (group_count), each is a number of columns of matrices A and number of rows of matrices B.
+ * @param[in] alpha_array Array of pointers, dimension (group_count) each is a scalar alpha for each GEMM.
+ * @param[in] A Array of pointers, dimension (group_count), Each is a matrix A of double datatype.
+ * @param[in] lda_array Array of pointers, dimension (group_count), each f77_int lda_array specifies the first dimension of matrix A.
+ * @param[in] B Array of pointers, dimension (group_count), Each is a matrix B of double datatype.
+ * @param[in] ldb_array Array of pointers, dimension (group_count), each f77_int ldb_array specifies the first dimension of matrix B.
+ * @param[in] beta_array Array of pointers, dimension (group_count) each is a scalar beta for each GEMM.
+ * @param[in,out] C Array of pointers, dimension (group_count), Each is a matrix C of double datatype.
+ * @param[in] ldc_array Array of pointers, dimension (group_count), each f77_int ldc_array specifies the first dimension of matrix C.
+ * @param[in] group_count Specifies the total number of groups. It is typically used to batch GEMMs of varying sizes, where each group batches GEMMs of one fixed size.
+ * @param[in] group_size Array with one entry per group; each entry is the number of GEMMs to be performed in that group (batch).
+ * @return None
+ */
 void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order,
                  enum CBLAS_TRANSPOSE *TransA_array,
                  enum CBLAS_TRANSPOSE *TransB_array,
@@ -669,6 +896,38 @@ void BLIS_EXPORT_BLAS cblas_dgemm_batch(enum CBLAS_ORDER Order,
                  const double **B, f77_int *ldb_array,
                  const double *beta_array, double **C, f77_int *ldc_array,
                  f77_int group_count, f77_int *group_size);
+
+/**
+ * cblas_cgemm_batch interface resembles the GEMM interface.
+ * Arguments are arrays of pointers to matrices and parameters.
+ * It batches multiple independent small GEMM operations of fixed or variable sizes into a group
+ * and then spawns multiple threads for different GEMM instances within the group.
+ *
+ * @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+ * @param[in] TransA_array Array of pointers, dimension (group_count), specifies the form of Mat( A ) to be used in the matrix multiplication as follows:
+ *                     Mat( A ) = A
+ *                     Mat( A ) = \f$A^T\f$
+ *                     Mat( A ) = \f$A^H\f$
+ * @param[in] TransB_array Array of pointers, dimension (group_count), specifies the form of Mat( B ) to be used in the matrix multiplication as follows:
+ *                     Mat( B ) = B
+ *                     Mat( B ) = \f$B^T\f$
+ *                     Mat( B ) = \f$B^H\f$
+ * @param[in] M_array Array of pointers, dimension (group_count), each is a number of rows of matrices A and of matrices C.
+ * @param[in] N_array Array of pointers, dimension (group_count), each is a number of columns of matrices B and of matrices C.
+ * @param[in] K_array Array of pointers, dimension (group_count), each is a number of columns of matrices A and number of rows of matrices B.
+ * @param[in] alpha_array Array of pointers, dimension (group_count) each is a scalar alpha for each GEMM.
+ * @param[in] A Array of pointers, dimension (group_count), Each is a matrix A of scomplex datatype.
+ * @param[in] lda_array Array of pointers, dimension (group_count), each f77_int lda_array specifies the first dimension of matrix A.
+ * @param[in] B Array of pointers, dimension (group_count), Each is a matrix B of scomplex datatype.
+ * @param[in] ldb_array Array of pointers, dimension (group_count), each f77_int ldb_array specifies the first dimension of matrix B.
+ * @param[in] beta_array Array of pointers, dimension (group_count) each is a scalar beta for each GEMM.
+ * @param[in,out] C Array of pointers, dimension (group_count), Each is a matrix C of scomplex datatype.
+ * @param[in] ldc_array Array of pointers, dimension (group_count), each f77_int ldc_array specifies the first dimension of matrix C.
+ * @param[in] group_count Specifies the total number of groups. It is typically used to batch GEMMs of varying sizes, where each group batches GEMMs of one fixed size.
+ * @param[in] group_size Array with one entry per group; each entry is the number of GEMMs to be performed in that group (batch).
+ * @return None
+ */
+
 void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order,
                  enum CBLAS_TRANSPOSE *TransA_array,
                  enum CBLAS_TRANSPOSE *TransB_array,
@@ -677,6 +936,37 @@ void BLIS_EXPORT_BLAS cblas_cgemm_batch(enum CBLAS_ORDER Order,
                  f77_int *lda_array, const void **B, f77_int *ldb_array,
                  const void *beta_array, void **C, f77_int *ldc_array,
                  f77_int group_count, f77_int *group_size);
+
+ /**
+ * cblas_zgemm_batch interface resembles the GEMM interface.
+ * Arguments are arrays of pointers to matrices and parameters.
+ * It batches multiple independent small GEMM operations of fixed or variable sizes into a group
+ * and then spawns multiple threads for different GEMM instances within the group.
+ *
+ * @param[in] Order Storage scheme of matrices. CblasRowMajor or CblasColMajor
+ * @param[in] TransA_array Array of pointers, dimension (group_count), specifies the form of Mat( A ) to be used in the matrix multiplication as follows:
+ *                     Mat( A ) = A
+ *                     Mat( A ) = \f$A^T\f$
+ *                     Mat( A ) = \f$A^H\f$
+ * @param[in] TransB_array Array of pointers, dimension (group_count), specifies the form of Mat( B ) to be used in the matrix multiplication as follows:
+ *                     Mat( B ) = B
+ *                     Mat( B ) = \f$B^T\f$
+ *                     Mat( B ) = \f$B^H\f$
+ * @param[in] M_array Array of pointers, dimension (group_count), each is a number of rows of matrices A and of matrices C.
+ * @param[in] N_array Array of pointers, dimension (group_count), each is a number of columns of matrices B and of matrices C.
+ * @param[in] K_array Array of pointers, dimension (group_count), each is a number of columns of matrices A and number of rows of matrices B.
+ * @param[in] alpha_array Array of pointers, dimension (group_count) each is a scalar alpha for each GEMM.
+ * @param[in] A Array of pointers, dimension (group_count), Each is a matrix A of dcomplex datatype.
+ * @param[in] lda_array Array of pointers, dimension (group_count), each f77_int lda_array specifies the first dimension of matrix A.
+ * @param[in] B Array of pointers, dimension (group_count), Each is a matrix B of dcomplex datatype.
+ * @param[in] ldb_array Array of pointers, dimension (group_count), each f77_int ldb_array specifies the first dimension of matrix B.
+ * @param[in] beta_array Array of pointers, dimension (group_count) each is a scalar beta for each GEMM.
+ * @param[in,out] C Array of pointers, dimension (group_count), Each is a matrix C of dcomplex datatype.
+ * @param[in] ldc_array Array of pointers, dimension (group_count), each f77_int ldc_array specifies the first dimension of matrix C.
+ * @param[in] group_count Specifies the total number of groups. It is typically used to batch GEMMs of varying sizes, where each group batches GEMMs of one fixed size.
+ * @param[in] group_size Array with one entry per group; each entry is the number of GEMMs to be performed in that group (batch).
+ * @return None
+ */
 void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order,
                  enum CBLAS_TRANSPOSE *TransA_array,
                  enum CBLAS_TRANSPOSE *TransB_array,
@@ -685,6 +975,7 @@ void BLIS_EXPORT_BLAS cblas_zgemm_batch(enum CBLAS_ORDER Order,
                  f77_int *lda_array, const void **B, f77_int *ldb_array,
                  const void *beta_array, void **C, f77_int *ldc_array,
                  f77_int group_count, f77_int *group_size);
+/** @}*/
 void BLIS_EXPORT_BLAS cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA,
                  enum CBLAS_TRANSPOSE TransB, f77_int M, f77_int N,
                  f77_int K, const void *alpha, const void *A,
diff --git a/frame/compat/cblas/src/cblas_dcabs1.c b/frame/compat/cblas/src/cblas_dcabs1.c
index 63d25cda08..62f568e247 100644
--- a/frame/compat/cblas/src/cblas_dcabs1.c
+++ b/frame/compat/cblas/src/cblas_dcabs1.c
@@ -49,4 +49,4 @@ double cblas_dcabs1( const void *z )
 {
     return F77_dcabs1( (dcomplex*)z );
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/cblas/src/cblas_f77.h b/frame/compat/cblas/src/cblas_f77.h
index fabf3efb1c..be02986ae7 100644
--- a/frame/compat/cblas/src/cblas_f77.h
+++ b/frame/compat/cblas/src/cblas_f77.h
@@ -7,7 +7,7 @@
  *
  * (Heavily hacked down from the original)
  *
- * Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
  *
  */
 
@@ -172,6 +172,7 @@
 #define F77_sgemmt  sgemmt
 #define F77_cgemmt  cgemmt
 #define F77_zgemmt  zgemmt
+#define F77_dzgemm  dzgemm
 
 /*
 * Aux Function
@@ -207,159 +208,159 @@
  * Level 1 BLAS
  */
 #define F77_xerbla     xerbla_
-#define F77_srotg      srotg_
-#define F77_srotmg     srotmg_
-#define F77_srot       srot_
-#define F77_srotm      srotm_
-#define F77_drotg      drotg_
-#define F77_drotmg     drotmg_
-#define F77_drot       drot_
-#define F77_drotm      drotm_
-#define F77_sswap      sswap_
-#define F77_scopy      scopy_
-#define F77_saxpy      saxpy_
-#define F77_isamax_sub isamaxsub_
-#define F77_dswap      dswap_
-#define F77_dcopy      dcopy_
-#define F77_daxpy      daxpy_
-#define F77_idamax_sub idamaxsub_
-#define F77_cswap      cswap_
-#define F77_ccopy      ccopy_
-#define F77_caxpy      caxpy_
-#define F77_icamax_sub icamaxsub_
-#define F77_zswap      zswap_
-#define F77_zcopy      zcopy_
-#define F77_zaxpy      zaxpy_
-#define F77_zaxpby     zaxpby_
-#define F77_izamax_sub izamaxsub_
-#define F77_sdot_sub   sdotsub_
-#define F77_ddot_sub   ddotsub_
-#define F77_dsdot_sub  dsdotsub_
-#define F77_sscal      sscal_
-#define F77_dscal      dscal_
-#define F77_cscal      cscal_
-#define F77_zscal      zscal_
-#define F77_csscal     csscal_
-#define F77_zdscal     zdscal_
-#define F77_cdotu_sub  cdotusub_
-#define F77_cdotc_sub  cdotcsub_
-#define F77_zdotu_sub  zdotusub_
-#define F77_zdotc_sub  zdotcsub_
-#define F77_snrm2_sub  snrm2sub_
-#define F77_sasum_sub  sasumsub_
-#define F77_dnrm2_sub  dnrm2sub_
-#define F77_dasum_sub  dasumsub_
-#define F77_scnrm2_sub scnrm2sub_
-#define F77_scasum_sub scasumsub_
-#define F77_dznrm2_sub dznrm2sub_
-#define F77_dzasum_sub dzasumsub_
-#define F77_sdsdot_sub sdsdotsub_
+#define F77_srotg      srotg_blis_impl
+#define F77_srotmg     srotmg_blis_impl
+#define F77_srot       srot_blis_impl
+#define F77_srotm      srotm_blis_impl
+#define F77_drotg      drotg_blis_impl
+#define F77_drotmg     drotmg_blis_impl
+#define F77_drot       drot_blis_impl
+#define F77_drotm      drotm_blis_impl
+#define F77_sswap      sswap_blis_impl
+#define F77_scopy      scopy_blis_impl
+#define F77_saxpy      saxpy_blis_impl
+#define F77_isamax_sub isamaxsub_blis_impl
+#define F77_dswap      dswap_blis_impl
+#define F77_dcopy      dcopy_blis_impl
+#define F77_daxpy      daxpy_blis_impl
+#define F77_idamax_sub idamaxsub_blis_impl
+#define F77_cswap      cswap_blis_impl
+#define F77_ccopy      ccopy_blis_impl
+#define F77_caxpy      caxpy_blis_impl
+#define F77_icamax_sub icamaxsub_blis_impl
+#define F77_zswap      zswap_blis_impl
+#define F77_zcopy      zcopy_blis_impl
+#define F77_zaxpy      zaxpy_blis_impl
+#define F77_izamax_sub izamaxsub_blis_impl
+#define F77_sdot_sub   sdotsub_blis_impl
+#define F77_ddot_sub   ddotsub_blis_impl
+#define F77_dsdot_sub  dsdotsub_blis_impl
+#define F77_sscal      sscal_blis_impl
+#define F77_dscal      dscal_blis_impl
+#define F77_cscal      cscal_blis_impl
+#define F77_zscal      zscal_blis_impl
+#define F77_csscal     csscal_blis_impl
+#define F77_zdscal     zdscal_blis_impl
+#define F77_cdotu_sub  cdotusub_blis_impl
+#define F77_cdotc_sub  cdotcsub_blis_impl
+#define F77_zdotu_sub  zdotusub_blis_impl
+#define F77_zdotc_sub  zdotcsub_blis_impl
+#define F77_snrm2_sub  snrm2sub_blis_impl
+#define F77_sasum_sub  sasumsub_blis_impl
+#define F77_dnrm2_sub  dnrm2sub_blis_impl
+#define F77_dasum_sub  dasumsub_blis_impl
+#define F77_scnrm2_sub scnrm2sub_blis_impl
+#define F77_scasum_sub scasumsub_blis_impl
+#define F77_dznrm2_sub dznrm2sub_blis_impl
+#define F77_dzasum_sub dzasumsub_blis_impl
+#define F77_sdsdot_sub sdsdotsub_blis_impl
 /*
 * Level 2 BLAS
 */
-#define F77_ssymv ssymv_
-#define F77_ssbmv ssbmv_
-#define F77_sspmv sspmv_
-#define F77_sger  sger_
-#define F77_ssyr  ssyr_
-#define F77_sspr  sspr_
-#define F77_ssyr2 ssyr2_
-#define F77_sspr2 sspr2_
-#define F77_dsymv dsymv_
-#define F77_dsbmv dsbmv_
-#define F77_dspmv dspmv_
-#define F77_dger  dger_
-#define F77_dsyr  dsyr_
-#define F77_dspr  dspr_
-#define F77_dsyr2 dsyr2_
-#define F77_dspr2 dspr2_
-#define F77_chemv chemv_
-#define F77_chbmv chbmv_
-#define F77_chpmv chpmv_
-#define F77_cgeru cgeru_
-#define F77_cgerc cgerc_
-#define F77_cher  cher_
-#define F77_chpr  chpr_
-#define F77_cher2 cher2_
-#define F77_chpr2 chpr2_
-#define F77_zhemv zhemv_
-#define F77_zhbmv zhbmv_
-#define F77_zhpmv zhpmv_
-#define F77_zgeru zgeru_
-#define F77_zgerc zgerc_
-#define F77_zher  zher_
-#define F77_zhpr  zhpr_
-#define F77_zher2 zher2_
-#define F77_zhpr2 zhpr2_
-#define F77_sgemv sgemv_
-#define F77_sgbmv sgbmv_
-#define F77_strmv strmv_
-#define F77_stbmv stbmv_
-#define F77_stpmv stpmv_
-#define F77_strsv strsv_
-#define F77_stbsv stbsv_
-#define F77_stpsv stpsv_
-#define F77_dgemv dgemv_
-#define F77_dgbmv dgbmv_
-#define F77_dtrmv dtrmv_
-#define F77_dtbmv dtbmv_
-#define F77_dtpmv dtpmv_
-#define F77_dtrsv dtrsv_
-#define F77_dtbsv dtbsv_
-#define F77_dtpsv dtpsv_
-#define F77_cgemv cgemv_
-#define F77_cgbmv cgbmv_
-#define F77_ctrmv ctrmv_
-#define F77_ctbmv ctbmv_
-#define F77_ctpmv ctpmv_
-#define F77_ctrsv ctrsv_
-#define F77_ctbsv ctbsv_
-#define F77_ctpsv ctpsv_
-#define F77_zgemv zgemv_
-#define F77_zgbmv zgbmv_
-#define F77_ztrmv ztrmv_
-#define F77_ztbmv ztbmv_
-#define F77_ztpmv ztpmv_
-#define F77_ztrsv ztrsv_
-#define F77_ztbsv ztbsv_
-#define F77_ztpsv ztpsv_
+#define F77_ssymv ssymv_blis_impl
+#define F77_ssbmv ssbmv_blis_impl
+#define F77_sspmv sspmv_blis_impl
+#define F77_sger  sger_blis_impl
+#define F77_ssyr  ssyr_blis_impl
+#define F77_sspr  sspr_blis_impl
+#define F77_ssyr2 ssyr2_blis_impl
+#define F77_sspr2 sspr2_blis_impl
+#define F77_dsymv dsymv_blis_impl
+#define F77_dsbmv dsbmv_blis_impl
+#define F77_dspmv dspmv_blis_impl
+#define F77_dger  dger_blis_impl
+#define F77_dsyr  dsyr_blis_impl
+#define F77_dspr  dspr_blis_impl
+#define F77_dsyr2 dsyr2_blis_impl
+#define F77_dspr2 dspr2_blis_impl
+#define F77_chemv chemv_blis_impl
+#define F77_chbmv chbmv_blis_impl
+#define F77_chpmv chpmv_blis_impl
+#define F77_cgeru cgeru_blis_impl
+#define F77_cgerc cgerc_blis_impl
+#define F77_cher  cher_blis_impl
+#define F77_chpr  chpr_blis_impl
+#define F77_cher2 cher2_blis_impl
+#define F77_chpr2 chpr2_blis_impl
+#define F77_zhemv zhemv_blis_impl
+#define F77_zhbmv zhbmv_blis_impl
+#define F77_zhpmv zhpmv_blis_impl
+#define F77_zgeru zgeru_blis_impl
+#define F77_zgerc zgerc_blis_impl
+#define F77_zher  zher_blis_impl
+#define F77_zhpr  zhpr_blis_impl
+#define F77_zher2 zher2_blis_impl
+#define F77_zhpr2 zhpr2_blis_impl
+#define F77_sgemv sgemv_blis_impl
+#define F77_sgbmv sgbmv_blis_impl
+#define F77_strmv strmv_blis_impl
+#define F77_stbmv stbmv_blis_impl
+#define F77_stpmv stpmv_blis_impl
+#define F77_strsv strsv_blis_impl
+#define F77_stbsv stbsv_blis_impl
+#define F77_stpsv stpsv_blis_impl
+#define F77_dgemv dgemv_blis_impl
+#define F77_dgbmv dgbmv_blis_impl
+#define F77_dtrmv dtrmv_blis_impl
+#define F77_dtbmv dtbmv_blis_impl
+#define F77_dtpmv dtpmv_blis_impl
+#define F77_dtrsv dtrsv_blis_impl
+#define F77_dtbsv dtbsv_blis_impl
+#define F77_dtpsv dtpsv_blis_impl
+#define F77_cgemv cgemv_blis_impl
+#define F77_cgbmv cgbmv_blis_impl
+#define F77_ctrmv ctrmv_blis_impl
+#define F77_ctbmv ctbmv_blis_impl
+#define F77_ctpmv ctpmv_blis_impl
+#define F77_ctrsv ctrsv_blis_impl
+#define F77_ctbsv ctbsv_blis_impl
+#define F77_ctpsv ctpsv_blis_impl
+#define F77_zgemv zgemv_blis_impl
+#define F77_zgbmv zgbmv_blis_impl
+#define F77_ztrmv ztrmv_blis_impl
+#define F77_ztbmv ztbmv_blis_impl
+#define F77_ztpmv ztpmv_blis_impl
+#define F77_ztrsv ztrsv_blis_impl
+#define F77_ztbsv ztbsv_blis_impl
+#define F77_ztpsv ztpsv_blis_impl
 /*
 * Level 3 BLAS
 */
-#define F77_chemm  chemm_
-#define F77_cherk  cherk_
-#define F77_cher2k cher2k_
-#define F77_zhemm  zhemm_
-#define F77_zherk  zherk_
-#define F77_zher2k zher2k_
-#define F77_sgemm  sgemm_
-#define F77_ssymm  ssymm_
-#define F77_ssyrk  ssyrk_
-#define F77_ssyr2k ssyr2k_
-#define F77_strmm  strmm_
-#define F77_strsm  strsm_
-#define F77_dgemm  dgemm_
-#define F77_dsymm  dsymm_
-#define F77_dsyrk  dsyrk_
-#define F77_dsyr2k dsyr2k_
-#define F77_dtrmm  dtrmm_
-#define F77_dtrsm  dtrsm_
-#define F77_cgemm  cgemm_
-#define F77_csymm  csymm_
-#define F77_csyrk  csyrk_
-#define F77_csyr2k csyr2k_
-#define F77_ctrmm  ctrmm_
-#define F77_ctrsm  ctrsm_
-#define F77_zgemm  zgemm_
-#define F77_zsymm  zsymm_
-#define F77_zsyrk  zsyrk_
-#define F77_zsyr2k zsyr2k_
-#define F77_ztrmm  ztrmm_
-#define F77_ztrsm  ztrsm_
-#define F77_dgemmt  dgemmt_
-#define F77_sgemmt  sgemmt_
-#define F77_cgemmt  cgemmt_
-#define F77_zgemmt  zgemmt_
+#define F77_chemm  chemm_blis_impl
+#define F77_cherk  cherk_blis_impl
+#define F77_cher2k cher2k_blis_impl
+#define F77_zhemm  zhemm_blis_impl
+#define F77_zherk  zherk_blis_impl
+#define F77_zher2k zher2k_blis_impl
+#define F77_sgemm  sgemm_blis_impl
+#define F77_ssymm  ssymm_blis_impl
+#define F77_ssyrk  ssyrk_blis_impl
+#define F77_ssyr2k ssyr2k_blis_impl
+#define F77_strmm  strmm_blis_impl
+#define F77_strsm  strsm_blis_impl
+#define F77_dgemm  dgemm_blis_impl
+#define F77_dsymm  dsymm_blis_impl
+#define F77_dsyrk  dsyrk_blis_impl
+#define F77_dsyr2k dsyr2k_blis_impl
+#define F77_dtrmm  dtrmm_blis_impl
+#define F77_dtrsm  dtrsm_blis_impl
+#define F77_cgemm  cgemm_blis_impl
+#define F77_csymm  csymm_blis_impl
+#define F77_csyrk  csyrk_blis_impl
+#define F77_csyr2k csyr2k_blis_impl
+#define F77_ctrmm  ctrmm_blis_impl
+#define F77_ctrsm  ctrsm_blis_impl
+#define F77_zgemm  zgemm_blis_impl
+#define F77_zsymm  zsymm_blis_impl
+#define F77_zsyrk  zsyrk_blis_impl
+#define F77_zsyr2k zsyr2k_blis_impl
+#define F77_ztrmm  ztrmm_blis_impl
+#define F77_ztrsm  ztrsm_blis_impl
+#define F77_dgemmt  dgemmt_blis_impl
+#define F77_sgemmt  sgemmt_blis_impl
+#define F77_cgemmt  cgemmt_blis_impl
+#define F77_zgemmt  zgemmt_blis_impl
+#define F77_dzgemm  dzgemm_blis_impl
 
 /*
 * Aux Function
@@ -371,17 +372,17 @@
  * -- BLAS Extension APIs --
  */
 
-#define F77_saxpby     saxpby_
-#define F77_daxpby     daxpby_
-#define F77_caxpby     caxpby_
-#define F77_zaxpby     zaxpby_
-#define F77_cgemm3m    cgemm3m_
-#define F77_zgemm3m    zgemm3m_
+#define F77_saxpby     saxpby_blis_impl
+#define F77_daxpby     daxpby_blis_impl
+#define F77_caxpby     caxpby_blis_impl
+#define F77_zaxpby     zaxpby_blis_impl
+#define F77_cgemm3m    cgemm3m_blis_impl
+#define F77_zgemm3m    zgemm3m_blis_impl
 
-#define F77_isamin_sub isaminsub_
-#define F77_idamin_sub idaminsub_
-#define F77_icamin_sub icaminsub_
-#define F77_izamin_sub izaminsub_
+#define F77_isamin_sub isaminsub_blis_impl
+#define F77_idamin_sub idaminsub_blis_impl
+#define F77_icamin_sub icaminsub_blis_impl
+#define F77_izamin_sub izaminsub_blis_impl
 
 // -- Batch APIs --
 #define F77_sgemm_batch  sgemm_batch_
@@ -390,4 +391,4 @@
 #define F77_zgemm_batch  zgemm_batch_
 #endif
 
-#endif /*  CBLAS_F77_H */
\ No newline at end of file
+#endif /*  CBLAS_F77_H */
diff --git a/frame/compat/cblas/src/cblas_icamin.c b/frame/compat/cblas/src/cblas_icamin.c
index 7c8d5f9b08..c9019b00c7 100644
--- a/frame/compat/cblas/src/cblas_icamin.c
+++ b/frame/compat/cblas/src/cblas_icamin.c
@@ -23,4 +23,4 @@ f77_int cblas_icamin( f77_int N, const void *X, f77_int incX)
    F77_icamin_sub( &F77_N, (scomplex*)X, &F77_incX, &iamin);
    return iamin ? iamin-1 : 0;
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/cblas/src/cblas_scabs1.c b/frame/compat/cblas/src/cblas_scabs1.c
index 5eea790ee5..f9f6ca7bd3 100644
--- a/frame/compat/cblas/src/cblas_scabs1.c
+++ b/frame/compat/cblas/src/cblas_scabs1.c
@@ -47,4 +47,4 @@ float cblas_scabs1( const void *z )
 {
     return F77_scabs1( (scomplex*) z );
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/frame/compat/check/bla_gemm3m_check.h b/frame/compat/check/bla_gemm3m_check.h
index cbfd0236fe..b5a2887ce0 100644
--- a/frame/compat/check/bla_gemm3m_check.h
+++ b/frame/compat/check/bla_gemm3m_check.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +32,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_gemm3m_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
 { \
 	f77_int info = 0; \
@@ -42,12 +40,12 @@
 	f77_int ta,    tb; \
 	f77_int nrowa, nrowb; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -79,10 +77,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_gemm_check.h b/frame/compat/check/bla_gemm_check.h
index f500e092e2..8b68b22e0c 100644
--- a/frame/compat/check/bla_gemm_check.h
+++ b/frame/compat/check/bla_gemm_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_gemm_check( dt_str, op_str, transa, transb, m, n, k, lda, ldb, ldc ) \
 { \
 	f77_int info = 0; \
@@ -42,12 +41,12 @@
 	f77_int ta,    tb; \
 	f77_int nrowa, nrowb; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -79,10 +78,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_gemmt_check.h b/frame/compat/check/bla_gemmt_check.h
index d08ab5558f..fb81c70732 100644
--- a/frame/compat/check/bla_gemmt_check.h
+++ b/frame/compat/check/bla_gemmt_check.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +32,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_gemmt_check( dt_str, op_str, uploc, transa, transb, n, k, lda, ldb, ldc ) \
 { \
 	f77_int info = 0; \
@@ -43,15 +41,15 @@
 	f77_int lower, upper; \
 	f77_int nrowa, nrowb; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	notb  = PASTEF770(lsame)( transb, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	conjb = PASTEF770(lsame)( transb, "C", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	tb    = PASTEF770(lsame)( transb, "T", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	notb  = PASTE_LSAME( transb, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	conjb = PASTE_LSAME( transb, "C", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	tb    = PASTE_LSAME( transb, "T", (ftnlen)1, (ftnlen)1 ); \
 \
-	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *n; } \
 	else        { nrowa = *k; } \
@@ -83,10 +81,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_gemv_check.h b/frame/compat/check/bla_gemv_check.h
index e827c048f1..33b0665f56 100644
--- a/frame/compat/check/bla_gemv_check.h
+++ b/frame/compat/check/bla_gemv_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,16 +33,14 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_gemv_check( dt_str, op_str, transa, m, n, lda, incx, incy ) \
 { \
 	f77_int info = 0; \
 	f77_int nota, ta, conja; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !nota && !ta && !conja ) \
 		info = 1; \
@@ -64,10 +63,9 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
 
-#endif
diff --git a/frame/compat/check/bla_ger_check.h b/frame/compat/check/bla_ger_check.h
index cdf008d8f1..cd2f4c8de3 100644
--- a/frame/compat/check/bla_ger_check.h
+++ b/frame/compat/check/bla_ger_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_ger_check( dt_str, op_str, conj_str, m, n, incx, incy, lda ) \
 { \
 	f77_int info = 0; \
@@ -59,10 +58,9 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
 
-#endif
diff --git a/frame/compat/check/bla_hemm_check.h b/frame/compat/check/bla_hemm_check.h
index a450391c05..342f485f9f 100644
--- a/frame/compat/check/bla_hemm_check.h
+++ b/frame/compat/check/bla_hemm_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_hemm_check( dt_str, op_str, sidea, uploa, m, n, lda, ldb, ldc ) \
 { \
 	f77_int info = 0; \
@@ -41,10 +40,10 @@
 	f77_int lower, upper; \
 	f77_int nrowa; \
 \
-	left  = PASTEF770(lsame)( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
-	right = PASTEF770(lsame)( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	left  = PASTE_LSAME( sidea, "L", (ftnlen)1, (ftnlen)1 ); \
+	right = PASTE_LSAME( sidea, "R", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( left ) { nrowa = *m; } \
 	else        { nrowa = *n; } \
@@ -72,10 +71,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_hemv_check.h b/frame/compat/check/bla_hemv_check.h
index d5865f2eba..716d434f26 100644
--- a/frame/compat/check/bla_hemv_check.h
+++ b/frame/compat/check/bla_hemv_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,15 +33,13 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_hemv_check( dt_str, op_str, uploa, m, lda, incx, incy ) \
 { \
 	f77_int info = 0; \
 	f77_int lower, upper; \
 \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -61,10 +60,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_her2_check.h b/frame/compat/check/bla_her2_check.h
index 5be7299f24..f9e100612e 100644
--- a/frame/compat/check/bla_her2_check.h
+++ b/frame/compat/check/bla_her2_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,15 +33,13 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_her2_check( dt_str, op_str, uploa, m, incx, incy, lda ) \
 { \
 	f77_int info = 0; \
 	f77_int lower, upper; \
 \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -61,10 +60,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_her2k_check.h b/frame/compat/check/bla_her2k_check.h
index bdca4019b0..631977d245 100644
--- a/frame/compat/check/bla_her2k_check.h
+++ b/frame/compat/check/bla_her2k_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_her2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
 { \
 	f77_int info = 0; \
@@ -41,10 +40,10 @@
 	f77_int lower, upper; \
 	f77_int nrowa; \
 \
-	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( trans, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( trans, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -72,10 +71,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_her_check.h b/frame/compat/check/bla_her_check.h
index e1a21709e9..4120f8bf9d 100644
--- a/frame/compat/check/bla_her_check.h
+++ b/frame/compat/check/bla_her_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,15 +33,13 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_her_check( dt_str, op_str, uploa, m, incx, lda ) \
 { \
 	f77_int info = 0; \
 	f77_int lower, upper; \
 \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -59,10 +58,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_herk_check.h b/frame/compat/check/bla_herk_check.h
index 029ad38fec..dca2314419 100644
--- a/frame/compat/check/bla_herk_check.h
+++ b/frame/compat/check/bla_herk_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_herk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
 { \
 	f77_int info = 0; \
@@ -41,10 +40,10 @@
 	f77_int lower, upper; \
 	f77_int nrowa; \
 \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -70,10 +69,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_symm_check.h b/frame/compat/check/bla_symm_check.h
index 14ae3e1bf3..33aff09041 100644
--- a/frame/compat/check/bla_symm_check.h
+++ b/frame/compat/check/bla_symm_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,4 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_symm_check bla_hemm_check
-
-#endif
diff --git a/frame/compat/check/bla_symv_check.h b/frame/compat/check/bla_symv_check.h
index 712b90b768..7efe08c483 100644
--- a/frame/compat/check/bla_symv_check.h
+++ b/frame/compat/check/bla_symv_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,4 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_symv_check bla_hemv_check
-
-#endif
diff --git a/frame/compat/check/bla_syr2_check.h b/frame/compat/check/bla_syr2_check.h
index a5b6820209..a3091269c3 100644
--- a/frame/compat/check/bla_syr2_check.h
+++ b/frame/compat/check/bla_syr2_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,4 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_syr2_check bla_her2_check
-
-#endif
diff --git a/frame/compat/check/bla_syr2k_check.h b/frame/compat/check/bla_syr2k_check.h
index d290d3f8b1..66bffae1b5 100644
--- a/frame/compat/check/bla_syr2k_check.h
+++ b/frame/compat/check/bla_syr2k_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_syr2k_check( dt_str, op_str, uploa, trans, m, k, lda, ldb, ldc ) \
 { \
 	f77_int info = 0; \
@@ -45,11 +44,11 @@
 	static char* dt_cst = dt_str; \
 \
 	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
-	nota  = PASTEF770(lsame)( trans, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( trans, "T", (ftnlen)1, (ftnlen)1 ); \
-	cta   = PASTEF770(lsame)( trans, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( trans, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( trans, "T", (ftnlen)1, (ftnlen)1 ); \
+	cta   = PASTE_LSAME( trans, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa, "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa, "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -77,10 +76,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_syr_check.h b/frame/compat/check/bla_syr_check.h
index 41070a0350..858fe3831a 100644
--- a/frame/compat/check/bla_syr_check.h
+++ b/frame/compat/check/bla_syr_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,4 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_syr_check bla_her_check
-
-#endif
diff --git a/frame/compat/check/bla_syrk_check.h b/frame/compat/check/bla_syrk_check.h
index ea140682c1..8a42eaae36 100644
--- a/frame/compat/check/bla_syrk_check.h
+++ b/frame/compat/check/bla_syrk_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_syrk_check( dt_str, op_str, uploa, transa, m, k, lda, ldc ) \
 { \
 	f77_int info = 0; \
@@ -45,11 +44,11 @@
 	static char* dt_cst = dt_str; \
 \
 	is_r  = ( dt_cst[0] == 's' || dt_cst[0] == 'd' ); \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	cta   = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploc,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploc,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	cta   = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa,  "L", (ftnlen)1, (ftnlen)1 ); /* test the uploa macro argument, not a caller-scope name */ \
+	upper = PASTE_LSAME( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( nota ) { nrowa = *m; } \
 	else        { nrowa = *k; } \
@@ -75,10 +74,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_trmm_check.h b/frame/compat/check/bla_trmm_check.h
index 5dba6b051c..af9d8ce493 100644
--- a/frame/compat/check/bla_trmm_check.h
+++ b/frame/compat/check/bla_trmm_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_trmm_check( dt_str, op_str, sidea, uploa, transa, diaga, m, n, lda, ldb ) \
 { \
 	f77_int info = 0; \
@@ -43,15 +42,15 @@
 	f77_int unita, nonua; \
 	f77_int nrowa; \
 \
-	left  = PASTEF770(lsame)( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
-	right = PASTEF770(lsame)( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
-	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
+	left  = PASTE_LSAME( sidea,  "L", (ftnlen)1, (ftnlen)1 ); \
+	right = PASTE_LSAME( sidea,  "R", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	unita = PASTE_LSAME( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nonua = PASTE_LSAME( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
 \
 	if ( left ) { nrowa = *m; } \
 	else        { nrowa = *n; } \
@@ -81,10 +80,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_trmv_check.h b/frame/compat/check/bla_trmv_check.h
index 108a2c70ba..e4577738fb 100644
--- a/frame/compat/check/bla_trmv_check.h
+++ b/frame/compat/check/bla_trmv_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,6 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_trmv_check( dt_str, op_str, uploa, transa, diaga, m, lda, incx ) \
 { \
 	f77_int info = 0; \
@@ -41,13 +40,13 @@
 	f77_int nota, ta, conja; \
 	f77_int unita, nonua; \
 \
-	lower = PASTEF770(lsame)( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
-	upper = PASTEF770(lsame)( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nota  = PASTEF770(lsame)( transa, "N", (ftnlen)1, (ftnlen)1 ); \
-	ta    = PASTEF770(lsame)( transa, "T", (ftnlen)1, (ftnlen)1 ); \
-	conja = PASTEF770(lsame)( transa, "C", (ftnlen)1, (ftnlen)1 ); \
-	unita = PASTEF770(lsame)( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
-	nonua = PASTEF770(lsame)( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
+	lower = PASTE_LSAME( uploa,  "L", (ftnlen)1, (ftnlen)1 ); \
+	upper = PASTE_LSAME( uploa,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nota  = PASTE_LSAME( transa, "N", (ftnlen)1, (ftnlen)1 ); \
+	ta    = PASTE_LSAME( transa, "T", (ftnlen)1, (ftnlen)1 ); \
+	conja = PASTE_LSAME( transa, "C", (ftnlen)1, (ftnlen)1 ); \
+	unita = PASTE_LSAME( diaga,  "U", (ftnlen)1, (ftnlen)1 ); \
+	nonua = PASTE_LSAME( diaga,  "N", (ftnlen)1, (ftnlen)1 ); \
 \
 	if      ( !lower && !upper ) \
 		info = 1; \
@@ -70,10 +69,8 @@
 \
 		bli_string_mkupper( func_str ); \
 \
-		PASTEF770(xerbla)( func_str, &info, (ftnlen)6 ); \
+		PASTE_XERBLA( func_str, &info, (ftnlen)6 ); \
 \
 		return; \
 	} \
 }
-
-#endif
diff --git a/frame/compat/check/bla_trsm_check.h b/frame/compat/check/bla_trsm_check.h
index 7f30cec0f5..2372770bc8 100644
--- a/frame/compat/check/bla_trsm_check.h
+++ b/frame/compat/check/bla_trsm_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,4 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_trsm_check bla_trmm_check
-
-#endif
diff --git a/frame/compat/check/bla_trsv_check.h b/frame/compat/check/bla_trsv_check.h
index 68f690664c..fef8b5b790 100644
--- a/frame/compat/check/bla_trsv_check.h
+++ b/frame/compat/check/bla_trsv_check.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,8 +33,4 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define bla_trsv_check bla_trmv_check
-
-#endif
diff --git a/frame/compat/f2c/bla_cabs1.c b/frame/compat/f2c/bla_cabs1.c
index 6f29156708..56b57de0c2 100644
--- a/frame/compat/f2c/bla_cabs1.c
+++ b/frame/compat/f2c/bla_cabs1.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,14 +35,12 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* scabs1.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ bla_real PASTEF77(s,cabs1)(bla_scomplex *z)
+/* Subroutine */ bla_real PASTEF77S(s,cabs1)(bla_scomplex *z)
 {
    if ( bli_creal(*z) == 0.0f && bli_cimag(*z) == 0.0f )
    {
@@ -62,7 +60,7 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ bla_double PASTEF77(d,cabs1)(bla_dcomplex *z)
+/* Subroutine */ bla_double PASTEF77S(d,cabs1)(bla_dcomplex *z)
 {
    if ( bli_creal(*z) == 0.0 && bli_cimag(*z) == 0.0 )
    {
@@ -78,5 +76,17 @@
 
 } /* dcabs1_ */
 
+
+#ifdef BLIS_ENABLE_BLAS
+/* PASTEF77-named entry points, compiled only when BLIS_ENABLE_BLAS is defined; each forwards z unchanged to the PASTEF77S-named routine defined earlier in this file and returns its result. */
+/* Subroutine */ bla_real PASTEF77(s,cabs1)(bla_scomplex *z)
+{
+  return PASTEF77S(s,cabs1)(z);
+}
+/* Subroutine */ bla_double PASTEF77(d,cabs1)(bla_dcomplex *z)
+{
+  return PASTEF77S(d,cabs1)(z);
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_cabs1.h b/frame/compat/f2c/bla_cabs1.h
index 753765a1d2..d7f8aff1e2 100644
--- a/frame/compat/f2c/bla_cabs1.h
+++ b/frame/compat/f2c/bla_cabs1.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS bla_real   PASTEF77(s,cabs1)(bla_scomplex *z);
 BLIS_EXPORT_BLAS bla_double PASTEF77(d,cabs1)(bla_dcomplex *z);
 
 #endif
+/* Prototypes for the PASTEF77S-named cabs1 routines; placed after the #endif above, i.e. outside that conditional block (presumably the BLIS_ENABLE_BLAS guard -- confirm). */
+BLIS_EXPORT_BLAS bla_real   PASTEF77S(s,cabs1)(bla_scomplex *z);
+BLIS_EXPORT_BLAS bla_double PASTEF77S(d,cabs1)(bla_dcomplex *z);
diff --git a/frame/compat/f2c/bla_gbmv.c b/frame/compat/f2c/bla_gbmv.c
index d53dd322ad..d6c4076fd8 100644
--- a/frame/compat/f2c/bla_gbmv.c
+++ b/frame/compat/f2c/bla_gbmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* cgbmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj;
     bla_integer kup1;
 
@@ -203,8 +203,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -223,7 +223,7 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CGBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -234,12 +234,12 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
 
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -308,7 +308,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -482,7 +482,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -491,9 +492,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_integer kup1;
 
 /*     .. Scalar Arguments .. */
@@ -635,8 +636,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -655,7 +656,7 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DGBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -668,7 +669,7 @@
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -729,7 +730,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -838,7 +839,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -847,9 +849,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_integer kup1;
 
 /*     .. Scalar Arguments .. */
@@ -991,8 +993,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -1011,7 +1013,7 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SGBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("SGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1024,7 +1026,7 @@
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -1085,7 +1087,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -1194,7 +1196,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -1207,9 +1210,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer lenx, leny, i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj;
     bla_integer kup1;
 
@@ -1356,8 +1359,8 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "T", (
-	    ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (ftnlen)1)
+    if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "T", (
+	    ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (ftnlen)1)
 	    ) {
 	info = 1;
     } else if (*m < 0) {
@@ -1376,7 +1379,7 @@
 	info = 13;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZGBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZGBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1387,12 +1390,12 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
 
 /*     Set  LENX  and  LENY, the lengths of the vectors x and y, and set */
 /*     up the start points in  X  and  Y. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 	lenx = *n;
 	leny = *m;
     } else {
@@ -1461,7 +1464,7 @@
 	return 0;
     }
     kup1 = *ku + 1;
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y := alpha*A*x + y. */
 
@@ -1630,5 +1633,26 @@
 
 } /* zgbmv_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* PASTEF77-named gbmv entry points, compiled only when BLIS_ENABLE_BLAS is defined; each passes its arguments through unchanged to the matching PASTEF77S-named routine defined earlier in this file and returns its int result. */
+int PASTEF77(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
+{
+  return PASTEF77S(s,gbmv)( trans, m, n, kl, ku, alpha, a, lda, x,  incx, beta, y, incy );
+}
+
+int PASTEF77(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
+{
+  return PASTEF77S(d,gbmv)( trans, m, n, kl, ku, alpha, a, lda, x,  incx, beta, y, incy );
+}
+int PASTEF77(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
+{
+  return PASTEF77S(c,gbmv)( trans, m, n, kl, ku, alpha, a, lda, x,  incx, beta, y, incy );
+}
+
+int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy)
+{
+  return PASTEF77S(z,gbmv)( trans, m, n, kl, ku, alpha, a, lda, x,  incx, beta, y, incy );
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_gbmv.h b/frame/compat/f2c/bla_gbmv.h
index eb8ce25342..079b4e355a 100644
--- a/frame/compat/f2c/bla_gbmv.h
+++ b/frame/compat/f2c/bla_gbmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,8 @@ BLIS_EXPORT_BLAS int PASTEF77(s,gbmv)(const bla_character *trans, const bla_inte
 BLIS_EXPORT_BLAS int PASTEF77(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);
 
 #endif
+/* Prototypes for the PASTEF77S-named gbmv routines; placed after the #endif above, i.e. outside that conditional block (presumably the BLIS_ENABLE_BLAS guard -- confirm). */
+BLIS_EXPORT_BLAS int PASTEF77S(c,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(d,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(s,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer * incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(z,gbmv)(const bla_character *trans, const bla_integer *m, const bla_integer *n, const bla_integer *kl, const bla_integer *ku, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex * y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_hbmv.c b/frame/compat/f2c/bla_hbmv.c
index 198336d048..f07e80f394 100644
--- a/frame/compat/f2c/bla_hbmv.c
+++ b/frame/compat/f2c/bla_hbmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* chbmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex * alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex * alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -204,7 +204,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -219,7 +219,7 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CHBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -293,7 +293,7 @@
     if (bli_creal(*alpha) == 0.f && bli_cimag(*alpha) == 0.f) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -487,7 +487,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer * incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer * incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
@@ -501,9 +502,9 @@
     bla_integer info;
     bla_dcomplex temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -650,7 +651,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -665,7 +666,7 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZHBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -739,7 +740,7 @@
     if (bli_zreal(*alpha) == 0. && bli_zimag(*alpha) == 0.) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -928,5 +929,17 @@
 
 } /* zhbmv_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* PASTEF77-named hbmv entry points, compiled only when BLIS_ENABLE_BLAS is defined; each passes its arguments through unchanged to the matching PASTEF77S-named routine defined earlier in this file and returns its int result. */
+int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex * alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
+{
+  return PASTEF77S(c,hbmv)( uplo, n, k,  alpha, a, lda, x, incx, beta, y, incy ); 
+}
+
+int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer * incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy)
+{
+  return PASTEF77S(z,hbmv)( uplo, n, k,  alpha, a, lda, x, incx, beta, y, incy ); 
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_hbmv.h b/frame/compat/f2c/bla_hbmv.h
index 1ddb838071..6a2880ed6d 100644
--- a/frame/compat/f2c/bla_hbmv.h
+++ b/frame/compat/f2c/bla_hbmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(c,hbmv)(const bla_character *uplo, const bla_integ
 BLIS_EXPORT_BLAS int PASTEF77(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);
 
 #endif
+/* Prototypes for the PASTEF77S hbmv routines; they follow the #endif above, so that conditional (presumably BLIS_ENABLE_BLAS -- confirm) does not hide them. */
+BLIS_EXPORT_BLAS int PASTEF77S(c,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_scomplex *alpha, const bla_scomplex *a, const bla_integer *lda, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(z,hbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_dcomplex *alpha, const bla_dcomplex *a, const bla_integer *lda, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_hpmv.c b/frame/compat/f2c/bla_hpmv.c
index 0d7ebce9d7..9743aaf835 100644
--- a/frame/compat/f2c/bla_hpmv.c
+++ b/frame/compat/f2c/bla_hpmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* chpmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex * ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex * ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -168,7 +168,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -179,7 +179,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CHPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -254,7 +254,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
@@ -439,7 +439,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy)
+/* Subroutine */ 
+int PASTEF77S(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -453,9 +454,9 @@
     bla_integer info;
     bla_dcomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -566,7 +567,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -577,7 +578,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZHPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -652,7 +653,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
@@ -832,5 +833,17 @@
 
 } /* zhpmv_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named hpmv entry points, built only when BLIS_ENABLE_BLAS is defined; each passes its arguments unchanged to the matching PASTEF77S routine and returns that routine's result. */
+int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex * ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy)
+{
+  return PASTEF77S(c,hpmv)( uplo, n, alpha,  ap, x, incx, beta, y, incy ); 
+} /* PASTEF77(c,hpmv) */
+
+int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy)
+{
+  return PASTEF77S(z,hpmv)( uplo, n, alpha,  ap, x, incx, beta, y, incy ); 
+} /* PASTEF77(z,hpmv) */
+
 #endif
 
diff --git a/frame/compat/f2c/bla_hpmv.h b/frame/compat/f2c/bla_hpmv.h
index 26d055effd..4c2818aab6 100644
--- a/frame/compat/f2c/bla_hpmv.h
+++ b/frame/compat/f2c/bla_hpmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(c,hpmv)(const bla_character *uplo, const bla_integ
 BLIS_EXPORT_BLAS int PASTEF77(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);
 
 #endif
+/* Prototypes for the PASTEF77S hpmv routines; they follow the #endif above, so that conditional (presumably BLIS_ENABLE_BLAS -- confirm) does not hide them. */
+BLIS_EXPORT_BLAS int PASTEF77S(c,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *ap, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *beta, bla_scomplex *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(z,hpmv)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *ap, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *beta, bla_dcomplex *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_hpr.c b/frame/compat/f2c/bla_hpr.c
index da1f0a0f39..f3f591a8cc 100644
--- a/frame/compat/f2c/bla_hpr.c
+++ b/frame/compat/f2c/bla_hpr.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* chpr.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap)
+/* Subroutine */ 
+int PASTEF77S(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -155,7 +155,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -164,7 +164,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHPR  ", &info, (ftnlen)6);
+	PASTE_XERBLA("CHPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -186,7 +186,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -353,7 +353,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap)
+/* Subroutine */ 
+int PASTEF77S(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -367,9 +368,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -467,7 +468,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -476,7 +477,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHPR  ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZHPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -498,7 +499,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -660,5 +661,17 @@
 
 } /* zhpr_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named hpr entry points, built only when BLIS_ENABLE_BLAS is defined; each passes its arguments unchanged to the matching PASTEF77S routine and returns that routine's result. */
+int PASTEF77(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap)
+{
+  return PASTEF77S(c,hpr)( uplo, n, alpha, x, incx, ap ); 
+} /* PASTEF77(c,hpr) */
+
+int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap)
+{
+  return PASTEF77S(z,hpr)( uplo, n, alpha, x, incx, ap ); 
+} /* PASTEF77(z,hpr) */
+
 #endif
 
diff --git a/frame/compat/f2c/bla_hpr.h b/frame/compat/f2c/bla_hpr.h
index cfce9e1779..02c9e033d1 100644
--- a/frame/compat/f2c/bla_hpr.h
+++ b/frame/compat/f2c/bla_hpr.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(c,hpr)(const bla_character *uplo, const bla_intege
 BLIS_EXPORT_BLAS int PASTEF77(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);
 
 #endif
+/* Prototypes for the PASTEF77S hpr routines; they follow the #endif above, so that conditional (presumably BLIS_ENABLE_BLAS -- confirm) does not hide them. */
+BLIS_EXPORT_BLAS int PASTEF77S(c,hpr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_scomplex *x, const bla_integer *incx, bla_scomplex *ap);
+BLIS_EXPORT_BLAS int PASTEF77S(z,hpr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_dcomplex *x, const bla_integer *incx, bla_dcomplex *ap);
diff --git a/frame/compat/f2c/bla_hpr2.c b/frame/compat/f2c/bla_hpr2.c
index c78c1eec04..75d0c54169 100644
--- a/frame/compat/f2c/bla_hpr2.c
+++ b/frame/compat/f2c/bla_hpr2.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* chpr2.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap)
+/* Subroutine */ 
+int PASTEF77S(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5, i__6;
@@ -55,9 +55,9 @@
     bla_integer info;
     bla_scomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -167,7 +167,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -178,7 +178,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CHPR2 ", &info, (ftnlen)6);
+	PASTE_XERBLA("CHPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -210,7 +210,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -429,7 +429,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap)
+/* Subroutine */ 
+int PASTEF77S(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5, i__6;
@@ -443,9 +444,9 @@
     bla_integer info;
     bla_dcomplex temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -555,7 +556,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -566,7 +567,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZHPR2 ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZHPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -598,7 +599,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -812,5 +813,17 @@
 
 } /* zhpr2_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named hpr2 entry points, built only when BLIS_ENABLE_BLAS is defined; each passes its arguments unchanged to the matching PASTEF77S routine and returns that routine's result. */
+int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap)
+{
+  return PASTEF77S(c,hpr2)( uplo, n, alpha, x, incx, y, incy, ap ); 
+} /* PASTEF77(c,hpr2) */
+
+int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap)
+{
+  return PASTEF77S(z,hpr2)( uplo, n, alpha, x, incx, y, incy, ap ); 
+} /* PASTEF77(z,hpr2) */
+
 #endif
 
diff --git a/frame/compat/f2c/bla_hpr2.h b/frame/compat/f2c/bla_hpr2.h
index 16f929d611..a8feab31b7 100644
--- a/frame/compat/f2c/bla_hpr2.h
+++ b/frame/compat/f2c/bla_hpr2.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(c,hpr2)(const bla_character *uplo, const bla_integ
 BLIS_EXPORT_BLAS int PASTEF77(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);
 
 #endif
+/* Prototypes for the PASTEF77S hpr2 routines; they follow the #endif above, so that conditional (presumably BLIS_ENABLE_BLAS -- confirm) does not hide them. */
+BLIS_EXPORT_BLAS int PASTEF77S(c,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_scomplex *alpha, const bla_scomplex *x, const bla_integer *incx, const bla_scomplex *y, const bla_integer *incy, bla_scomplex *ap);
+BLIS_EXPORT_BLAS int PASTEF77S(z,hpr2)(const bla_character *uplo, const bla_integer *n, const bla_dcomplex *alpha, const bla_dcomplex *x, const bla_integer *incx, const bla_dcomplex *y, const bla_integer *incy, bla_dcomplex *ap);
diff --git a/frame/compat/f2c/bla_lsame.c b/frame/compat/f2c/bla_lsame.c
index edee918d13..9e6135d874 100644
--- a/frame/compat/f2c/bla_lsame.c
+++ b/frame/compat/f2c/bla_lsame.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,18 +35,15 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* lsame.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-
 #ifdef LAPACK_ILP64
-long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len)
+long lsame_blis_impl(const char *ca, const char *cb, long ca_len, long cb_len)
 #else
-int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len)
+int lsame_blis_impl(const char *ca, const char *cb, int ca_len, int cb_len)
 #endif
 {
     /* System generated locals */
@@ -147,6 +145,22 @@ int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len)
 
 /*     End of LSAME */
 
+    return ret_val;
+} /* lsame_blis_impl */
+
+
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named lsame entry point, built only when BLIS_ENABLE_BLAS is defined: returns the value of lsame_blis_impl() called with the same four arguments. */
+#ifdef LAPACK_ILP64
+long PASTEF770(lsame)(const char *ca, const char *cb, long ca_len, long cb_len)
+#else
+int PASTEF770(lsame)(const char *ca, const char *cb, int ca_len, int cb_len)
+#endif /* LAPACK_ILP64 */
+{
+    /* Holds the value handed back by lsame_blis_impl() */
+    bla_logical ret_val;
+
+    ret_val = lsame_blis_impl(ca, cb, ca_len, cb_len);
     return ret_val;
 } /* lsame */
 
diff --git a/frame/compat/f2c/bla_lsame.h b/frame/compat/f2c/bla_lsame.h
index 656032688d..1d3113484b 100644
--- a/frame/compat/f2c/bla_lsame.h
+++ b/frame/compat/f2c/bla_lsame.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,6 +33,12 @@
 
 */
 
+#ifdef LAPACK_ILP64 /* lsame_blis_impl prototype: long lengths and result here, int otherwise */
+long lsame_blis_impl(const char *ca, const char *cb, long ca_len, long cb_len);
+#else
+BLIS_EXPORT_BLAS int lsame_blis_impl(const char *ca, const char *cb, int ca_len, int cb_len);
+#endif /* LAPACK_ILP64 */
+/* NOTE(review): only the non-ILP64 prototype carries BLIS_EXPORT_BLAS -- confirm this asymmetry is intended. */
 #ifdef BLIS_ENABLE_BLAS
 
 #ifdef LAPACK_ILP64
diff --git a/frame/compat/f2c/bla_rot.c b/frame/compat/f2c/bla_rot.c
index c79769bc05..44430ae646 100644
--- a/frame/compat/f2c/bla_rot.c
+++ b/frame/compat/f2c/bla_rot.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* srot.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s)
+/* Subroutine */ 
+int PASTEF77S(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s)
 {
     /* System generated locals */
     bla_integer i__1;
@@ -109,7 +109,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s)
+/* Subroutine */ 
+int PASTEF77S(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s)
 {
     /* System generated locals */
     bla_integer i__1;
@@ -172,12 +173,13 @@
     return 0;
 } /* drot_ */
 
+
 /* csrot.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s)
+/* Subroutine */ int PASTEF77S(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4;
@@ -270,7 +272,7 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s)
+/* Subroutine */ int PASTEF77S(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4;
@@ -358,5 +360,27 @@
     return 0;
 } /* zdrot_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named rot entry points, built only when BLIS_ENABLE_BLAS is defined; each passes its arguments unchanged to the matching PASTEF77S routine and returns that routine's result. */
+int PASTEF77(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s)
+{
+  return PASTEF77S(s,rot)( n, sx, incx, sy, incy, c__, s );
+} /* PASTEF77(s,rot) */
+
+int PASTEF77(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s)
+{
+  return PASTEF77S(d,rot)( n, dx, incx, dy, incy, c__, s );
+} /* PASTEF77(d,rot) */
+
+int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s)
+{
+  return PASTEF77S(cs,rot)(n, cx, incx, cy, incy, c__, s);
+} /* PASTEF77(cs,rot) */
+
+int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s)
+{
+  return PASTEF77S(zd,rot)(n, zx, incx, zy, incy, c__, s);
+} /* PASTEF77(zd,rot) */
+
 #endif
 
diff --git a/frame/compat/f2c/bla_rot.h b/frame/compat/f2c/bla_rot.h
index 6093555600..c0d5ec90f7 100644
--- a/frame/compat/f2c/bla_rot.h
+++ b/frame/compat/f2c/bla_rot.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,8 @@ BLIS_EXPORT_BLAS int PASTEF77(cs,rot)(const bla_integer *n, bla_scomplex *cx, co
 BLIS_EXPORT_BLAS int PASTEF77(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
 
 #endif
+/* Prototypes for the PASTEF77S rot routines; they follow the #endif above, so that conditional (presumably BLIS_ENABLE_BLAS -- confirm) does not hide them. */
+BLIS_EXPORT_BLAS int PASTEF77S(s,rot)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
+BLIS_EXPORT_BLAS int PASTEF77S(d,rot)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
+BLIS_EXPORT_BLAS int PASTEF77S(cs,rot)(const bla_integer *n, bla_scomplex *cx, const bla_integer *incx, bla_scomplex *cy, const bla_integer *incy, const bla_real *c__, const bla_real *s);
+BLIS_EXPORT_BLAS int PASTEF77S(zd,rot)(const bla_integer *n, bla_dcomplex *zx, const bla_integer *incx, bla_dcomplex *zy, const bla_integer *incy, const bla_double *c__, const bla_double *s);
diff --git a/frame/compat/f2c/bla_rotg.c b/frame/compat/f2c/bla_rotg.c
index 1572689f57..e8579533d4 100644
--- a/frame/compat/f2c/bla_rotg.c
+++ b/frame/compat/f2c/bla_rotg.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,8 +35,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* srotg.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
@@ -45,7 +44,8 @@
 
 static bla_real sc_b4 = 1.f;
 
-/* Subroutine */ int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s)
+/* Subroutine */ 
+int PASTEF77S(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s)
 {
     /* System generated locals */
     bla_real r__1, r__2;
@@ -105,7 +105,8 @@ static bla_real sc_b4 = 1.f;
 
 static bla_double dc_b4 = 1.;
 
-/* Subroutine */ int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s)
+/* Subroutine */ 
+int PASTEF77S(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s)
 {
     /* System generated locals */
     bla_double d__1, d__2;
@@ -161,7 +162,7 @@ static bla_double dc_b4 = 1.;
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s)
+/* Subroutine */ int PASTEF77S(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s)
 {
     /* System generated locals */
     bla_real r__1, r__2;
@@ -211,7 +212,7 @@ static bla_double dc_b4 = 1.;
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s)
+/* Subroutine */ int PASTEF77S(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s)
 {
     /* System generated locals */
     bla_double d__1, d__2;
@@ -260,5 +261,27 @@ static bla_double dc_b4 = 1.;
     return 0;
 } /* zrotg_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named rotg entry points, built only when BLIS_ENABLE_BLAS is defined; each passes its arguments unchanged to the matching PASTEF77S routine and returns that routine's result. */
+int PASTEF77(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s)
+{
+  return PASTEF77S(s,rotg)( sa, sb, c__, s );
+} /* PASTEF77(s,rotg) */
+
+int PASTEF77(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s)
+{
+  return PASTEF77S(d,rotg)( da, db, c__, s );
+} /* PASTEF77(d,rotg) */
+
+int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s)
+{
+  return PASTEF77S(c,rotg)( ca, cb, c__, s );
+} /* PASTEF77(c,rotg) */
+
+int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s)
+{
+  return PASTEF77S(z,rotg)( ca, cb, c__, s );
+} /* PASTEF77(z,rotg) */
+
 #endif
 
diff --git a/frame/compat/f2c/bla_rotg.h b/frame/compat/f2c/bla_rotg.h
index b968ebbea2..d88f731de8 100644
--- a/frame/compat/f2c/bla_rotg.h
+++ b/frame/compat/f2c/bla_rotg.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,8 @@ BLIS_EXPORT_BLAS int PASTEF77(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_re
 BLIS_EXPORT_BLAS int PASTEF77(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(s,rotg)(bla_real *sa, bla_real *sb, bla_real *c__, bla_real *s);
+BLIS_EXPORT_BLAS int PASTEF77S(d,rotg)(bla_double *da, bla_double *db, bla_double *c__, bla_double *s);
+BLIS_EXPORT_BLAS int PASTEF77S(c,rotg)(bla_scomplex *ca, bla_scomplex *cb, bla_real *c__, bla_scomplex *s);
+BLIS_EXPORT_BLAS int PASTEF77S(z,rotg)(bla_dcomplex *ca, bla_dcomplex *cb, bla_double *c__, bla_dcomplex *s);
diff --git a/frame/compat/f2c/bla_rotm.c b/frame/compat/f2c/bla_rotm.c
index 003dea7155..cb3c8dc489 100644
--- a/frame/compat/f2c/bla_rotm.c
+++ b/frame/compat/f2c/bla_rotm.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* srotm.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam)
+/* Subroutine */
+int PASTEF77S(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam)
 {
     /* Initialized data */
 
@@ -207,7 +207,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam)
+/* Subroutine */
+int PASTEF77S(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam)
 {
     /* Initialized data */
 
@@ -368,5 +369,17 @@
     return 0;
 } /* drotm_ */
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named rotm entry points; compiled only when BLIS_ENABLE_BLAS is defined */
+
+int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam)
+{
+  return PASTEF77S(s,rotm)( n, sx, incx, sy, incy, sparam); /* forward all arguments unchanged to PASTEF77S(s,rotm); its return value is passed through */
+}
+
+int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam)
+{
+  return PASTEF77S(d,rotm)( n, dx, incx, dy, incy, dparam); /* forward all arguments unchanged to PASTEF77S(d,rotm); its return value is passed through */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_rotm.h b/frame/compat/f2c/bla_rotm.h
index 21906358be..307b2cca72 100644
--- a/frame/compat/f2c/bla_rotm.h
+++ b/frame/compat/f2c/bla_rotm.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(s,rotm)(const bla_integer *n, bla_real *sx, const
 BLIS_EXPORT_BLAS int PASTEF77(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(s,rotm)(const bla_integer *n, bla_real *sx, const bla_integer *incx, bla_real *sy, const bla_integer *incy, const bla_real *sparam);
+BLIS_EXPORT_BLAS int PASTEF77S(d,rotm)(const bla_integer *n, bla_double *dx, const bla_integer *incx, bla_double *dy, const bla_integer *incy, const bla_double *dparam);
diff --git a/frame/compat/f2c/bla_rotmg.c b/frame/compat/f2c/bla_rotmg.c
index 11ccc6f333..275166493b 100644
--- a/frame/compat/f2c/bla_rotmg.c
+++ b/frame/compat/f2c/bla_rotmg.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* srotmg.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam)
+/* Subroutine */
+int PASTEF77S(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam)
 {
     /* Initialized data */
 
@@ -281,7 +281,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam)
+/* Subroutine */
+int PASTEF77S(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam)
 {
     /* Initialized data */
 
@@ -516,5 +517,17 @@
     return 0;
 } /* drotmg_ */
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named rotmg entry points; compiled only when BLIS_ENABLE_BLAS is defined */
+
+int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam)
+{
+  return PASTEF77S(s,rotmg)( sd1, sd2, sx1, sy1, sparam ); /* forward all arguments unchanged to PASTEF77S(s,rotmg); its return value is passed through */
+}
+
+int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam)
+{
+  return PASTEF77S(d,rotmg)( dd1, dd2, dx1, dy1, dparam ); /* forward all arguments unchanged to PASTEF77S(d,rotmg); its return value is passed through */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_rotmg.h b/frame/compat/f2c/bla_rotmg.h
index 63e9710da1..be6034c7ed 100644
--- a/frame/compat/f2c/bla_rotmg.h
+++ b/frame/compat/f2c/bla_rotmg.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *s
 BLIS_EXPORT_BLAS int PASTEF77(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(s,rotmg)(bla_real *sd1, bla_real *sd2, bla_real *sx1, const bla_real *sy1, bla_real *sparam);
+BLIS_EXPORT_BLAS int PASTEF77S(d,rotmg)(bla_double *dd1, bla_double *dd2, bla_double *dx1, const bla_double *dy1, bla_double *dparam);
diff --git a/frame/compat/f2c/bla_sbmv.c b/frame/compat/f2c/bla_sbmv.c
index 566fabd81c..c30c976b2d 100644
--- a/frame/compat/f2c/bla_sbmv.c
+++ b/frame/compat/f2c/bla_sbmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* dsbmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
+/* Subroutine */
+int PASTEF77S(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -197,7 +197,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -212,7 +212,7 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DSBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -277,7 +277,7 @@
     if (*alpha == 0.) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -392,7 +392,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
+/* Subroutine */
+int PASTEF77S(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -401,9 +402,9 @@
     bla_integer info;
     bla_real temp1, temp2;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -548,7 +549,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -563,7 +564,7 @@
 	info = 11;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("SSBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -628,7 +629,7 @@
     if (*alpha == 0.f) {
 	return 0;
     }
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when upper triangle of A is stored. */
 
@@ -738,5 +739,17 @@
 
 } /* ssbmv_ */
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named sbmv entry points; compiled only when BLIS_ENABLE_BLAS is defined */
+
+int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
+{
+  return PASTEF77S(d,sbmv)(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); /* forward all arguments unchanged to PASTEF77S(d,sbmv); its return value is passed through */
+}
+
+int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
+{
+  return PASTEF77S(s,sbmv)(uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); /* forward all arguments unchanged to PASTEF77S(s,sbmv); its return value is passed through */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_sbmv.h b/frame/compat/f2c/bla_sbmv.h
index c3f3fc24f8..9a15e0aab3 100644
--- a/frame/compat/f2c/bla_sbmv.h
+++ b/frame/compat/f2c/bla_sbmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(d,sbmv)(const bla_character *uplo, const bla_integ
 BLIS_EXPORT_BLAS int PASTEF77(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(d,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_double *alpha, const bla_double *a, const bla_integer *lda, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(s,sbmv)(const bla_character *uplo, const bla_integer *n, const bla_integer *k, const bla_real *alpha, const bla_real *a, const bla_integer *lda, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_spmv.c b/frame/compat/f2c/bla_spmv.c
index 0485e1dc3a..64cb020828 100644
--- a/frame/compat/f2c/bla_spmv.c
+++ b/frame/compat/f2c/bla_spmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* dspmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
+/* Subroutine */
+int PASTEF77S(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -160,7 +160,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -171,7 +171,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DSPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -237,7 +237,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
@@ -342,7 +342,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
+/* Subroutine */
+int PASTEF77S(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -351,9 +352,9 @@
     bla_integer info;
     bla_real temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx, jy, kx, ky;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -461,7 +462,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -472,7 +473,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("SSPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -538,7 +539,7 @@
 	return 0;
     }
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  y  when AP contains the upper triangle. */
 
@@ -638,5 +639,17 @@
 
 } /* sspmv_ */
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named spmv entry points; compiled only when BLIS_ENABLE_BLAS is defined */
+
+int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy)
+{
+  return PASTEF77S(d,spmv)( uplo, n, alpha, ap, x, incx, beta, y, incy); /* forward all arguments unchanged to PASTEF77S(d,spmv); its return value is passed through */
+}
+
+int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy)
+{
+  return PASTEF77S(s,spmv)( uplo, n, alpha, ap, x, incx, beta, y, incy); /* forward all arguments unchanged to PASTEF77S(s,spmv); its return value is passed through */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_spmv.h b/frame/compat/f2c/bla_spmv.h
index 7db7d4a8b6..06845d1273 100644
--- a/frame/compat/f2c/bla_spmv.h
+++ b/frame/compat/f2c/bla_spmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(d,spmv)(const bla_character *uplo, const bla_integ
 BLIS_EXPORT_BLAS int PASTEF77(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(d,spmv)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *ap, const bla_double *x, const bla_integer *incx, const bla_double *beta, bla_double *y, const bla_integer *incy);
+BLIS_EXPORT_BLAS int PASTEF77S(s,spmv)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *ap, const bla_real *x, const bla_integer *incx, const bla_real *beta, bla_real *y, const bla_integer *incy);
diff --git a/frame/compat/f2c/bla_spr.c b/frame/compat/f2c/bla_spr.c
index d276458b49..fd99de5508 100644
--- a/frame/compat/f2c/bla_spr.c
+++ b/frame/compat/f2c/bla_spr.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* dspr.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap)
+/* Subroutine */
+int PASTEF77S(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -146,7 +146,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -155,7 +155,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSPR  ", &info, (ftnlen)6);
+	PASTE_XERBLA("DSPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -177,7 +177,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -268,7 +268,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap)
+/* Subroutine */
+int PASTEF77S(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -277,9 +278,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -373,7 +374,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -382,7 +383,7 @@
 	info = 5;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSPR  ", &info, (ftnlen)6);
+	PASTE_XERBLA("SSPR  ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -404,7 +405,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -490,5 +491,17 @@
 
 } /* sspr_ */
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named spr entry points; compiled only when BLIS_ENABLE_BLAS is defined */
+
+int PASTEF77(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap)
+{
+  return PASTEF77S(d,spr)( uplo, n, alpha, x, incx, ap ); /* forward all arguments unchanged to PASTEF77S(d,spr); its return value is passed through */
+}
+
+int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap)
+{
+  return PASTEF77S(s,spr)( uplo, n, alpha, x, incx, ap ); /* forward all arguments unchanged to PASTEF77S(s,spr); its return value is passed through */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_spr.h b/frame/compat/f2c/bla_spr.h
index 6712d7c166..882d400550 100644
--- a/frame/compat/f2c/bla_spr.h
+++ b/frame/compat/f2c/bla_spr.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,6 @@ BLIS_EXPORT_BLAS int PASTEF77(d,spr)(const bla_character *uplo, const bla_intege
 BLIS_EXPORT_BLAS int PASTEF77(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(d,spr)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, bla_double *ap);
+BLIS_EXPORT_BLAS int PASTEF77S(s,spr)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, bla_real *ap);
diff --git a/frame/compat/f2c/bla_spr2.c b/frame/compat/f2c/bla_spr2.c
index 7c75382122..a67ef4800d 100644
--- a/frame/compat/f2c/bla_spr2.c
+++ b/frame/compat/f2c/bla_spr2.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* dspr2.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap)
+/* Subroutine */
+int PASTEF77S(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -50,9 +50,9 @@
     bla_integer info;
     bla_double temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -158,7 +158,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -169,7 +169,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DSPR2 ", &info, (ftnlen)6);
+	PASTE_XERBLA("DSPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -201,7 +201,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -300,7 +300,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap)
+/* Subroutine */
+int PASTEF77S(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -309,9 +310,9 @@
     bla_integer info;
     bla_real temp1, temp2;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, iy, jx = 0, jy = 0, kx = 0, ky = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
 
 /*     .. Scalar Arguments .. */
 /*     .. Array Arguments .. */
@@ -417,7 +418,7 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
     } else if (*n < 0) {
@@ -428,7 +429,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("SSPR2 ", &info, (ftnlen)6);
+	PASTE_XERBLA("SSPR2 ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -460,7 +461,7 @@
 /*     are accessed sequentially with one pass through AP. */
 
     kk = 1;
-    if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  A  when upper triangle is stored in AP. */
 
@@ -554,5 +555,17 @@
 
 } /* sspr2_ */
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named spr2 entry points; compiled only when BLIS_ENABLE_BLAS is defined */
+
+int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integer *n, const bla_double *alpha, const bla_double *x, const bla_integer *incx, const bla_double *y, const bla_integer *incy, bla_double *ap)
+{
+  return PASTEF77S(d,spr2)( uplo, n, alpha, x, incx, y, incy,ap ); /* forward all arguments unchanged to PASTEF77S(d,spr2); its return value is passed through */
+}
+
+int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap)
+{
+  return PASTEF77S(s,spr2)( uplo, n, alpha, x, incx, y, incy,ap ); /* forward all arguments unchanged to PASTEF77S(s,spr2); its return value is passed through */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_spr2.h b/frame/compat/f2c/bla_spr2.h
index 5a1d607471..242cbfe825 100644
--- a/frame/compat/f2c/bla_spr2.h
+++ b/frame/compat/f2c/bla_spr2.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -38,3 +39,30 @@ BLIS_EXPORT_BLAS int PASTEF77(d,spr2)(const bla_character *uplo, const bla_integ
 BLIS_EXPORT_BLAS int PASTEF77(s,spr2)(const bla_character *uplo, const bla_integer *n, const bla_real *alpha, const bla_real *x, const bla_integer *incx, const bla_real *y, const bla_integer *incy, bla_real *ap);
 
 #endif
+
+/* Prototypes for the PASTEF77S(ch,spr2) routines. They are declared after
+   the #endif that precedes them, i.e. outside that conditional section, so
+   they are visible whether or not the section is compiled. */
+BLIS_EXPORT_BLAS int PASTEF77S(d,spr2)
+     (
+       const bla_character *uplo,
+       const bla_integer   *n,
+       const bla_double    *alpha,
+       const bla_double    *x,
+       const bla_integer   *incx,
+       const bla_double    *y,
+       const bla_integer   *incy,
+             bla_double    *ap
+     );
+
+BLIS_EXPORT_BLAS int PASTEF77S(s,spr2)
+     (
+       const bla_character *uplo,
+       const bla_integer   *n,
+       const bla_real      *alpha,
+       const bla_real      *x,
+       const bla_integer   *incx,
+       const bla_real      *y,
+       const bla_integer   *incy,
+             bla_real      *ap
+     );
diff --git a/frame/compat/f2c/bla_tbmv.c b/frame/compat/f2c/bla_tbmv.c
index 78feb70562..6c0454c9a8 100644
--- a/frame/compat/f2c/bla_tbmv.c
+++ b/frame/compat/f2c/bla_tbmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* ctbmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx)
+/* Subroutine */
+int PASTEF77S(c,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -209,14 +209,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -229,7 +229,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CTBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -239,8 +239,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -254,11 +254,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -401,7 +401,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -611,7 +611,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx)
+/* Subroutine */
+int PASTEF77S(d,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -620,9 +621,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -775,14 +776,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -795,7 +796,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DTBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -805,7 +806,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -819,11 +820,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -922,7 +923,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1022,7 +1023,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx)
+/* Subroutine */
+int PASTEF77S(s,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -1031,9 +1033,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1186,14 +1188,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1206,7 +1208,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("STBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1216,7 +1218,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -1230,11 +1232,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1333,7 +1335,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1433,7 +1435,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx)
+/* Subroutine */
+int PASTEF77S(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
@@ -1446,9 +1449,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1601,14 +1604,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans,
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag,
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1621,7 +1624,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTBMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZTBMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1631,8 +1634,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX   too small for descending loops. */
@@ -1646,11 +1649,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*         Form  x := A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1793,7 +1796,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1998,5 +2001,79 @@
 
 } /* ztbmv_ */
 
+#ifdef BLIS_ENABLE_BLAS
+
+/* xTBMV wrappers, compiled only when BLIS_ENABLE_BLAS is defined. Each
+   PASTEF77(ch,tbmv) hands its arguments, unchanged, to the matching
+   PASTEF77S(ch,tbmv) routine and returns whatever that routine returns. */
+
+/* bla_real variant. */
+int PASTEF77(s,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_real      *a,
+       const bla_integer   *lda,
+             bla_real      *x,
+       const bla_integer   *incx
+     )
+{
+    return PASTEF77S(s,tbmv)( uplo, trans, diag, n, k, a, lda, x, incx );
+}
+
+/* bla_double variant. */
+int PASTEF77(d,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_double    *a,
+       const bla_integer   *lda,
+             bla_double    *x,
+       const bla_integer   *incx
+     )
+{
+    return PASTEF77S(d,tbmv)( uplo, trans, diag, n, k, a, lda, x, incx );
+}
+
+/* bla_scomplex variant. */
+int PASTEF77(c,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_scomplex  *a,
+       const bla_integer   *lda,
+             bla_scomplex  *x,
+       const bla_integer   *incx
+     )
+{
+    return PASTEF77S(c,tbmv)( uplo, trans, diag, n, k, a, lda, x, incx );
+}
+
+/* bla_dcomplex variant. */
+int PASTEF77(z,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_dcomplex  *a,
+       const bla_integer   *lda,
+             bla_dcomplex  *x,
+       const bla_integer   *incx
+     )
+{
+    return PASTEF77S(z,tbmv)( uplo, trans, diag, n, k, a, lda, x, incx );
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_tbmv.h b/frame/compat/f2c/bla_tbmv.h
index f34654762b..4fd1a85fee 100644
--- a/frame/compat/f2c/bla_tbmv.h
+++ b/frame/compat/f2c/bla_tbmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,58 @@ BLIS_EXPORT_BLAS int PASTEF77(s,tbmv)(const bla_character *uplo, const bla_chara
 BLIS_EXPORT_BLAS int PASTEF77(z,tbmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);
 
 #endif
+
+/* Prototypes for the PASTEF77S(ch,tbmv) routines. They are declared after
+   the #endif that precedes them, i.e. outside that conditional section, so
+   they are visible whether or not the section is compiled. */
+BLIS_EXPORT_BLAS int PASTEF77S(c,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_scomplex  *a,
+       const bla_integer   *lda,
+             bla_scomplex  *x,
+       const bla_integer   *incx
+     );
+
+BLIS_EXPORT_BLAS int PASTEF77S(d,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_double    *a,
+       const bla_integer   *lda,
+             bla_double    *x,
+       const bla_integer   *incx
+     );
+
+BLIS_EXPORT_BLAS int PASTEF77S(s,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_real      *a,
+       const bla_integer   *lda,
+             bla_real      *x,
+       const bla_integer   *incx
+     );
+
+BLIS_EXPORT_BLAS int PASTEF77S(z,tbmv)
+     (
+       const bla_character *uplo,
+       const bla_character *trans,
+       const bla_character *diag,
+       const bla_integer   *n,
+       const bla_integer   *k,
+       const bla_dcomplex  *a,
+       const bla_integer   *lda,
+             bla_dcomplex  *x,
+       const bla_integer   *incx
+     );
diff --git a/frame/compat/f2c/bla_tbsv.c b/frame/compat/f2c/bla_tbsv.c
index 819456f029..f7e9e804bd 100644
--- a/frame/compat/f2c/bla_tbsv.c
+++ b/frame/compat/f2c/bla_tbsv.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* ctbsv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
@@ -55,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -214,14 +213,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -234,7 +233,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTBSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CTBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -244,8 +243,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -259,11 +258,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -412,7 +411,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A') )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -622,7 +621,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -631,9 +631,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -790,14 +790,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -810,7 +810,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTBSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DTBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -820,7 +820,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -834,11 +834,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -949,7 +949,7 @@
 
 /*        Form  x := inv( A')*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1053,7 +1053,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -1062,9 +1063,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1221,14 +1222,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1241,7 +1242,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STBSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("STBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1251,7 +1252,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1265,11 +1266,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1380,7 +1381,7 @@
 
 /*        Form  x := inv( A')*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1484,7 +1485,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
@@ -1498,9 +1500,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, l;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kplus1, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1657,14 +1659,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1677,7 +1679,7 @@
 	info = 9;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTBSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZTBSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1687,8 +1689,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1702,11 +1704,11 @@
 /*     Start the operations. In this version the elements of A are */
 /*     accessed by sequentially with one pass through A. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1853,7 +1855,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A') )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kplus1 = *k + 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -2058,5 +2060,27 @@
 
 } /* ztbsv_ */
 
+#ifdef BLIS_ENABLE_BLAS
+
+int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx)
+{ /* bla_real TBSV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(s,tbsv), whose prototype is in bla_tbsv.h */
+  return PASTEF77S(s,tbsv)( uplo, trans, diag, n, k, a, lda, x, incx ); /* all nine arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
+int PASTEF77(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx)
+{ /* bla_double TBSV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(d,tbsv), whose prototype is in bla_tbsv.h */
+  return PASTEF77S(d,tbsv)( uplo, trans, diag, n, k, a, lda, x, incx ); /* all nine arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
+int PASTEF77(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx)
+{ /* bla_scomplex TBSV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(c,tbsv), whose prototype is in bla_tbsv.h */
+  return PASTEF77S(c,tbsv)( uplo, trans, diag, n, k, a, lda, x, incx ); /* all nine arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
+int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx)
+{ /* bla_dcomplex TBSV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(z,tbsv), whose prototype is in bla_tbsv.h */
+  return PASTEF77S(z,tbsv)( uplo, trans, diag, n, k, a, lda, x, incx ); /* all nine arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_tbsv.h b/frame/compat/f2c/bla_tbsv.h
index 5e84f5c363..52202a8a88 100644
--- a/frame/compat/f2c/bla_tbsv.h
+++ b/frame/compat/f2c/bla_tbsv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,8 @@ BLIS_EXPORT_BLAS int PASTEF77(s,tbsv)(const bla_character *uplo, const bla_chara
 BLIS_EXPORT_BLAS int PASTEF77(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(c,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_scomplex *a, const bla_integer *lda, bla_scomplex *x, const bla_integer *incx); /* PASTEF77S-named TBSV prototypes (bla_scomplex here); placed after the preceding #endif, i.e. outside that conditional block */
+BLIS_EXPORT_BLAS int PASTEF77S(d,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_double *a, const bla_integer *lda, bla_double *x, const bla_integer *incx); /* bla_double variant */
+BLIS_EXPORT_BLAS int PASTEF77S(s,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_real *a, const bla_integer *lda, bla_real *x, const bla_integer *incx); /* bla_real variant */
+BLIS_EXPORT_BLAS int PASTEF77S(z,tbsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_integer *k, const bla_dcomplex *a, const bla_integer *lda, bla_dcomplex *x, const bla_integer *incx); /* bla_dcomplex variant */
diff --git a/frame/compat/f2c/bla_tpmv.c b/frame/compat/f2c/bla_tpmv.c
index 8fa46f4c4f..27ba219c93 100644
--- a/frame/compat/f2c/bla_tpmv.c
+++ b/frame/compat/f2c/bla_tpmv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* ctpmv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -167,14 +167,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -183,7 +183,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CTPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -193,8 +193,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -208,11 +208,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -346,7 +346,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -542,7 +542,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -551,9 +552,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -663,14 +664,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -679,7 +680,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DTPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -689,7 +690,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -703,11 +704,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -797,7 +798,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -890,7 +891,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -899,9 +901,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1011,14 +1013,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1027,7 +1029,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("STPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1037,7 +1039,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1051,11 +1053,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1145,7 +1147,7 @@
 
 /*        Form  x := A'*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1238,7 +1240,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -1251,9 +1254,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1364,14 +1367,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1380,7 +1383,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTPMV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZTPMV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1390,8 +1393,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1405,11 +1408,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x:= A*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1543,7 +1546,7 @@
 
 /*        Form  x := A'*x  or  x := conjg( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1734,5 +1737,26 @@
 
 } /* ztpmv_ */
 
-#endif
+#ifdef BLIS_ENABLE_BLAS
+
+int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx)
+{ /* bla_real TPMV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(s,tpmv), which is defined earlier in this file outside that guard */
+  return PASTEF77S(s,tpmv)( uplo, trans, diag, n, ap, x, incx ); /* all seven arguments are passed through unchanged; the callee's int result is returned as-is */
+}
 
+int PASTEF77(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx)
+{ /* bla_double TPMV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(d,tpmv), which is defined earlier in this file outside that guard */
+  return PASTEF77S(d,tpmv)( uplo, trans, diag, n, ap, x, incx ); /* all seven arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
+int PASTEF77(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx)
+{ /* bla_scomplex TPMV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(c,tpmv), which is defined earlier in this file outside that guard */
+  return PASTEF77S(c,tpmv)( uplo, trans, diag, n, ap, x, incx ); /* all seven arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
+int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx)
+{ /* bla_dcomplex TPMV under the PASTEF77 name; compiled only inside #ifdef BLIS_ENABLE_BLAS and delegating to PASTEF77S(z,tpmv), which is defined earlier in this file outside that guard */
+  return PASTEF77S(z,tpmv)( uplo, trans, diag, n, ap, x, incx ); /* all seven arguments are passed through unchanged; the callee's int result is returned as-is */
+}
+
+#endif
diff --git a/frame/compat/f2c/bla_tpmv.h b/frame/compat/f2c/bla_tpmv.h
index 2376ecfe33..121982de73 100644
--- a/frame/compat/f2c/bla_tpmv.h
+++ b/frame/compat/f2c/bla_tpmv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,8 @@ BLIS_EXPORT_BLAS int PASTEF77(s,tpmv)(const bla_character *uplo, const bla_chara
 BLIS_EXPORT_BLAS int PASTEF77(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(c,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx); /* PASTEF77S-named TPMV prototypes (bla_scomplex here); placed after the preceding #endif, i.e. outside that conditional block */
+BLIS_EXPORT_BLAS int PASTEF77S(d,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx); /* bla_double variant */
+BLIS_EXPORT_BLAS int PASTEF77S(s,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx); /* bla_real variant */
+BLIS_EXPORT_BLAS int PASTEF77S(z,tpmv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx); /* bla_dcomplex variant */
diff --git a/frame/compat/f2c/bla_tpsv.c b/frame/compat/f2c/bla_tpsv.c
index 0764940979..1c7b22271d 100644
--- a/frame/compat/f2c/bla_tpsv.c
+++ b/frame/compat/f2c/bla_tpsv.c
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -34,14 +35,13 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* ctpsv.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -54,9 +54,9 @@
     bla_integer info;
     bla_scomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -170,14 +170,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -186,7 +186,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("CTPSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("CTPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -196,8 +196,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -211,11 +211,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -339,7 +339,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -534,7 +534,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -543,9 +544,9 @@
     bla_integer info;
     bla_double temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -658,14 +659,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -674,7 +675,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("DTPSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("DTPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -684,7 +685,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -698,11 +699,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -790,7 +791,7 @@
 
 /*        Form  x := inv( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -885,7 +886,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2;
@@ -894,9 +896,9 @@
     bla_integer info;
     bla_real temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1009,14 +1011,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1025,7 +1027,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("STPSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("STPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1035,7 +1037,7 @@
 	return 0;
     }
 
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1049,11 +1051,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1141,7 +1143,7 @@
 
 /*        Form  x := inv( A' )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1236,7 +1238,8 @@
 	-lf2c -lm   (in that order)
 */
 
-/* Subroutine */ int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx)
+/* Subroutine */ 
+int PASTEF77S(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx)
 {
     /* System generated locals */
     bla_integer i__1, i__2, i__3, i__4, i__5;
@@ -1250,9 +1253,9 @@
     bla_integer info;
     bla_dcomplex temp;
     bla_integer i__, j, k;
-    //extern bla_logical PASTEF770(lsame)(bla_character *, bla_character *, ftnlen, ftnlen);
+    //extern bla_logical PASTE_LSAME(bla_character *, bla_character *, ftnlen, ftnlen);
     bla_integer kk, ix, jx, kx = 0;
-    //extern /* Subroutine */ int PASTEF770(xerbla)(bla_character *, bla_integer *, ftnlen);
+    //extern /* Subroutine */ int PASTE_XERBLA(bla_character *, bla_integer *, ftnlen);
     bla_logical noconj, nounit;
 
 /*     .. Scalar Arguments .. */
@@ -1366,14 +1369,14 @@
 
     /* Function Body */
     info = 0;
-    if (! PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(uplo, "L", (
+    if (! PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(uplo, "L", (
 	    ftnlen)1, (ftnlen)1)) {
 	info = 1;
-    } else if (! PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, 
-	    "T", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(trans, "C", (ftnlen)1, (
+    } else if (! PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, 
+	    "T", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(trans, "C", (ftnlen)1, (
 	    ftnlen)1)) {
 	info = 2;
-    } else if (! PASTEF770(lsame)(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTEF770(lsame)(diag, 
+    } else if (! PASTE_LSAME(diag, "U", (ftnlen)1, (ftnlen)1) && ! PASTE_LSAME(diag, 
 	    "N", (ftnlen)1, (ftnlen)1)) {
 	info = 3;
     } else if (*n < 0) {
@@ -1382,7 +1385,7 @@
 	info = 7;
     }
     if (info != 0) {
-	PASTEF770(xerbla)("ZTPSV ", &info, (ftnlen)6);
+	PASTE_XERBLA("ZTPSV ", &info, (ftnlen)6);
 	return 0;
     }
 
@@ -1392,8 +1395,8 @@
 	return 0;
     }
 
-    noconj = PASTEF770(lsame)(trans, "T", (ftnlen)1, (ftnlen)1);
-    nounit = PASTEF770(lsame)(diag, "N", (ftnlen)1, (ftnlen)1);
+    noconj = PASTE_LSAME(trans, "T", (ftnlen)1, (ftnlen)1);
+    nounit = PASTE_LSAME(diag, "N", (ftnlen)1, (ftnlen)1);
 
 /*     Set up the start point in X if the increment is not unity. This */
 /*     will be  ( N - 1 )*INCX  too small for descending loops. */
@@ -1407,11 +1410,11 @@
 /*     Start the operations. In this version the elements of AP are */
 /*     accessed sequentially with one pass through AP. */
 
-    if (PASTEF770(lsame)(trans, "N", (ftnlen)1, (ftnlen)1)) {
+    if (PASTE_LSAME(trans, "N", (ftnlen)1, (ftnlen)1)) {
 
 /*        Form  x := inv( A )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = *n * (*n + 1) / 2;
 	    if (*incx == 1) {
 		for (j = *n; j >= 1; --j) {
@@ -1535,7 +1538,7 @@
 
 /*        Form  x := inv( A' )*x  or  x := inv( conjg( A' ) )*x. */
 
-	if (PASTEF770(lsame)(uplo, "U", (ftnlen)1, (ftnlen)1)) {
+	if (PASTE_LSAME(uplo, "U", (ftnlen)1, (ftnlen)1)) {
 	    kk = 1;
 	    if (*incx == 1) {
 		i__1 = *n;
@@ -1725,5 +1728,27 @@
 
 } /* ztpsv_ */
 
+#ifdef BLIS_ENABLE_BLAS
+/* BLAS-named tpsv entry points, compiled only when BLIS_ENABLE_BLAS is defined; each forwards its arguments unchanged to the matching PASTEF77S(ch,tpsv) routine and returns that routine's result. */
+int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx)
+{
+  return PASTEF77S(s,tpsv)( uplo, trans, diag, n, ap, x, incx );
+}
+
+int PASTEF77(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx)
+{
+  return PASTEF77S(d,tpsv)( uplo, trans, diag, n, ap, x, incx );
+}
+
+int PASTEF77(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx)
+{
+  return PASTEF77S(c,tpsv)( uplo, trans, diag, n, ap, x, incx );
+}
+
+int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx)
+{
+  return PASTEF77S(z,tpsv)( uplo, trans, diag, n, ap, x, incx );
+}
+
 #endif
 
diff --git a/frame/compat/f2c/bla_tpsv.h b/frame/compat/f2c/bla_tpsv.h
index 77bd55979a..4e03b841a6 100644
--- a/frame/compat/f2c/bla_tpsv.h
+++ b/frame/compat/f2c/bla_tpsv.h
@@ -5,7 +5,8 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+   
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -40,3 +41,8 @@ BLIS_EXPORT_BLAS int PASTEF77(s,tpsv)(const bla_character *uplo, const bla_chara
 BLIS_EXPORT_BLAS int PASTEF77(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);
 
 #endif
+
+BLIS_EXPORT_BLAS int PASTEF77S(c,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_scomplex *ap, bla_scomplex *x, const bla_integer *incx);
+BLIS_EXPORT_BLAS int PASTEF77S(d,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_double *ap, bla_double *x, const bla_integer *incx);
+BLIS_EXPORT_BLAS int PASTEF77S(s,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_real *ap, bla_real *x, const bla_integer *incx);
+BLIS_EXPORT_BLAS int PASTEF77S(z,tpsv)(const bla_character *uplo, const bla_character *trans, const bla_character *diag, const bla_integer *n, const bla_dcomplex *ap, bla_dcomplex *x, const bla_integer *incx);
diff --git a/frame/compat/f2c/bla_xerbla.c b/frame/compat/f2c/bla_xerbla.c
index a6500c4433..62dd6b5edf 100644
--- a/frame/compat/f2c/bla_xerbla.c
+++ b/frame/compat/f2c/bla_xerbla.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,8 +35,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 /* xerbla.f -- translated by f2c (version 19991025).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
@@ -43,7 +42,7 @@
 
 /* Table of constant values */
 
-/* Subroutine */ int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len)
+/* Subroutine */ int xerbla_blis_impl(const bla_character *srname, const bla_integer *info, ftnlen srname_len)
 {
 /*  -- LAPACK auxiliary routine (preliminary version) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., */
@@ -85,7 +84,14 @@
 /*     End of XERBLA */
 
     return 0;
-} /* xerbla */
+} /* xerbla_blis_impl */
+
 
+#ifdef BLIS_ENABLE_BLAS
+/* Subroutine: BLAS-named xerbla wrapper, compiled only when BLIS_ENABLE_BLAS is defined */ int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len)
+{
+    xerbla_blis_impl(srname, info, srname_len); /* forward to the implementation defined earlier in this file */
+    return 0; /* always 0; the callee's return value is not propagated */
+} /* xerbla */
 #endif
 
diff --git a/frame/compat/f2c/bla_xerbla.h b/frame/compat/f2c/bla_xerbla.h
index 44c168e584..72f9b7592d 100644
--- a/frame/compat/f2c/bla_xerbla.h
+++ b/frame/compat/f2c/bla_xerbla.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,6 +33,8 @@
 
 */
 
+BLIS_EXPORT_BLAS int xerbla_blis_impl(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
+
 #ifdef BLIS_ENABLE_BLAS
 
 BLIS_EXPORT_BLAS int PASTEF770(xerbla)(const bla_character *srname, const bla_integer *info, ftnlen srname_len);
diff --git a/frame/compat/f2c/bla_xerbla_array.c b/frame/compat/f2c/bla_xerbla_array.c
index 722bb29144..2521cd5d23 100644
--- a/frame/compat/f2c/bla_xerbla_array.c
+++ b/frame/compat/f2c/bla_xerbla_array.c
@@ -34,11 +34,9 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 #define MAX_NUM_CHARS 32
 
-int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
+int xerbla_array_blis_impl(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
 {
 	int  i;
 #if 1
@@ -65,10 +63,16 @@ int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer
 	srname[i] = '\0';
 
 	// Call xerbla_().
-	PASTEF770(xerbla)( srname, info, ( ftnlen )srname_len );
+	PASTE_XERBLA( srname, info, ( ftnlen )srname_len );
 
 	return 0;
 }
 
+
+#ifdef BLIS_ENABLE_BLAS /* BLAS-named wrapper: forwards its arguments unchanged to xerbla_array_blis_impl and returns its result */
+int PASTEF770(xerbla_array)(const bla_character *srname_array, const bla_integer srname_len, const bla_integer *info)
+{
+  return xerbla_array_blis_impl(srname_array, srname_len, info);
+}
 #endif
 
diff --git a/frame/compat/f2c/bla_xerbla_array.h b/frame/compat/f2c/bla_xerbla_array.h
index 6a4b4e0598..f007fadc1d 100644
--- a/frame/compat/f2c/bla_xerbla_array.h
+++ b/frame/compat/f2c/bla_xerbla_array.h
@@ -32,6 +32,8 @@
 
 */
 
+BLIS_EXPORT_BLAS int xerbla_array_blis_impl(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
+
 #ifdef BLIS_ENABLE_BLAS
 
 BLIS_EXPORT_BLAS int PASTEF770(xerbla_array)(const bla_character *srname, const bla_integer srname_len, const bla_integer *info);
diff --git a/frame/compat/f2c/util/bla_c_abs.c b/frame/compat/f2c/util/bla_c_abs.c
index 94f7a58da7..1b90d3fb09 100644
--- a/frame/compat/f2c/util/bla_c_abs.c
+++ b/frame/compat/f2c/util/bla_c_abs.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,13 +35,9 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_c_abs(const bla_scomplex *z)
 {
 	return( bla_f__cabs( bli_creal( *z ),
 	                     bli_cimag( *z ) ) );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_c_abs.h b/frame/compat/f2c/util/bla_c_abs.h
index b4eb510ddb..8e7944e162 100644
--- a/frame/compat/f2c/util/bla_c_abs.h
+++ b/frame/compat/f2c/util/bla_c_abs.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_c_abs(const bla_scomplex *z);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_c_div.c b/frame/compat/f2c/util/bla_c_div.c
index 975f49b0a4..152251b908 100644
--- a/frame/compat/f2c/util/bla_c_div.c
+++ b/frame/compat/f2c/util/bla_c_div.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,13 +35,9 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp)
 {
 	bli_ccopys( *ap, *cp );
 	bli_cinvscals( *bp, *cp );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_c_div.h b/frame/compat/f2c/util/bla_c_div.h
index 14497715d6..843ef07f4f 100644
--- a/frame/compat/f2c/util/bla_c_div.h
+++ b/frame/compat/f2c/util/bla_c_div.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_c_div(bla_scomplex *cp, const bla_scomplex *ap, const bla_scomplex *bp);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_abs.c b/frame/compat/f2c/util/bla_d_abs.c
index 94031bd964..4f633edeeb 100644
--- a/frame/compat/f2c/util/bla_d_abs.c
+++ b/frame/compat/f2c/util/bla_d_abs.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,8 +35,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_d_abs(const bla_double *x)
 {
 	if(*x >= 0.0)
@@ -44,5 +43,3 @@ double bla_d_abs(const bla_double *x)
 	return(- *x);
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_abs.h b/frame/compat/f2c/util/bla_d_abs.h
index e9b3f1dc1f..63ec465ef1 100644
--- a/frame/compat/f2c/util/bla_d_abs.h
+++ b/frame/compat/f2c/util/bla_d_abs.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_d_abs(const bla_double *x);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_cnjg.c b/frame/compat/f2c/util/bla_d_cnjg.c
index 43dc9758c3..1120d85c54 100644
--- a/frame/compat/f2c/util/bla_d_cnjg.c
+++ b/frame/compat/f2c/util/bla_d_cnjg.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,12 +35,8 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src)
 {
 	bli_zcopyjs( *src, *dest );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_cnjg.h b/frame/compat/f2c/util/bla_d_cnjg.h
index 38c810910d..fddf456b08 100644
--- a/frame/compat/f2c/util/bla_d_cnjg.h
+++ b/frame/compat/f2c/util/bla_d_cnjg.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_d_cnjg(bla_dcomplex *dest, const bla_dcomplex *src);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_imag.c b/frame/compat/f2c/util/bla_d_imag.c
index 53f997cf79..2cf2fe67dd 100644
--- a/frame/compat/f2c/util/bla_d_imag.c
+++ b/frame/compat/f2c/util/bla_d_imag.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,12 +35,8 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_d_imag(const bla_dcomplex *z)
 {
 	return bli_zimag( *z );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_imag.h b/frame/compat/f2c/util/bla_d_imag.h
index 913b84c167..beefa4c24d 100644
--- a/frame/compat/f2c/util/bla_d_imag.h
+++ b/frame/compat/f2c/util/bla_d_imag.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_d_imag(const bla_dcomplex *z);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_sign.c b/frame/compat/f2c/util/bla_d_sign.c
index 08c2dae531..bae10453b6 100644
--- a/frame/compat/f2c/util/bla_d_sign.c
+++ b/frame/compat/f2c/util/bla_d_sign.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,8 +35,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_d_sign(const bla_double *a, const bla_double *b)
 {
 	double x = (*a >= 0.0 ? *a : - *a);
@@ -43,5 +42,3 @@ double bla_d_sign(const bla_double *a, const bla_double *b)
 	return(*b >= 0.0 ? x : -x);
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_d_sign.h b/frame/compat/f2c/util/bla_d_sign.h
index 25076140c5..6519ce3259 100644
--- a/frame/compat/f2c/util/bla_d_sign.h
+++ b/frame/compat/f2c/util/bla_d_sign.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_d_sign(const bla_double *a, const bla_double *b);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_f__cabs.c b/frame/compat/f2c/util/bla_f__cabs.c
index 93888aaa09..26f0242e89 100644
--- a/frame/compat/f2c/util/bla_f__cabs.c
+++ b/frame/compat/f2c/util/bla_f__cabs.c
@@ -34,8 +34,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_f__cabs(double real_val, double imag_val)
 {
 	double temp;
@@ -60,5 +58,3 @@ double bla_f__cabs(double real_val, double imag_val)
 	return(temp);
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_f__cabs.h b/frame/compat/f2c/util/bla_f__cabs.h
index ffa4395180..d74ceebcd8 100644
--- a/frame/compat/f2c/util/bla_f__cabs.h
+++ b/frame/compat/f2c/util/bla_f__cabs.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_f__cabs(double real, double imag);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_abs.c b/frame/compat/f2c/util/bla_r_abs.c
index b021570a0b..8208492f07 100644
--- a/frame/compat/f2c/util/bla_r_abs.c
+++ b/frame/compat/f2c/util/bla_r_abs.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,8 +35,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_r_abs(const bla_real *x)
 {
 	if(*x >= 0.0)
@@ -44,5 +43,3 @@ double bla_r_abs(const bla_real *x)
 	return(- *x);
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_abs.h b/frame/compat/f2c/util/bla_r_abs.h
index 636c0ed215..a6672bc90e 100644
--- a/frame/compat/f2c/util/bla_r_abs.h
+++ b/frame/compat/f2c/util/bla_r_abs.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_r_abs(const bla_real *x);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_cnjg.c b/frame/compat/f2c/util/bla_r_cnjg.c
index 42b25d5757..5c073f58df 100644
--- a/frame/compat/f2c/util/bla_r_cnjg.c
+++ b/frame/compat/f2c/util/bla_r_cnjg.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,12 +35,8 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src)
 {
 	bli_ccopyjs( *src, *dest );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_cnjg.h b/frame/compat/f2c/util/bla_r_cnjg.h
index 5ee38843f4..ebd1bf2006 100644
--- a/frame/compat/f2c/util/bla_r_cnjg.h
+++ b/frame/compat/f2c/util/bla_r_cnjg.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_r_cnjg(bla_scomplex *dest, const bla_scomplex *src);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_imag.c b/frame/compat/f2c/util/bla_r_imag.c
index 483ce318fa..42b5b33135 100644
--- a/frame/compat/f2c/util/bla_r_imag.c
+++ b/frame/compat/f2c/util/bla_r_imag.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,12 +35,8 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 bla_real bla_r_imag(const bla_scomplex *z)
 {
 	return bli_cimag( *z );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_imag.h b/frame/compat/f2c/util/bla_r_imag.h
index 6918660ed5..a4f33c16b2 100644
--- a/frame/compat/f2c/util/bla_r_imag.h
+++ b/frame/compat/f2c/util/bla_r_imag.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 bla_real bla_r_imag(const bla_scomplex *z);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_sign.c b/frame/compat/f2c/util/bla_r_sign.c
index a0c8a363c1..13eb68ba40 100644
--- a/frame/compat/f2c/util/bla_r_sign.c
+++ b/frame/compat/f2c/util/bla_r_sign.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,8 +35,6 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_r_sign(const bla_real *a, const bla_real *b)
 {
 	double x = (*a >= 0.0 ? *a : - *a);
@@ -43,5 +42,3 @@ double bla_r_sign(const bla_real *a, const bla_real *b)
 	return(*b >= 0.0 ? x : -x);
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_r_sign.h b/frame/compat/f2c/util/bla_r_sign.h
index 0323777086..dd008305cd 100644
--- a/frame/compat/f2c/util/bla_r_sign.h
+++ b/frame/compat/f2c/util/bla_r_sign.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_r_sign(const bla_real *a, const bla_real *b);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_z_abs.c b/frame/compat/f2c/util/bla_z_abs.c
index 6550a312d7..5d1096f629 100644
--- a/frame/compat/f2c/util/bla_z_abs.c
+++ b/frame/compat/f2c/util/bla_z_abs.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,13 +35,9 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_z_abs(const bla_dcomplex *z)
 {
 	return( bla_f__cabs( bli_zreal( *z ),
 	                     bli_zimag( *z ) ) );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_z_abs.h b/frame/compat/f2c/util/bla_z_abs.h
index b84b073fe5..03d5f54211 100644
--- a/frame/compat/f2c/util/bla_z_abs.h
+++ b/frame/compat/f2c/util/bla_z_abs.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 double bla_z_abs(const bla_dcomplex *z);
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_z_div.c b/frame/compat/f2c/util/bla_z_div.c
index 3d36a8ac89..edbdf72cf2 100644
--- a/frame/compat/f2c/util/bla_z_div.c
+++ b/frame/compat/f2c/util/bla_z_div.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,13 +35,9 @@
 
 #include "blis.h"
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp)
 {
 	bli_zcopys( *ap, *cp );
 	bli_zinvscals( *bp, *cp );
 }
 
-#endif
-
diff --git a/frame/compat/f2c/util/bla_z_div.h b/frame/compat/f2c/util/bla_z_div.h
index bec56bb5fd..1d31d178c5 100644
--- a/frame/compat/f2c/util/bla_z_div.h
+++ b/frame/compat/f2c/util/bla_z_div.h
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,9 +33,5 @@
 
 */
 
-#ifdef BLIS_ENABLE_BLAS
-
 void bla_z_div(bla_dcomplex *cp, const bla_dcomplex *ap, const bla_dcomplex *bp);
 
-#endif
-
diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h
index dd6e8f6062..b9df252bda 100644
--- a/frame/include/bli_config_macro_defs.h
+++ b/frame/include/bli_config_macro_defs.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -171,6 +171,16 @@
   #define BLIS_ENABLE_BLAS
 #endif
 
+#ifdef BLIS_ENABLE_BLAS /* BLAS layer enabled: helper macros resolve to the BLAS-mangled lsame/xerbla names */
+  #define IF_BLIS_ENABLE_BLAS(...) __VA_ARGS__ /* expands to its arguments */
+  #define PASTE_LSAME PASTEF770(lsame)
+  #define PASTE_XERBLA PASTEF770(xerbla)
+#else /* BLAS layer disabled: helper macros resolve to the _blis_impl names instead */
+  #define IF_BLIS_ENABLE_BLAS(...) /* expands to nothing */
+  #define PASTE_LSAME lsame_blis_impl
+  #define PASTE_XERBLA xerbla_blis_impl
+#endif /* BLIS_ENABLE_BLAS */
+
 // The bit size of the integer type used to track values such as dimensions and
 // leading dimensions (ie: column strides) within the BLAS compatibility layer.
 // A value of 32 results in the compatibility layer using 32-bit signed integers
diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h
index 49c79cb8ae..96a658110d 100644
--- a/frame/include/bli_gentfunc_macro_defs.h
+++ b/frame/include/bli_gentfunc_macro_defs.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020 - 23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -154,13 +154,10 @@ GENTFUNCR2( dcomplex, double, z, d, blasname, blisname )
 
 // -- Extended two-operand macro (used only for scal) --
 
-
-#define INSERT_GENTFUNCSCAL_BLAS_CZ( blasname, blisname ) \
+#define INSERT_GENTFUNCSCAL_BLAS_C( blasname, blisname ) \
 \
 GENTFUNCSCAL( scomplex, scomplex, c,  , blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, dcomplex, z,  , blasname, blisname ) \
-GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname ) \
-GENTFUNCSCAL( dcomplex, double,   z, d, blasname, blisname )
+GENTFUNCSCAL( scomplex, float,    c, s, blasname, blisname )
 
 
 #define INSERT_GENTFUNCSCAL_BLAS( blasname, blisname ) \
diff --git a/frame/include/bli_gentprot_macro_defs.h b/frame/include/bli_gentprot_macro_defs.h
index f9d1caa56a..703bef68d8 100644
--- a/frame/include/bli_gentprot_macro_defs.h
+++ b/frame/include/bli_gentprot_macro_defs.h
@@ -1,3 +1,4 @@
+
 /*
 
    BLIS
@@ -5,7 +6,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -56,6 +57,11 @@ GENTPROT( double,   d, blasname ) \
 GENTPROT( scomplex, c, blasname ) \
 GENTPROT( dcomplex, z, blasname )
 
+#define INSERT_GENTPROT_BLAS_CZ( blasname ) \
+\
+GENTPROT( scomplex, c, blasname ) \
+GENTPROT( dcomplex, z, blasname )
+
 
 // -- Basic one-operand macro with real domain only --
 
diff --git a/frame/include/bli_macro_defs.h b/frame/include/bli_macro_defs.h
index f29fdc1fe4..e131acb4ac 100644
--- a/frame/include/bli_macro_defs.h
+++ b/frame/include/bli_macro_defs.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -158,17 +158,25 @@
 #define PASTEMACT(ch1, ch2, ch3, ch4)   bli_ ## ch1 ## ch2 ## _ ## ch3 ## _ ## ch4
 // name-mangling macros.
 #ifdef BLIS_ENABLE_NO_UNDERSCORE_API
-#define PASTEF770(name)                                      name
-#define PASTEF77(ch1,name)                     ch1        ## name
-#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name
-#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name
+#define PASTEF770(name)                                                  name
+#define PASTEF77(ch1,name)                                        ch1 ## name
+#define PASTEF772(ch1,ch2,name)                            ch1 ## ch2 ## name
+#define PASTEF773(ch1,ch2,ch3,name)                 ch1 ## ch2 ## ch3 ## name
 #else
-#define PASTEF770(name)                                      name ## _
-#define PASTEF77(ch1,name)                     ch1        ## name ## _
-#define PASTEF772(ch1,ch2,name)                ch1 ## ch2 ## name ## _
-#define PASTEF773(ch1,ch2,ch3,name)     ch1 ## ch2 ## ch3 ## name ## _
+#define PASTEF770(name)                                            name ## _
+#define PASTEF77(ch1,name)                                  ch1 ## name ## _
+#define PASTEF772(ch1,ch2,name)                      ch1 ## ch2 ## name ## _
+#define PASTEF773(ch1,ch2,ch3,name)           ch1 ## ch2 ## ch3 ## name ## _
 #endif
 
+// Macros that build names carrying the _blis_impl suffix. Each *_blis_impl
+// routine is the BLIS implementation of the respective API, which is invoked
+// from the CBLAS and BLAS wrappers.
+#define PASTEF770S(name)                                   name ## _blis_impl
+#define PASTEF77S(ch1,name)                         ch1 ## name ## _blis_impl
+#define PASTEF772S(ch1,ch2,name)             ch1 ## ch2 ## name ## _blis_impl
+#define PASTEF773S(ch1,ch2,ch3,name)  ch1 ## ch2 ## ch3 ## name ## _blis_impl
+
 // -- Include other groups of macros
 
 #include "bli_genarray_macro_defs.h"
@@ -188,7 +196,10 @@
 #include "bli_oapi_macro_defs.h"
 #include "bli_tapi_macro_defs.h"
 
+
 #ifdef BLIS_ENABLE_NO_UNDERSCORE_API
+
+#ifdef BLIS_ENABLE_BLAS
 #define isamax_ isamax
 #define idamax_ idamax
 #define icamax_ icamax
@@ -299,6 +310,7 @@
 #define ctrsm_  ctrsm
 #define ztrsm_  ztrsm
 #define lsame_  lsame
+
 #define cimatcopy_    cimatcopy
 #define comatadd_     comatadd
 #define comatcopy2_   comatcopy2
@@ -315,9 +327,14 @@
 #define zomatadd_     zomatadd
 #define zomatcopy2_   zomatcopy2
 #define zomatcopy_    zomatcopy
-#endif
+
+#endif // BLIS_ENABLE_BLAS
+#endif // BLIS_ENABLE_NO_UNDERSCORE_API
+
 
 #ifdef BLIS_ENABLE_UPPERCASE_API
+
+#ifdef BLIS_ENABLE_BLAS
 #define caxpby                    CAXPBY
 #define caxpy                     CAXPY
 #define ccopy                     CCOPY
@@ -528,5 +545,6 @@
 #define ztrsv                     ZTRSV
 #endif
 
-#endif
+#endif // BLIS_ENABLE_BLAS
+#endif // BLIS_ENABLE_UPPERCASE_API
 
diff --git a/frame/include/bli_trsm_small_ref.h b/frame/include/bli_trsm_small_ref.h
new file mode 100644
index 0000000000..715db884e3
--- /dev/null
+++ b/frame/include/bli_trsm_small_ref.h
@@ -0,0 +1,129 @@
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+  #define DIAG_ELE_INV_OPS(a, b) (a / b)
+  #define DIAG_ELE_EVAL_OPS(a, b) (a * b)
+#endif
+
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+  #define DIAG_ELE_INV_OPS(a, b) (a * b)
+  #define DIAG_ELE_EVAL_OPS(a, b) (a / b)
+#endif
+
+// reference code for LUTN
+BLIS_INLINE err_t dtrsm_AutXB_ref
+   (
+      double *A,
+      double *B,
+      dim_t M,
+      dim_t N,
+      dim_t lda,
+      dim_t ldb,
+      bool unitDiagonal
+   )
+{
+  dim_t i, j, k;
+  for (k = 0; k < M; k++)
+  {
+    double lkk_inv = 1.0;
+    if (!unitDiagonal)
+      lkk_inv = DIAG_ELE_INV_OPS(lkk_inv, A[k + k * lda]);
+    for (j = 0; j < N; j++)
+    {
+      B[k + j * ldb] = DIAG_ELE_EVAL_OPS(B[k + j * ldb], lkk_inv);
+      for (i = k + 1; i < M; i++)
+      {
+        B[i + j * ldb] -= A[i * lda + k] * B[k + j * ldb];
+      }
+    }
+  } // k -loop
+  return BLIS_SUCCESS;
+}
+
+// reference code for LLNN
+BLIS_INLINE err_t dtrsm_AlXB_ref
+   (
+      double *A,
+      double *B,
+      dim_t M,
+      dim_t N,
+      dim_t lda,
+      dim_t ldb,
+      bool is_unitdiag
+    )
+{
+  dim_t i, j, k;
+  for (k = 0; k < M; k++)
+  {
+    double lkk_inv = 1.0;
+    if (!is_unitdiag)
+      lkk_inv = DIAG_ELE_INV_OPS(lkk_inv, A[k + k * lda]);
+    for (j = 0; j < N; j++)
+    {
+      B[k + j * ldb] = DIAG_ELE_EVAL_OPS(B[k + j * ldb], lkk_inv);
+      for (i = k + 1; i < M; i++)
+      {
+        B[i + j * ldb] -= A[i + k * lda] * B[k + j * ldb];
+      }
+    }
+  } // k -loop
+  return BLIS_SUCCESS;
+}
+
+// reference code for LUNN
+BLIS_INLINE err_t dtrsm_AuXB_ref
+   (
+     double *A,
+     double *B,
+     dim_t M,
+     dim_t N,
+     dim_t lda,
+     dim_t ldb,
+     bool is_unitdiag
+   )
+{
+  dim_t i, j, k;
+  for (k = M - 1; k >= 0; k--)
+  {
+    double lkk_inv = 1.0;
+    if (!is_unitdiag)
+      lkk_inv = DIAG_ELE_INV_OPS(lkk_inv, A[k + k * lda]);
+    for (j = N - 1; j >= 0; j--)
+    {
+      B[k + j * ldb] = DIAG_ELE_EVAL_OPS(B[k + j * ldb], lkk_inv);
+      for (i = k - 1; i >= 0; i--)
+      {
+        B[i + j * ldb] -= A[i + k * lda] * B[k + j * ldb];
+      }
+    }
+  } // k -loop
+  return BLIS_SUCCESS;
+} // end of function
+
+// reference code for LLTN
+BLIS_INLINE err_t dtrsm_AltXB_ref
+   (
+     double *A,
+     double *B,
+     dim_t M,
+     dim_t N,
+     dim_t lda,
+     dim_t ldb,
+     bool is_unitdiag
+   )
+{
+  dim_t i, j, k;
+  for (k = M - 1; k >= 0; k--)
+  {
+    double lkk_inv = 1.0;
+    if (!is_unitdiag)
+      lkk_inv = DIAG_ELE_INV_OPS(lkk_inv, A[k + k * lda]);
+    for (j = N - 1; j >= 0; j--)
+    {
+      B[k + j * ldb] = DIAG_ELE_EVAL_OPS(B[k + j * ldb], lkk_inv);
+      for (i = k - 1; i >= 0; i--)
+      {
+        B[i + j * ldb] -= A[i * lda + k] * B[k + j * ldb];
+      }
+    }
+  } // k -loop
+  return BLIS_SUCCESS;
+} // end of function
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h
index 89f9aada33..4a4dc0ec4e 100644
--- a/frame/include/bli_type_defs.h
+++ b/frame/include/bli_type_defs.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2021 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -756,6 +756,7 @@ typedef enum
 	BLIS_PACKM_29XK_KER = 29,
 	BLIS_PACKM_30XK_KER = 30,
 	BLIS_PACKM_31XK_KER = 31,
+	BLIS_PACKM_32XK_KER = 32,
 
 	BLIS_UNPACKM_0XK_KER  = 0,
 	BLIS_UNPACKM_1XK_KER  = 1,
@@ -792,7 +793,7 @@ typedef enum
 
 } l1mkr_t;
 
-#define BLIS_NUM_PACKM_KERS   32
+#define BLIS_NUM_PACKM_KERS   33
 #define BLIS_NUM_UNPACKM_KERS 32
 
 
@@ -803,7 +804,7 @@ typedef enum
 	BLIS_GEMMTRSM_U_UKR,
 	BLIS_TRSM_L_UKR,
 	BLIS_TRSM_U_UKR,
-	BLIS_GEMM_AVX2_UKR
+	BLIS_GEMM_FOR_TRSM_UKR
 } l3ukr_t;
 
 #define BLIS_NUM_LEVEL3_UKRS 6
@@ -1045,6 +1046,32 @@ typedef enum
 #define BLIS_NUM_ARCHS (BLIS_ARCH_GENERIC_LAST + 1)
 
 
+typedef enum
+{
+	// Initial value, will be selected for an unrecognized (non-integer)
+	// value of BLIS_MODEL_TYPE
+	BLIS_MODEL_ERROR,
+
+	// Default model
+	BLIS_MODEL_DEFAULT,
+
+	// AMD Zen4
+	BLIS_MODEL_GENOA,
+	BLIS_MODEL_BERGAMO,
+	BLIS_MODEL_GENOA_X,
+
+	// AMD Zen3
+	BLIS_MODEL_MILAN,
+	BLIS_MODEL_MILAN_X,
+
+	// Dummy value, always the last one.
+	// In config_name in bli_arch.c this is also set to "generic"
+	BLIS_MODEL_DEFAULT_LAST
+
+} model_t;
+
+#define BLIS_NUM_MODELS (BLIS_MODEL_DEFAULT_LAST + 1)
+
 //
 // -- BLIS misc. structure types -----------------------------------------------
 //
@@ -1599,6 +1626,7 @@ typedef enum
 	// Architecture-related errors
 	BLIS_INVALID_ARCH_ID                       = (-150),
 	BLIS_UNINITIALIZED_GKS_CNTX                = (-151),
+	BLIS_INVALID_MODEL_ID                      = (-152),
 
 	// Blocksize-related errors
 	BLIS_MC_DEF_NONMULTIPLE_OF_MR              = (-160),
diff --git a/frame/include/bli_x86_asm_macros.h b/frame/include/bli_x86_asm_macros.h
index ffb2771758..84bc76c21d 100644
--- a/frame/include/bli_x86_asm_macros.h
+++ b/frame/include/bli_x86_asm_macros.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2018, The University of Texas at Austin
-   Copyright (C) 2019-22, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -884,6 +884,7 @@
 #define VADDSUBPS(_0, _1, _2) INSTR_(vaddsubps, _0, _1, _2)
 #define VADDSUBPD(_0, _1, _2) INSTR_(vaddsubpd, _0, _1, _2)
 #define VHADDPD(_0, _1, _2) INSTR_(vhaddpd, _0, _1, _2)
+#define VHSUBPD(_0, _1, _2) INSTR_(vhsubpd, _0, _1, _2)
 #define VHADDPS(_0, _1, _2) INSTR_(vhaddps, _0, _1, _2)
 #define VADDPS(_0, _1, _2) INSTR_(vaddps, _0, _1, _2)
 #define VADDPD(_0, _1, _2) INSTR_(vaddpd, _0, _1, _2)
@@ -1014,6 +1015,7 @@
 #define vaddsubps(_0, _1, _2) VADDSUBPS(_0, _1, _2)
 #define vaddsubpd(_0, _1, _2) VADDSUBPD(_0, _1, _2)
 #define vhaddpd(_0, _1, _2) VHADDPD(_0, _1, _2)
+#define vhsubpd(_0, _1, _2) VHSUBPD(_0, _1, _2)
 #define vhaddps(_0, _1, _2) VHADDPS(_0, _1, _2)
 #define vaddps(_0, _1, _2) VADDPS(_0, _1, _2)
 #define vaddpd(_0, _1, _2) VADDPD(_0, _1, _2)
@@ -1201,7 +1203,7 @@
 #define VEXTRACTF128(_0, _1, _2) INSTR_(vextractf128, _0, _1, _2)
 #define VEXTRACTF32X4(_0, _1, _2) INSTR_(vextractf32x4, _0, _1, _2)
 #define VEXTRACTF32X8(_0, _1, _2) INSTR_(vextractf32x8, _0, _1, _2)
-#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2)
+#define VEXTRACTF64X2(_0, _1, _2) INSTR_(vextractf64x2, _0, _1, _2)
 #define VEXTRACTF64X4(_0, _1, _2) INSTR_(vextractf64x4, _0, _1, _2)
 #define VBLENDPS(_0, _1, _2, _3) INSTR_(vblendps, _0, _1, _2, _3)
 #define VBLENDPD(_0, _1, _2, _3) INSTR_(vblendpd, _0, _1, _2, _3)
@@ -1218,18 +1220,18 @@
 #define vunpckhps(_0, _1, _2) VUNPCKHPS(_0, _1, _2)
 #define vunpcklpd(_0, _1, _2) VUNPCKLPD(_0, _1, _2)
 #define vunpckhpd(_0, _1, _2) VUNPCKHPD(_0, _1, _2)
-#define vshuff32x4(_0, _1, _2, _3) VSHUFF32x4(_0, _1, _2, _3)
-#define vshuff64x2(_0, _1, _2, _3) VSHUFF64x2(_0, _1, _2, _3)
+#define vshuff32x4(_0, _1, _2, _3) VSHUFF32X4(_0, _1, _2, _3)
+#define vshuff64x2(_0, _1, _2, _3) VSHUFF64X2(_0, _1, _2, _3)
 #define vinsertf128(_0, _1, _2, _3) VINSERTF128(_0, _1, _2, _3)
-#define vinsertf32x4(_0, _1, _2, _3) VINSERTF32x4(_0, _1, _2, _3)
-#define vinsertf32x8(_0, _1, _2, _3) VINSERTF32x8(_0, _1, _2, _3)
-#define vinsertf64x2(_0, _1, _2, _3) VINSERTF64x2(_0, _1, _2, _3)
-#define vinsertf64x4(_0, _1, _2, _3) VINSERTF64x4(_0, _1, _2, _3)
+#define vinsertf32x4(_0, _1, _2, _3) VINSERTF32X4(_0, _1, _2, _3)
+#define vinsertf32x8(_0, _1, _2, _3) VINSERTF32X8(_0, _1, _2, _3)
+#define vinsertf64x2(_0, _1, _2, _3) VINSERTF64X2(_0, _1, _2, _3)
+#define vinsertf64x4(_0, _1, _2, _3) VINSERTF64X4(_0, _1, _2, _3)
 #define vextractf128(_0, _1, _2) VEXTRACTF128(_0, _1, _2)
-#define vextractf32x4(_0, _1, _2) VEXTRACTF32x4(_0, _1, _2)
-#define vextractf32x8(_0, _1, _2) VEXTRACTF32x8(_0, _1, _2)
-#define vextractf64x2(_0, _1, _2) VEXTRACTF64x2(_0, _1, _2)
-#define vextractf64x4(_0, _1, _2) VEXTRACTF64x4(_0, _1, _2)
+#define vextractf32x4(_0, _1, _2) VEXTRACTF32X4(_0, _1, _2)
+#define vextractf32x8(_0, _1, _2) VEXTRACTF32X8(_0, _1, _2)
+#define vextractf64x2(_0, _1, _2) VEXTRACTF64X2(_0, _1, _2)
+#define vextractf64x4(_0, _1, _2) VEXTRACTF64X4(_0, _1, _2)
 #define vblendps(_0, _1, _2, _3) VBLENDPS(_0, _1, _2, _3)
 #define vblendpd(_0, _1, _2, _3) VBLENDPD(_0, _1, _2, _3)
 #define vblendmps(_0, _1, _2) VBLENDMSD(_0, _1, _2)
diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c
index b01c208a30..14b2bd04af 100644
--- a/frame/thread/bli_l3_decor_openmp.c
+++ b/frame/thread/bli_l3_decor_openmp.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -145,7 +145,7 @@ void bli_l3_thread_decorator
 		tls_aoclprogress_counter = 0;
 
 		// We send the update only after certain threshold is reached, 
-		// The thresold is defined as AOCL_PROGRESS_FREQUENCY. 
+		// The threshold is defined as AOCL_PROGRESS_FREQUENCY.
 		// This variable stores the counter value when last update was sent. 
 		// It is compared with current counter value to see if it is time to
 		// send the next update.
diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c
index 444583e73e..0a4d16c223 100644
--- a/frame/thread/bli_l3_decor_single.c
+++ b/frame/thread/bli_l3_decor_single.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -121,7 +121,7 @@ void bli_l3_thread_decorator
 		tls_aoclprogress_counter = 0;
 		
 		// We send the update only after certain threshold is reached, 
-		// The thresold is defined as AOCL_PROGRESS_FREQUENCY. 
+		// The threshold is defined as AOCL_PROGRESS_FREQUENCY.
 		// This variable stores the counter value when last update was sent. 
 		// It is compared with current counter value to see if it is time to
 		// send the next update.
diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c
index f721bae7e6..60c2b37f34 100644
--- a/frame/thread/bli_thread.c
+++ b/frame/thread/bli_thread.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 22, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -42,6 +42,10 @@ thrcomm_t BLIS_SINGLE_COMM           = {};
 // The global rntm_t structure. (The definition resides in bli_rntm.c.)
 extern rntm_t global_rntm;
 
+// Make thread settings local to each thread calling BLIS routines.
+// (The definition resides in bli_rntm.c.)
+extern BLIS_THREAD_LOCAL rntm_t tl_rntm;
+
 // A mutex to allow synchronous access to global_rntm. (The definition
 // resides in bli_rntm.c.)
 extern bli_pthread_mutex_t global_rntm_mutex;
@@ -54,15 +58,31 @@ void bli_thread_init( void )
 	bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED );
 	bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED );
 
-	// Read the environment variables and use them to initialize the
-	// global runtime object.
+	// Read the BLIS environment variables and (optionally) OpenMP ICVs and
+	// use them to initialize the global runtime object.
+	// The thread local global runtime object will be initialized from this
+	// to ensure all thread-local copies get information from any BLIS environment
+	// variables set, as these are not re-read for performance reasons.
 	bli_thread_init_rntm_from_env( &global_rntm );
+	// Initialize tl_rntm.
+	bli_thread_update_rntm_from_env( &tl_rntm );
+}
+
+void bli_thread_update_tl( void )
+{
+	// Update the thread local global runtime object from any runtime BLIS
+	// or OpenMP calls or nested parallelism.
+	bli_thread_update_rntm_from_env( &tl_rntm );
 }
 
 void bli_thread_finalize( void )
 {
 }
 
+void bli_thread_finalize_tl( void )
+{
+}
+
 // -----------------------------------------------------------------------------
 
 void bli_thread_range_sub
@@ -1510,77 +1530,94 @@ dim_t bli_ipow( dim_t base, dim_t power )
 
 dim_t bli_thread_get_jc_nt( void )
 {
-	// We must ensure that global_rntm has been initialized.
+	// We must ensure that tl_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_jc_ways( &global_rntm );
+	return bli_rntm_jc_ways( &tl_rntm );
 }
 
 dim_t bli_thread_get_pc_nt( void )
 {
-	// We must ensure that global_rntm has been initialized.
+	// We must ensure that tl_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_pc_ways( &global_rntm );
+	return bli_rntm_pc_ways( &tl_rntm );
 }
 
 dim_t bli_thread_get_ic_nt( void )
 {
-	// We must ensure that global_rntm has been initialized.
+	// We must ensure that tl_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_ic_ways( &global_rntm );
+	return bli_rntm_ic_ways( &tl_rntm );
 }
 
 dim_t bli_thread_get_jr_nt( void )
 {
-	// We must ensure that global_rntm has been initialized.
+	// We must ensure that tl_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_jr_ways( &global_rntm );
+	return bli_rntm_jr_ways( &tl_rntm );
 }
 
 dim_t bli_thread_get_ir_nt( void )
 {
-	// We must ensure that global_rntm has been initialized.
+	// We must ensure that tl_rntm has been initialized.
 	bli_init_once();
 
-	return bli_rntm_ir_ways( &global_rntm );
+	return bli_rntm_ir_ways( &tl_rntm );
 }
 
 dim_t bli_thread_get_num_threads( void )
 {
-	// We must ensure that global_rntm has been initialized.
+	// We must ensure that tl_rntm has been initialized.
 	bli_init_once();
+	// Must also update tl_rntm as value may have been updated
+	// by OpenMP or BLIS runtime calls.
+	bli_thread_update_tl();
 
-	return bli_rntm_num_threads( &global_rntm );
+	return bli_rntm_num_threads( &tl_rntm );
 }
 
 bool bli_thread_get_is_parallel( void ) // VK
 {
-  //  THis function return true if parallelism is enabled
-  // either by OMP_NUM_THREADS or BLIS_NUM_THREADS or BLIS_?C_NT parameters
-  // When parallelism is enabled using BLIS_IC_NT or BLIS_JC_NT
-  // rntm->num_threads = -1, because num_threads is still not derived
-  // at the BLAS interface, as a result we end up running BLIS sequentially.
-  // In dgemm_ we called bli_thread_get_num_threads() which returns num_threads from
-  // global_rntm.
-  // Therefore this function is added to check whether manual thread factorization
-  // is enabled
-
-  // We must ensure that global_rntm has been initialized.
+	// This function returns true if parallelism is enabled
+	// either by OMP_NUM_THREADS or BLIS_NUM_THREADS or BLIS_?C_NT parameters
+	// When parallelism is enabled using BLIS_IC_NT or BLIS_JC_NT
+	// rntm->num_threads = -1, because num_threads is still not derived
+	// at the BLAS interface, as a result we end up running BLIS sequentially.
+	// In dgemm_ we called bli_thread_get_num_threads() which returns num_threads from
+	// tl_rntm.
+	// Therefore this function is added to check whether manual thread factorization
+	// is enabled.
+	//
+	// Note: This function CANNOT be used inside the parallelism within a
+	// BLIS function, as global_rntm may not have correct OpenMP parallelism
+	// information, and initializing new TLS tl_rntm on the threads created
+	// by the parallel region will not get the correct OpenMP information
+	// for this region from the inside.
+	// In other words, this function reports whether parallelism will exist
+	// in a new parallel region.
+
+	// We must ensure that tl_rntm has been initialized and is up-to-date.
 	bli_init_once();
+	bli_thread_update_tl();
 
-  dim_t jc = bli_rntm_jc_ways( &global_rntm );
-  dim_t pc = bli_rntm_pc_ways( &global_rntm );
-  dim_t ic = bli_rntm_ic_ways( &global_rntm );
-  dim_t jr = bli_rntm_jr_ways( &global_rntm );
-  dim_t ir = bli_rntm_ir_ways( &global_rntm );
+	dim_t jc = bli_rntm_jc_ways( &tl_rntm );
+	dim_t pc = bli_rntm_pc_ways( &tl_rntm );
+	dim_t ic = bli_rntm_ic_ways( &tl_rntm );
+	dim_t jr = bli_rntm_jr_ways( &tl_rntm );
+	dim_t ir = bli_rntm_ir_ways( &tl_rntm );
 
-  dim_t nt = bli_rntm_num_threads( &global_rntm );
+	dim_t nt = bli_rntm_num_threads( &tl_rntm );
 
-  if ( nt > 1 || (jc * pc * ic * jr * ir) > 1 ) return 1;
-  return 0; // else
+#ifdef PRINT_THREADING
+	printf( "bli_thread_get_is_parallel(): tl_rntm\n" );
+	bli_rntm_print( &tl_rntm );
+#endif
+
+	if ( nt > 1 || (jc * pc * ic * jr * ir) > 1 ) return 1;
+	return 0; // else
 }
 
 // ----------------------------------------------------------------------------
@@ -1590,13 +1627,30 @@ void bli_thread_set_ways( dim_t jc, dim_t pc, dim_t ic, dim_t jr, dim_t ir )
 	// We must ensure that global_rntm has been initialized.
 	bli_init_once();
 
+	// Update global_rntm so any threads spawned after this call
+	// inherit the values set here.
+
 	// Acquire the mutex protecting global_rntm.
 	bli_pthread_mutex_lock( &global_rntm_mutex );
 
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, &global_rntm );
 
+	// BLIS_NUM_THREADS env variable or BLIS API to set the
+	// number of threads is used. Setting the blis_mt flag to TRUE
+	// so that OMP API or OMP env variables will not be of effect
+	// going forward.
+	bli_rntm_set_blis_mt_only( TRUE, &global_rntm );
+
+	// Unset num_threads value here?
+	//bli_rntm_set_num_threads_only( -1, &global_rntm );
+
 	// Release the mutex protecting global_rntm.
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
+
+#ifdef PRINT_THREADING
+	printf( "bli_thread_set_ways(): global_rntm\n" );
+	bli_rntm_print( &global_rntm );
+#endif
 }
 
 void bli_thread_set_num_threads( dim_t n_threads )
@@ -1609,6 +1663,9 @@ void bli_thread_set_num_threads( dim_t n_threads )
 		n_threads = 1;
 	}
 
+	// Update global_rntm so any threads spawned after this call
+	// inherit the value set here.
+
 	// Acquire the mutex protecting global_rntm.
 	bli_pthread_mutex_lock( &global_rntm_mutex );
 
@@ -1618,10 +1675,18 @@ void bli_thread_set_num_threads( dim_t n_threads )
 	// number of threads is used. Setting the blis_mt flag to TRUE
 	// so that OMP API or OMP env variables will not be of effect
 	// going forward.
-	bli_rntm_set_blis_mt_only(TRUE, &global_rntm);
+	bli_rntm_set_blis_mt_only( TRUE, &global_rntm );
+
+	// Unset ways values here?
+	//bli_rntm_set_ways_only( -1, -1, -1, -1, -1, &global_rntm );
 
 	// Release the mutex protecting global_rntm.
 	bli_pthread_mutex_unlock( &global_rntm_mutex );
+
+#ifdef PRINT_THREADING
+	printf( "bli_thread_set_num_threads(): global_rntm\n" );
+	bli_rntm_print( &global_rntm );
+#endif
 }
 
 // ----------------------------------------------------------------------------
@@ -1631,13 +1696,13 @@ void bli_thread_init_rntm_from_env
        rntm_t* rntm
      )
 {
+	// Initialize global_rntm from environment.
 	// NOTE: We don't need to acquire the global_rntm_mutex here because this
 	// function is only called from bli_thread_init(), which is only called
 	// by bli_init_once().
 
 	bool  auto_factor = FALSE;
-	dim_t nt;
-	dim_t jc, pc, ic, jr, ir;
+	dim_t jc, pc, ic, jr, ir, nt;
 
 #ifdef BLIS_ENABLE_MULTITHREADING
 
@@ -1670,6 +1735,11 @@ void bli_thread_init_rntm_from_env
 	// omp_set_num_threads(nt) is not issued by application,
 	// omp_get_max_threads() API will return the number of the cores in the current context.
 	//
+	// Note: omp_get_max_threads() alone is not sufficient to determine the number of threads
+	//       that should be used for a new parallel region. We must also consider the number
+	//       of active levels of OpenMP parallelism and which level we are at, using APIs
+	//       omp_get_max_active_levels() and omp_get_active_level().
+	//
 	// BLIS will initialize rntm->num_threads with the same value.
 	// However if omp_set_nested is false - BLIS APIs called from parallel threads will run in sequential.
 	// But if nested parallelism is enabled - Then each application will launch MT BLIS.
@@ -1703,7 +1773,16 @@ void bli_thread_init_rntm_from_env
 		bli_rntm_set_blis_mt_only(FALSE, rntm);
 
 #ifdef BLIS_ENABLE_OPENMP
-		nt = omp_get_max_threads();
+		dim_t active_level = omp_get_active_level();
+		dim_t max_levels = omp_get_max_active_levels();
+		if ( active_level < max_levels )
+		{
+		      nt = omp_get_max_threads();
+		} else {
+		      nt = 1;
+		}
+#else
+		nt = 1;
 #endif
 	}
 
@@ -1729,6 +1808,9 @@ void bli_thread_init_rntm_from_env
 
 		// Unset the value for nt.
 		nt = -1;
+
+		// Ensure blis_mt is set to TRUE.
+		bli_rntm_set_blis_mt_only(TRUE, rntm);
 	}
 
 	// By this time, one of the following conditions holds:
@@ -1738,7 +1820,8 @@ void bli_thread_init_rntm_from_env
 
 	// If nt is set (ie: not -1), then we know we will perform an automatic
 	// thread factorization (later, in bli_rntm.c).
-	if ( nt != -1 ) auto_factor = TRUE;
+	// However, there is no need to run auto_factor if nt=1
+	if ( nt > 1 ) auto_factor = TRUE;
 
 #else
 
@@ -1747,16 +1830,271 @@ void bli_thread_init_rntm_from_env
 	nt = -1;
 	jc = pc = ic = jr = ir = 1;
 
+#endif // BLIS_ENABLE_MULTITHREADING
+
+	// Save the results back in the runtime object.
+	bli_rntm_set_auto_factor_only( auto_factor, rntm );
+	bli_rntm_set_num_threads_only( nt, rntm );
+	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+
+#ifdef PRINT_THREADING
+	printf( "bli_thread_init_rntm_from_env(): global_rntm\n" );
+	bli_rntm_print( rntm );
+#endif
+}
+
+void bli_thread_update_rntm_from_env
+     (
+       rntm_t* rntm
+     )
+{	
+	// Update tl_rntm for this user thread from runtime environment and
+	// current status of global_rntm. Must do this every time, in case
+	// global_rntm has been updated by blis-specific threading function calls.
+
+	// NOTE: We don't need to acquire the global_rntm_mutex here because this
+	// function is updating the thread local tl_rntm (not global_rntm).
+
+	bool auto_factor = FALSE;
+	dim_t jc, pc, ic, jr, ir, nt;
+	bool blis_mt;
+
+	// Extract threading data from global_rntm.
+	nt = bli_rntm_num_threads( &global_rntm );
+	jc = bli_rntm_jc_ways( &global_rntm );
+	pc = bli_rntm_pc_ways( &global_rntm );
+	ic = bli_rntm_ic_ways( &global_rntm );
+	jr = bli_rntm_jr_ways( &global_rntm );
+	ir = bli_rntm_ir_ways( &global_rntm );
+	blis_mt = bli_rntm_blis_mt( &global_rntm );
+
+#ifdef BLIS_ENABLE_MULTITHREADING
+
+	// Environment variables BLIS_NUM_THREADS and BLIS_*_NT have been read
+	// by bli_thread_init_rntm_from_env(). Don't incur overhead re-reading
+	// them here.
+
+	// Scenarios:
+	// 1. If BLIS_NUM_THREADS is set with a valid value, same value
+	// will be used in the subsequent parallel regions unless
+	// bli_thread_set_num_threads() API is used by the Application
+	// to modify the desired number of threads during BLIS API execution.
+	//
+	// 2. Once BLIS_NUM_THREADS environment variable or bli_thread_set_num_threads(nt)
+	// API is used by the application, BLIS module would always give precedence to
+	// these values. BLIS API would not consider the values set using OpenMP API
+	// omp_set_num_threads(nt) API or OMP_NUM_THREADS environment variable.
+	//
+	// 3. If Application wants to allocate separate number of threads for BLIS API execution
+	// and application, Application can choose either BLIS_NUM_THREADS environment variable
+	// or bli_thread_set_num_threads(nt) API, to set the desired number of threads
+	// in BLIS API Execution. Application can use OpenMP APIs or environment variables for
+	// itself.
+	//
+	// 4. If BLIS_NUM_THREADS is not set, then if Application is multithreaded and issued
+	// omp_set_num_threads(nt) with desired number of threads,
+	// omp_get_max_threads() API will fetch the number of threads set earlier.
+	//
+	// 5. If BLIS_NUM_THREADS is not set, omp_set_num_threads(nt) is not called by the application,
+	// but only OMP_NUM_THREADS is set,
+	// omp_get_max_threads() API will fetch the value of OMP_NUM_THREADS.
+	//
+	// 6. If both environment variables are not set, or if they are set with invalid values, and
+	// omp_set_num_threads(nt) is not issued by application,
+	// omp_get_max_threads() API will return the number of the cores in the current context.
+	//
+	// Note: omp_get_max_threads() alone is not sufficient to determine the number of threads
+	//       that should be used for a new parallel region. We must also consider the number
+	//       of active levels of OpenMP parallelism and which level we are at, using APIs
+	//       omp_get_max_active_levels() and omp_get_active_level().
+	//
+	// BLIS will initialize rntm->num_threads with the same value.
+	// However if omp_set_nested is false - BLIS APIs called from parallel threads will run in sequential.
+	// But if nested parallelism is enabled - Then each application will launch MT BLIS.
+	//
+	// Order of precedence used for number of threads:
+	// 0. valid value set using bli_thread_set_num_threads(nt) by the application
+	// 1. valid value set for BLIS_NUM_THREADS environment variable
+	// 2. omp_set_num_threads(nt) issued by the application
+	// 3. valid value set for OMP_NUM_THREADS environment variable
+	// 4. Number of cores
+	//
+	// Note: If nt is not a valid value for omp_set_num_threads(nt) API, number of threads would be set to 1.
+	// omp_get_max_threads() API will return 1.
+	//
+	// OMP_NUM_THREADS environment variable is applicable only when OpenMP is enabled.
+
+	if(blis_mt)
+	{
+		// BLIS threading env vars and/or APIs have been used.
+
+		// If any BLIS_*_NT environment variable was set, then we ignore the
+		// value of BLIS_NUM_THREADS or OMP_NUM_THREADS and use the
+		// BLIS_*_NT values instead (with unset variables being treated as if
+		// they contained 1).
+		if ( jc != -1 || pc != -1 || ic != -1 || jr != -1 || ir != -1 )
+		{
+			if ( jc == -1 ) jc = 1;
+			if ( pc == -1 ) pc = 1;
+			if ( ic == -1 ) ic = 1;
+			if ( jr == -1 ) jr = 1;
+			if ( ir == -1 ) ir = 1;
+
+			// Unset the value for nt.
+			nt = -1;
+		}
+
+#ifdef BLIS_ENABLE_OPENMP
+		// If call is not from an active OpenMP level, then it will be
+		// serial irrespective of BLIS threading settings.
+		// Reminder that we are setting values here for tl_rntm, thus
+		// BLIS threading settings remain unchanged in global_rntm for
+		// consideration in future calls.
+		dim_t active_level = omp_get_active_level();
+		dim_t max_levels = omp_get_max_active_levels();
+		if ( active_level >= max_levels )
+		{
+			nt = -1;
+			jc = pc = ic = jr = ir = 1;
+		}
+#endif
+
+	}
+        else
+        {
+
+		// BLIS threading env vars and/or APIs have not been used.
+
+#ifdef BLIS_ENABLE_OPENMP
+		dim_t active_level = omp_get_active_level();
+		dim_t max_levels = omp_get_max_active_levels();
+		if ( active_level < max_levels )
+		{
+		      nt = omp_get_max_threads();
+		} else {
+		      nt = 1;
+		}
+#else
+		nt = 1;
 #endif
+	}
+
+	// By this time, one of the following conditions holds:
+	// - nt is -1 and the ways for each loop are -1.
+	// - nt is -1 and the ways for each loop are all set.
+	// - nt is set and the ways for each loop are -1.
+
+	// If nt is set (ie: not -1), then we know we will perform an automatic
+	// thread factorization (later, in bli_rntm.c).
+	// However, there is no need to run auto_factor if nt=1
+	if ( nt > 1 ) auto_factor = TRUE;
+
+#else
+
+	// When multithreading is disabled, always set the rntm_t ways
+	// values to 1.
+	nt = -1;
+	jc = pc = ic = jr = ir = 1;
+
+#endif // BLIS_ENABLE_MULTITHREADING
 
 	// Save the results back in the runtime object.
 	bli_rntm_set_auto_factor_only( auto_factor, rntm );
 	bli_rntm_set_num_threads_only( nt, rntm );
 	bli_rntm_set_ways_only( jc, pc, ic, jr, ir, rntm );
+	bli_rntm_set_blis_mt_only( blis_mt, rntm );
 
-#if 0
-	printf( "bli_thread_init_rntm_from_env()\n" );
+#ifdef PRINT_THREADING
+	printf( "bli_thread_update_rntm_from_env(): tl_rntm\n" );
 	bli_rntm_print( rntm );
 #endif
 }
 
+/*
+	Functionality:
+	--------------
+	This function calculates the amount of work the calling thread is supposed
+	to perform on a vector.
+
+	Function signature
+	-------------------
+
+	This function takes the following arguments (start and compute_len are outputs):
+
+	* n_elem - Number of elements in the vector
+	* t_count - Number of threads in the group
+	* start - Vector start index (where the thread should start its processing)
+	* compute_len - Size of the chunk it needs to process
+	* thread_id - ID of the thread
+
+	Exception
+	----------
+
+	None
+*/
+void bli_thread_vector_partition
+     (
+       dim_t 	n_elem,
+       dim_t	t_count,
+       dim_t* 	start,
+       dim_t* 	compute_len,
+       dim_t 	thread_id
+     )
+{
+	dim_t thread_min_work = n_elem / t_count;
+	dim_t remainder_work = n_elem % t_count;
+
+	// In this case the length of the vector will be remainder_work
+	if (thread_min_work == 0)
+	{
+		/*
+			Threads with ID less than the length of the vector will
+			perform the compute while the other threads will be idle
+		*/
+		if (thread_id < remainder_work)
+		{
+			  *start = thread_id;
+			  *compute_len = 1;
+		}
+		else
+		{
+			  *start = 0;
+			  *compute_len = 0;
+		}
+	}
+	else
+	{
+		if ( remainder_work == 0 )
+		{
+			*start = thread_min_work * thread_id;
+			*compute_len = thread_min_work;
+		}
+		else
+		{
+			/*
+				Scenario
+				--------
+
+				10 elements, 4 threads
+
+				Thread 0 - start = 0, compute_len = 2
+				Thread 1 - start = 2, compute_len = 2
+				Thread 2 - start = 4, compute_len = 3
+				Thread 3 - start = 7, compute_len = 3
+			*/
+			dim_t additional_work = t_count - remainder_work;
+
+			if (thread_id >= additional_work)
+			{
+				*start = (thread_min_work * thread_id) +
+						 (thread_id - (t_count - remainder_work));
+				*compute_len = thread_min_work + 1;
+			}
+			else
+			{
+				*start = thread_min_work * thread_id;
+				*compute_len = thread_min_work;
+			}
+		}
+	}
+}
diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h
index 2802676c26..00ae53dff4 100644
--- a/frame/thread/bli_thread.h
+++ b/frame/thread/bli_thread.h
@@ -6,7 +6,7 @@
 
    Copyright (C) 2014, The University of Texas at Austin
    Copyright (C) 2016, Hewlett Packard Enterprise Development LP
-   Copyright (C) 2018 - 21, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 23, Advanced Micro Devices, Inc. All rights reserved.
    
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -59,7 +59,9 @@
 
 // Initialization-related prototypes.
 void bli_thread_init( void );
+void bli_thread_update_tl( void );
 void bli_thread_finalize( void );
+void bli_thread_finalize_tl( void );
 
 // Thread range-related prototypes.
 
@@ -188,6 +190,14 @@ void bli_thread_partition_2x2_fast
        dim_t* restrict nt2
      );
 
+void bli_thread_vector_partition
+     (
+       dim_t  n_elem,
+       dim_t  t_count,
+       dim_t* start,
+       dim_t* compute_len,
+       dim_t  thread_id
+     );
 
 // -----------------------------------------------------------------------------
 
@@ -211,6 +221,8 @@ BLIS_EXPORT_BLIS void  bli_thread_set_num_threads( dim_t value );
 
 BLIS_EXPORT_BLIS void  bli_thread_init_rntm_from_env( rntm_t* rntm );
 
+BLIS_EXPORT_BLIS void  bli_thread_update_rntm_from_env( rntm_t* rntm );
+
 // -----------------------------------------------------------------------------
 
 BLIS_INLINE void bli_thread_range_jrir_rr
diff --git a/frame/util/bli_util.h b/frame/util/bli_util.h
index f7be273526..37bb9f92c7 100644
--- a/frame/util/bli_util.h
+++ b/frame/util/bli_util.h
@@ -65,4 +65,4 @@
 #include "bli_util_api_wrap.h"
 
 // Public interface for the progress feature
-#include "bli_util_progress.h"
\ No newline at end of file
+#include "bli_util_progress.h"
diff --git a/frame/util/bli_util_api_wrap.c b/frame/util/bli_util_api_wrap.c
index 81300761fb..21ccd2fb1c 100644
--- a/frame/util/bli_util_api_wrap.c
+++ b/frame/util/bli_util_api_wrap.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2021, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -38,2391 +38,2412 @@
 #include "blis.h"
 #include "bli_util_api_wrap.h"
 
+#ifdef BLIS_ENABLE_BLAS
+
 // wrapper functions to support additional symbols
 #ifndef BLIS_ENABLE_NO_UNDERSCORE_API
 #ifndef BLIS_ENABLE_UPPERCASE_API
 void CAXPY(const f77_int *n,const scomplex  *ca,const scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    caxpy_( n, ca, cx, incx, cy, incy);
+    caxpy_blis_impl( n, ca, cx, incx, cy, incy);
 }
 
 void caxpy(const f77_int *n,const scomplex  *ca,const scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    caxpy_( n, ca, cx, incx, cy, incy);
+    caxpy_blis_impl( n, ca, cx, incx, cy, incy);
 }
 
 void CAXPY_(const f77_int *n,const scomplex  *ca,const scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    caxpy_( n, ca, cx, incx, cy, incy);
+    caxpy_blis_impl( n, ca, cx, incx, cy, incy);
 }
 
 void CCOPY(const f77_int *n,const scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    ccopy_( n, cx, incx, cy, incy);
+    ccopy_blis_impl( n, cx, incx, cy, incy);
 }
 
 void ccopy(const f77_int *n,const scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    ccopy_( n, cx, incx, cy, incy);
+    ccopy_blis_impl( n, cx, incx, cy, incy);
 }
 
 void CCOPY_(const f77_int *n,const scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    ccopy_( n, cx, incx, cy, incy);
+    ccopy_blis_impl( n, cx, incx, cy, incy);
 }
 
 #ifdef BLIS_DISABLE_COMPLEX_RETURN_INTEL
 scomplex CDOTC(const f77_int* n,const scomplex*   x, const f77_int* incx,const scomplex*   y, const f77_int* incy)
 {
-    return cdotc_ ( n, x, incx, y, incy);
+    return cdotc_blis_impl ( n, x, incx, y, incy);
 }
 
 scomplex cdotc(const f77_int* n,const scomplex*   x, const f77_int* incx,const scomplex*   y, const f77_int* incy)
 {
-    return cdotc_ ( n, x, incx, y, incy);
+    return cdotc_blis_impl ( n, x, incx, y, incy);
 }
 
 scomplex CDOTC_(const f77_int* n,const scomplex*   x, const f77_int* incx,const scomplex*   y, const f77_int* incy)
 {
-    return cdotc_ ( n, x, incx, y, incy);
+    return cdotc_blis_impl ( n, x, incx, y, incy);
 }
 
 scomplex CDOTU(const f77_int* n,const scomplex*   x, const f77_int* incx,const scomplex*   y, const f77_int* incy)
 {
-    return cdotu_ ( n, x, incx, y, incy);
+    return cdotu_blis_impl ( n, x, incx, y, incy);
 }
 
 scomplex cdotu(const f77_int* n,const scomplex*   x, const f77_int* incx,const scomplex*   y, const f77_int* incy)
 {
-    return cdotu_ ( n, x, incx, y, incy);
+    return cdotu_blis_impl ( n, x, incx, y, incy);
 }
 
 scomplex CDOTU_(const f77_int* n,const scomplex*   x, const f77_int* incx,const scomplex*   y, const f77_int* incy)
 {
-    return cdotu_ ( n, x, incx, y, incy);
+    return cdotu_blis_impl ( n, x, incx, y, incy);
 }
 
 dcomplex ZDOTC(const f77_int* n, const dcomplex*   x, const f77_int* incx, const dcomplex*   y, const f77_int* incy)
 {
-    return zdotc_ ( n, x, incx, y, incy);
+    return zdotc_blis_impl ( n, x, incx, y, incy);
 }
 
 dcomplex zdotc(const f77_int* n, const dcomplex*   x, const f77_int* incx, const dcomplex*   y, const f77_int* incy)
 {
-    return zdotc_ ( n, x, incx, y, incy);
+    return zdotc_blis_impl ( n, x, incx, y, incy);
 }
 
 dcomplex ZDOTC_(const f77_int* n, const dcomplex*   x, const f77_int* incx, const dcomplex*   y, const f77_int* incy)
 {
-    return zdotc_ ( n, x, incx, y, incy);
+    return zdotc_blis_impl ( n, x, incx, y, incy);
 }
 
 dcomplex ZDOTU (const f77_int* n, const dcomplex*   x, const f77_int* incx, const dcomplex*   y, const f77_int* incy)
 {
-    return zdotu_ ( n, x, incx, y, incy);
+    return zdotu_blis_impl ( n, x, incx, y, incy);
 }
 
 dcomplex zdotu (const f77_int* n, const dcomplex*   x, const f77_int* incx, const dcomplex*   y, const f77_int* incy)
 {
-    return zdotu_ ( n, x, incx, y, incy);
+    return zdotu_blis_impl ( n, x, incx, y, incy);
 }
 
 dcomplex ZDOTU_(const f77_int* n, const dcomplex*   x, const f77_int* incx, const dcomplex*   y, const f77_int* incy)
 {
-    return zdotu_ ( n, x, incx, y, incy);
+    return zdotu_blis_impl ( n, x, incx, y, incy);
 }
 #else
 void CDOTC(scomplex* retval,const f77_int *n, const scomplex  *cx, const f77_int *incx, const scomplex  *cy, const f77_int *incy)
 {
-    cdotc_( retval, n, cx, incx, cy, incy);
+    cdotc_blis_impl( retval, n, cx, incx, cy, incy);
 }
 
 void cdotc(scomplex* retval,const f77_int *n, const scomplex  *cx, const f77_int *incx, const scomplex  *cy, const f77_int *incy)
 {
-    cdotc_( retval, n, cx, incx, cy, incy);
+    cdotc_blis_impl( retval, n, cx, incx, cy, incy);
 }
 
 void CDOTC_(scomplex* retval,const f77_int *n, const scomplex  *cx, const f77_int *incx, const scomplex  *cy, const f77_int *incy)
 {
-    cdotc_( retval, n, cx, incx, cy, incy);
+    cdotc_blis_impl( retval, n, cx, incx, cy, incy);
 }
 
 void CDOTU(scomplex* retval,const f77_int *n, const scomplex  *cx, const f77_int *incx, const scomplex  *cy, const f77_int *incy)
 {
-    cdotu_( retval, n, cx, incx, cy, incy);
+    cdotu_blis_impl( retval, n, cx, incx, cy, incy);
 }
 
 void cdotu(scomplex* retval,const f77_int *n, const scomplex  *cx, const f77_int *incx, const scomplex  *cy, const f77_int *incy)
 {
-    cdotu_( retval, n, cx, incx, cy, incy);
+    cdotu_blis_impl( retval, n, cx, incx, cy, incy);
 }
 
 void CDOTU_(scomplex* retval,const f77_int *n, const scomplex  *cx, const f77_int *incx, const scomplex  *cy, const f77_int *incy)
 {
-    cdotu_( retval, n, cx, incx, cy, incy);
+    cdotu_blis_impl( retval, n, cx, incx, cy, incy);
 }
 
 void ZDOTC(dcomplex* retval,const f77_int *n, const dcomplex *zx, const f77_int *incx, const dcomplex *zy, const f77_int *incy)
 {
-    zdotc_( retval, n, zx, incx, zy, incy);
+    zdotc_blis_impl( retval, n, zx, incx, zy, incy);
 }
 
 void zdotc(dcomplex* retval,const f77_int *n, const dcomplex *zx, const f77_int *incx, const dcomplex *zy, const f77_int *incy)
 {
-    zdotc_( retval, n, zx, incx, zy, incy);
+    zdotc_blis_impl( retval, n, zx, incx, zy, incy);
 }
 
 void ZDOTC_(dcomplex* retval,const f77_int *n, const dcomplex *zx, const f77_int *incx, const dcomplex *zy, const f77_int *incy)
 {
-    zdotc_( retval, n, zx, incx, zy, incy);
+    zdotc_blis_impl( retval, n, zx, incx, zy, incy);
 }
 
 void ZDOTU(dcomplex* retval,const f77_int *n, const dcomplex *zx, const f77_int *incx, const dcomplex *zy, const f77_int *incy)
 {
-    zdotu_( retval, n, zx, incx, zy, incy);
+    zdotu_blis_impl( retval, n, zx, incx, zy, incy);
 }
 
 void zdotu(dcomplex* retval,const f77_int *n, const dcomplex *zx, const f77_int *incx, const dcomplex *zy, const f77_int *incy)
 {
-    zdotu_( retval, n, zx, incx, zy, incy);
+    zdotu_blis_impl( retval, n, zx, incx, zy, incy);
 }
 
 void ZDOTU_(dcomplex* retval,const f77_int *n, const dcomplex *zx, const f77_int *incx, const dcomplex *zy, const f77_int *incy)
 {
-    zdotu_( retval, n, zx, incx, zy, incy);
+    zdotu_blis_impl( retval, n, zx, incx, zy, incy);
 }
 #endif
 
 void CGBMV(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    cgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    cgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void cgbmv(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    cgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    cgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CGBMV_(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    cgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    cgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CGEMM(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void cgemm(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CGEMM_(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CGEMV(const char   *trans,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    cgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    cgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void cgemv(const char   *trans,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    cgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    cgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CGEMV_(const char   *trans,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    cgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    cgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CGERC(const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cgerc_( m, n, alpha, x, incx, y, incy, a, lda);
+    cgerc_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void cgerc(const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cgerc_( m, n, alpha, x, incx, y, incy, a, lda);
+    cgerc_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void CGERC_(const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cgerc_( m, n, alpha, x, incx, y, incy, a, lda);
+    cgerc_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void CGERU(const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cgeru_( m, n, alpha, x, incx, y, incy, a, lda);
+    cgeru_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void cgeru(const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cgeru_( m, n, alpha, x, incx, y, incy, a, lda);
+    cgeru_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void CGERU_(const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cgeru_( m, n, alpha, x, incx, y, incy, a, lda);
+    cgeru_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void CHBMV(const char   *uplo,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    chbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void chbmv(const char   *uplo,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    chbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CHBMV_(const char   *uplo,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    chbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CHEMM(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    chemm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    chemm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void chemm(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    chemm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    chemm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CHEMM_(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    chemm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    chemm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CHEMV(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chemv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    chemv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void chemv(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chemv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    chemv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CHEMV_(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chemv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    chemv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void CHER(const char   *uplo,const f77_int *n,const float  *alpha,const scomplex  *x,const f77_int *incx,scomplex  *a,const f77_int *lda)
 {
-    cher_( uplo, n, alpha, x, incx, a, lda);
+    cher_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void cher(const char   *uplo,const f77_int *n,const float  *alpha,const scomplex  *x,const f77_int *incx,scomplex  *a,const f77_int *lda)
 {
-    cher_( uplo, n, alpha, x, incx, a, lda);
+    cher_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void CHER_(const char   *uplo,const f77_int *n,const float  *alpha,const scomplex  *x,const f77_int *incx,scomplex  *a,const f77_int *lda)
 {
-    cher_( uplo, n, alpha, x, incx, a, lda);
+    cher_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void CHER2(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cher2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    cher2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void cher2(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cher2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    cher2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void CHER2_(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *a,const f77_int *lda)
 {
-    cher2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    cher2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void CHER2K(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const float  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cher2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cher2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void cher2k(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const float  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cher2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cher2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CHER2K_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const float  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cher2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cher2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CHERK(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const scomplex  *a,const f77_int *lda,const float  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cherk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    cherk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void cherk(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const scomplex  *a,const f77_int *lda,const float  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cherk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    cherk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void CHERK_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const scomplex  *a,const f77_int *lda,const float  *beta,scomplex  *c,const f77_int *ldc)
 {
-    cherk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    cherk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void CHPMV(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *ap,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chpmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    chpmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void chpmv(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *ap,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chpmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    chpmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void CHPMV_(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *ap,const scomplex  *x,const f77_int *incx,const scomplex  *beta,scomplex  *y,const f77_int *incy)
 {
-    chpmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    chpmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void CHPR(const char   *uplo,const f77_int *n,const float  *alpha,const scomplex  *x,const f77_int *incx,scomplex  *ap)
 {
-    chpr_( uplo, n, alpha, x, incx, ap);
+    chpr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void chpr(const char   *uplo,const f77_int *n,const float  *alpha,const scomplex  *x,const f77_int *incx,scomplex  *ap)
 {
-    chpr_( uplo, n, alpha, x, incx, ap);
+    chpr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void CHPR_(const char   *uplo,const f77_int *n,const float  *alpha,const scomplex  *x,const f77_int *incx,scomplex  *ap)
 {
-    chpr_( uplo, n, alpha, x, incx, ap);
+    chpr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void CHPR2(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *ap)
 {
-    chpr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    chpr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void chpr2(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *ap)
 {
-    chpr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    chpr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void CHPR2_(const char   *uplo,const f77_int *n,const scomplex  *alpha,const scomplex  *x,const f77_int *incx,const scomplex  *y,const f77_int *incy,scomplex  *ap)
 {
-    chpr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    chpr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void CROTG(scomplex  *ca, bla_scomplex  *cb, bla_real  *c,scomplex  *s)
 {
-    crotg_( ca, cb, c, s);
+    crotg_blis_impl( ca, cb, c, s);
 }
 
 void crotg(scomplex  *ca, bla_scomplex  *cb, bla_real  *c,scomplex  *s)
 {
-    crotg_( ca, cb, c, s);
+    crotg_blis_impl( ca, cb, c, s);
 }
 
 void CROTG_(scomplex  *ca, bla_scomplex  *cb, bla_real  *c,scomplex  *s)
 {
-    crotg_( ca, cb, c, s);
+    crotg_blis_impl( ca, cb, c, s);
 }
 
 void CSCAL(const f77_int *n,const scomplex  *ca,scomplex  *cx,const f77_int *incx)
 {
-    cscal_( n, ca, cx, incx);
+    cscal_blis_impl( n, ca, cx, incx);
 }
 
 void cscal(const f77_int *n,const scomplex  *ca,scomplex  *cx,const f77_int *incx)
 {
-    cscal_( n, ca, cx, incx);
+    cscal_blis_impl( n, ca, cx, incx);
 }
 
 void CSCAL_(const f77_int *n,const scomplex  *ca,scomplex  *cx,const f77_int *incx)
 {
-    cscal_( n, ca, cx, incx);
+    cscal_blis_impl( n, ca, cx, incx);
 }
 
 void CSROT(const f77_int *n,scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy,const float  *c,const float  *s)
 {
-    csrot_( n, cx, incx, cy, incy, c, s);
+    csrot_blis_impl( n, cx, incx, cy, incy, c, s);
 }
 
 void csrot(const f77_int *n,scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy,const float  *c,const float  *s)
 {
-    csrot_( n, cx, incx, cy, incy, c, s);
+    csrot_blis_impl( n, cx, incx, cy, incy, c, s);
 }
 
 void CSROT_(const f77_int *n,scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy,const float  *c,const float  *s)
 {
-    csrot_( n, cx, incx, cy, incy, c, s);
+    csrot_blis_impl( n, cx, incx, cy, incy, c, s);
 }
 
 void CSSCAL(const f77_int *n,const float  *sa,scomplex  *cx,const f77_int *incx)
 {
-    csscal_( n, sa, cx, incx);
+    csscal_blis_impl( n, sa, cx, incx);
 }
 
 void csscal(const f77_int *n,const float  *sa,scomplex  *cx,const f77_int *incx)
 {
-    csscal_( n, sa, cx, incx);
+    csscal_blis_impl( n, sa, cx, incx);
 }
 
 void CSSCAL_(const f77_int *n,const float  *sa,scomplex  *cx,const f77_int *incx)
 {
-    csscal_( n, sa, cx, incx);
+    csscal_blis_impl( n, sa, cx, incx);
 }
 
 void CSWAP(const f77_int *n,scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    cswap_( n, cx, incx, cy, incy);
+    cswap_blis_impl( n, cx, incx, cy, incy);
 }
 
 void cswap(const f77_int *n,scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    cswap_( n, cx, incx, cy, incy);
+    cswap_blis_impl( n, cx, incx, cy, incy);
 }
 
 void CSWAP_(const f77_int *n,scomplex  *cx,const f77_int *incx,scomplex  *cy,const f77_int *incy)
 {
-    cswap_( n, cx, incx, cy, incy);
+    cswap_blis_impl( n, cx, incx, cy, incy);
 }
 
 void CSYMM(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    csymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void csymm(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    csymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CSYMM_(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    csymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CSYR2K(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    csyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void csyr2k(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    csyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CSYR2K_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *b,const f77_int *ldb,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    csyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CSYRK(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    csyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void csyrk(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    csyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void CSYRK_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,const scomplex  *beta,scomplex  *c,const f77_int *ldc)
 {
-    csyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    csyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void CTBMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ctbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ctbmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ctbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void CTBMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ctbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void CTBSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ctbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ctbsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ctbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void CTBSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ctbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void CTPMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *ap,scomplex  *x,const f77_int *incx)
 {
-    ctpmv_( uplo, trans, diag, n, ap, x, incx);
+    ctpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ctpmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *ap,scomplex  *x,const f77_int *incx)
 {
-    ctpmv_( uplo, trans, diag, n, ap, x, incx);
+    ctpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void CTPMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *ap,scomplex  *x,const f77_int *incx)
 {
-    ctpmv_( uplo, trans, diag, n, ap, x, incx);
+    ctpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void CTPSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *ap,scomplex  *x,const f77_int *incx)
 {
-    ctpsv_( uplo, trans, diag, n, ap, x, incx);
+    ctpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ctpsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *ap,scomplex  *x,const f77_int *incx)
 {
-    ctpsv_( uplo, trans, diag, n, ap, x, incx);
+    ctpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void CTPSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *ap,scomplex  *x,const f77_int *incx)
 {
-    ctpsv_( uplo, trans, diag, n, ap, x, incx);
+    ctpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void CTRMM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,scomplex  *b,const f77_int *ldb)
 {
-    ctrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ctrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ctrmm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,scomplex  *b,const f77_int *ldb)
 {
-    ctrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ctrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void CTRMM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,scomplex  *b,const f77_int *ldb)
 {
-    ctrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ctrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void CTRMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    ctrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ctrmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    ctrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void CTRMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *a,const f77_int *lda,scomplex  *x,const f77_int *incx)
 {
-    ctrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    ctrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void CTRSM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,scomplex  *b,const f77_int *ldb)
 {
-    ctrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ctrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ctrsm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,scomplex  *b,const f77_int *ldb)
 {
-    ctrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ctrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void CTRSM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const scomplex  *alpha,const scomplex  *a,const f77_int *lda,scomplex  *b,const f77_int *ldb)
 {
-    ctrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ctrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void CTRSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex  *a,const f77_int *lda,scomplex *x,const f77_int *incx)
 {
-    ctrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    ctrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ctrsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex *a,const f77_int *lda,scomplex *x,const f77_int *incx)
 {
-    ctrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    ctrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void CTRSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const scomplex *a,const f77_int *lda,scomplex *x,const f77_int *incx)
 {
-    ctrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    ctrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 double DASUM(const f77_int *n,const double *dx,const f77_int *incx)
 {
-    return dasum_( n, dx, incx);
+    return dasum_blis_impl( n, dx, incx);
 }
 
 double dasum(const f77_int *n,const double *dx,const f77_int *incx)
 {
-    return dasum_( n, dx, incx);
+    return dasum_blis_impl( n, dx, incx);
 }
 
 double DASUM_(const f77_int *n,const double *dx,const f77_int *incx)
 {
-    return dasum_( n, dx, incx);
+    return dasum_blis_impl( n, dx, incx);
 }
 
 void DAXPY(const f77_int *n,const double *da,const double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    daxpy_( n, da, dx, incx, dy, incy);
+    daxpy_blis_impl( n, da, dx, incx, dy, incy);
 }
 
 void daxpy(const f77_int *n,const double *da,const double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    daxpy_( n, da, dx, incx, dy, incy);
+    daxpy_blis_impl( n, da, dx, incx, dy, incy);
 }
 
 void DAXPY_(const f77_int *n,const double *da,const double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    daxpy_( n, da, dx, incx, dy, incy);
+    daxpy_blis_impl( n, da, dx, incx, dy, incy);
 }
 
 double DCABS1(bla_dcomplex *z)
 {
-    return dcabs1_( z);
+    return dcabs1_blis_impl( z);
 }
 
 double dcabs1(bla_dcomplex *z)
 {
-    return dcabs1_( z);
+    return dcabs1_blis_impl( z);
 }
 
 double DCABS1_(bla_dcomplex *z)
 {
-    return dcabs1_( z);
+    return dcabs1_blis_impl( z);
 }
 
 void DCOPY(const f77_int *n,const double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    dcopy_( n, dx, incx, dy, incy);
+    dcopy_blis_impl( n, dx, incx, dy, incy);
 }
 
 void dcopy(const f77_int *n,const double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    dcopy_( n, dx, incx, dy, incy);
+    dcopy_blis_impl( n, dx, incx, dy, incy);
 }
 
 void DCOPY_(const f77_int *n,const double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    dcopy_( n, dx, incx, dy, incy);
+    dcopy_blis_impl( n, dx, incx, dy, incy);
 }
 
 double DDOT(const f77_int *n,const double *dx,const f77_int *incx,const double *dy,const f77_int *incy)
 {
-    return ddot_( n, dx, incx, dy, incy);
+    return ddot_blis_impl( n, dx, incx, dy, incy);
 }
 
 double ddot(const f77_int *n,const double *dx,const f77_int *incx,const double *dy,const f77_int *incy)
 {
-    return ddot_( n, dx, incx, dy, incy);
+    return ddot_blis_impl( n, dx, incx, dy, incy);
 }
 
 double DDOT_(const f77_int *n,const double *dx,const f77_int *incx,const double *dy,const f77_int *incy)
 {
-    return ddot_( n, dx, incx, dy, incy);
+    return ddot_blis_impl( n, dx, incx, dy, incy);
 }
 
 void DGBMV(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    dgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void dgbmv(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    dgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DGBMV_(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    dgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DGEMM(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void dgemm(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void DGEMM_(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
+
+void DZGEMM( const f77_char *transa, const f77_char *transb, const f77_int *m, const f77_int *n, const f77_int *k, const dcomplex *alpha, const double *a, const f77_int *lda, const dcomplex *b, const f77_int *ldb, const dcomplex *beta, dcomplex *c, const f77_int *ldc )
+{
+    dzgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
+
+void dzgemm( const f77_char *transa, const f77_char *transb, const f77_int *m, const f77_int *n, const f77_int *k, const dcomplex *alpha, const double *a, const f77_int *lda, const dcomplex *b, const f77_int *ldb, const dcomplex *beta, dcomplex *c, const f77_int *ldc )
+{
+    dzgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
+void DZGEMM_( const f77_char *transa, const f77_char *transb, const f77_int *m, const f77_int *n, const f77_int *k, const dcomplex *alpha, const double *a, const f77_int *lda, const dcomplex *b, const f77_int *ldb, const dcomplex *beta, dcomplex *c, const f77_int *ldc )
+{
+    dzgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+}
 void DGEMV(const char   *trans,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    dgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void dgemv(const char   *trans,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    dgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DGEMV_(const char   *trans,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    dgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DGER(const f77_int *m,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *a,const f77_int *lda)
 {
-    dger_( m, n, alpha, x, incx, y, incy, a, lda);
+    dger_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void dger(const f77_int *m,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *a,const f77_int *lda)
 {
-    dger_( m, n, alpha, x, incx, y, incy, a, lda);
+    dger_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void DGER_(const f77_int *m,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *a,const f77_int *lda)
 {
-    dger_( m, n, alpha, x, incx, y, incy, a, lda);
+    dger_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 double DNRM2(const f77_int *n,const double *x,const f77_int *incx)
 {
-    return dnrm2_( n, x, incx);
+    return dnrm2_blis_impl( n, x, incx);
 }
 
 double dnrm2(const f77_int *n,const double *x,const f77_int *incx)
 {
-    return dnrm2_( n, x, incx);
+    return dnrm2_blis_impl( n, x, incx);
 }
 
 double DNRM2_(const f77_int *n,const double *x,const f77_int *incx)
 {
-    return dnrm2_( n, x, incx);
+    return dnrm2_blis_impl( n, x, incx);
 }
 
 void DROT(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy,const double *c,const double *s)
 {
-    drot_( n, dx, incx, dy, incy, c, s);
+    drot_blis_impl( n, dx, incx, dy, incy, c, s);
 }
 
 void drot(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy,const double *c,const double *s)
 {
-    drot_( n, dx, incx, dy, incy, c, s);
+    drot_blis_impl( n, dx, incx, dy, incy, c, s);
 }
 
 void DROT_(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy,const double *c,const double *s)
 {
-    drot_( n, dx, incx, dy, incy, c, s);
+    drot_blis_impl( n, dx, incx, dy, incy, c, s);
 }
 
 void DROTG(double *da,double *db,double *c,double *s)
 {
-    drotg_( da, db, c, s);
+    drotg_blis_impl( da, db, c, s);
 }
 
 void drotg(double *da,double *db,double *c,double *s)
 {
-    drotg_( da, db, c, s);
+    drotg_blis_impl( da, db, c, s);
 }
 
 void DROTG_(double *da,double *db,double *c,double *s)
 {
-    drotg_( da, db, c, s);
+    drotg_blis_impl( da, db, c, s);
 }
 
 void DROTM(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy,const double *dparam)
 {
-    drotm_( n, dx, incx, dy, incy, dparam);
+    drotm_blis_impl( n, dx, incx, dy, incy, dparam);
 }
 
 void drotm(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy,const double *dparam)
 {
-    drotm_( n, dx, incx, dy, incy, dparam);
+    drotm_blis_impl( n, dx, incx, dy, incy, dparam);
 }
 
 void DROTM_(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy,const double *dparam)
 {
-    drotm_( n, dx, incx, dy, incy, dparam);
+    drotm_blis_impl( n, dx, incx, dy, incy, dparam);
 }
 
 void DROTMG(double *dd1,double *dd2,double *dx1,const double *dy1,double *dparam)
 {
-    drotmg_( dd1, dd2, dx1, dy1, dparam);
+    drotmg_blis_impl( dd1, dd2, dx1, dy1, dparam);
 }
 
 void drotmg(double *dd1,double *dd2,double *dx1,const double *dy1,double *dparam)
 {
-    drotmg_( dd1, dd2, dx1, dy1, dparam);
+    drotmg_blis_impl( dd1, dd2, dx1, dy1, dparam);
 }
 
 void DROTMG_(double *dd1,double *dd2,double *dx1,const double *dy1,double *dparam)
 {
-    drotmg_( dd1, dd2, dx1, dy1, dparam);
+    drotmg_blis_impl( dd1, dd2, dx1, dy1, dparam);
 }
 
 void DSBMV(const char   *uplo,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dsbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    dsbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void dsbmv(const char   *uplo,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dsbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    dsbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DSBMV_(const char   *uplo,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dsbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    dsbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DSCAL(const f77_int *n,const double *da,double *dx,const f77_int *incx)
 {
-    dscal_( n, da, dx, incx);
+    dscal_blis_impl( n, da, dx, incx);
 }
 
 void dscal(const f77_int *n,const double *da,double *dx,const f77_int *incx)
 {
-    dscal_( n, da, dx, incx);
+    dscal_blis_impl( n, da, dx, incx);
 }
 
 void DSCAL_(const f77_int *n,const double *da,double *dx,const f77_int *incx)
 {
-    dscal_( n, da, dx, incx);
+    dscal_blis_impl( n, da, dx, incx);
 }
 
 double DSDOT(const f77_int *n,const float  *sx,const f77_int *incx,const float  *sy,const f77_int *incy)
 {
-    return dsdot_( n, sx, incx, sy, incy);
+    return dsdot_blis_impl( n, sx, incx, sy, incy);
 }
 
 double dsdot(const f77_int *n,const float  *sx,const f77_int *incx,const float  *sy,const f77_int *incy)
 {
-    return dsdot_( n, sx, incx, sy, incy);
+    return dsdot_blis_impl( n, sx, incx, sy, incy);
 }
 
 double DSDOT_(const f77_int *n,const float  *sx,const f77_int *incx,const float  *sy,const f77_int *incy)
 {
-    return dsdot_( n, sx, incx, sy, incy);
+    return dsdot_blis_impl( n, sx, incx, sy, incy);
 }
 
 void DSPMV(const char   *uplo,const f77_int *n,const double *alpha,const double *ap,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dspmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    dspmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void dspmv(const char   *uplo,const f77_int *n,const double *alpha,const double *ap,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dspmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    dspmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void DSPMV_(const char   *uplo,const f77_int *n,const double *alpha,const double *ap,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dspmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    dspmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void DSPR(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,double *ap)
 {
-    dspr_( uplo, n, alpha, x, incx, ap);
+    dspr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void dspr(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,double *ap)
 {
-    dspr_( uplo, n, alpha, x, incx, ap);
+    dspr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void DSPR_(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,double *ap)
 {
-    dspr_( uplo, n, alpha, x, incx, ap);
+    dspr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void DSPR2(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *ap)
 {
-    dspr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    dspr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void dspr2(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *ap)
 {
-    dspr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    dspr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void DSPR2_(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *ap)
 {
-    dspr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    dspr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void DSWAP(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    dswap_( n, dx, incx, dy, incy);
+    dswap_blis_impl( n, dx, incx, dy, incy);
 }
 
 void dswap(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    dswap_( n, dx, incx, dy, incy);
+    dswap_blis_impl( n, dx, incx, dy, incy);
 }
 
 void DSWAP_(const f77_int *n,double *dx,const f77_int *incx,double *dy,const f77_int *incy)
 {
-    dswap_( n, dx, incx, dy, incy);
+    dswap_blis_impl( n, dx, incx, dy, incy);
 }
 
 void DSYMM(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dsymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    dsymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void dsymm(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dsymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    dsymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void DSYMM_(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dsymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    dsymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void DSYMV(const char   *uplo,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dsymv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    dsymv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void dsymv(const char   *uplo,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dsymv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    dsymv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DSYMV_(const char   *uplo,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,const double *x,const f77_int *incx,const double *beta,double *y,const f77_int *incy)
 {
-    dsymv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    dsymv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void DSYR(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,double *a,const f77_int *lda)
 {
-    dsyr_( uplo, n, alpha, x, incx, a, lda);
+    dsyr_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void dsyr(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,double *a,const f77_int *lda)
 {
-    dsyr_( uplo, n, alpha, x, incx, a, lda);
+    dsyr_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void DSYR_(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,double *a,const f77_int *lda)
 {
-    dsyr_( uplo, n, alpha, x, incx, a, lda);
+    dsyr_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void DSYR2(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *a,const f77_int *lda)
 {
-    dsyr2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    dsyr2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void dsyr2(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *a,const f77_int *lda)
 {
-    dsyr2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    dsyr2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void DSYR2_(const char   *uplo,const f77_int *n,const double *alpha,const double *x,const f77_int *incx,const double *y,const f77_int *incy,double *a,const f77_int *lda)
 {
-    dsyr2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    dsyr2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void DSYR2K(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dsyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dsyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void dsyr2k(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dsyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dsyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void DSYR2K_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *b,const f77_int *ldb,const double *beta,double *c,const f77_int *ldc)
 {
-    dsyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dsyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void DSYRK(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *beta,double *c,const f77_int *ldc)
 {
-    dsyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    dsyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void dsyrk(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *beta,double *c,const f77_int *ldc)
 {
-    dsyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    dsyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void DSYRK_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const double *a,const f77_int *lda,const double *beta,double *c,const f77_int *ldc)
 {
-    dsyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    dsyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void DTBMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    dtbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void dtbmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    dtbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void DTBMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    dtbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void DTBSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    dtbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void dtbsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    dtbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void DTBSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    dtbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void DTPMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *ap,double *x,const f77_int *incx)
 {
-    dtpmv_( uplo, trans, diag, n, ap, x, incx);
+    dtpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void dtpmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *ap,double *x,const f77_int *incx)
 {
-    dtpmv_( uplo, trans, diag, n, ap, x, incx);
+    dtpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void DTPMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *ap,double *x,const f77_int *incx)
 {
-    dtpmv_( uplo, trans, diag, n, ap, x, incx);
+    dtpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void DTPSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *ap,double *x,const f77_int *incx)
 {
-    dtpsv_( uplo, trans, diag, n, ap, x, incx);
+    dtpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void dtpsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *ap,double *x,const f77_int *incx)
 {
-    dtpsv_( uplo, trans, diag, n, ap, x, incx);
+    dtpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void DTPSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *ap,double *x,const f77_int *incx)
 {
-    dtpsv_( uplo, trans, diag, n, ap, x, incx);
+    dtpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void DTRMM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,double *b,const f77_int *ldb)
 {
-    dtrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    dtrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void dtrmm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,double *b,const f77_int *ldb)
 {
-    dtrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    dtrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void DTRMM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,double *b,const f77_int *ldb)
 {
-    dtrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    dtrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void DTRMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    dtrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void dtrmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    dtrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void DTRMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    dtrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void DTRSM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,double *b,const f77_int *ldb)
 {
-    dtrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    dtrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void dtrsm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,double *b,const f77_int *ldb)
 {
-    dtrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    dtrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void DTRSM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const double *alpha,const double *a,const f77_int *lda,double *b,const f77_int *ldb)
 {
-    dtrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    dtrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void DTRSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    dtrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void dtrsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    dtrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void DTRSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const double *a,const f77_int *lda,double *x,const f77_int *incx)
 {
-    dtrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    dtrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 double DZASUM(const f77_int *n,const dcomplex *zx,const f77_int *incx)
 {
-    return dzasum_( n, zx, incx);
+    return dzasum_blis_impl( n, zx, incx);
 }
 
 double dzasum(const f77_int *n,const dcomplex *zx,const f77_int *incx)
 {
-    return dzasum_( n, zx, incx);
+    return dzasum_blis_impl( n, zx, incx);
 }
 
 double DZASUM_(const f77_int *n,const dcomplex *zx,const f77_int *incx)
 {
-    return dzasum_( n, zx, incx);
+    return dzasum_blis_impl( n, zx, incx);
 }
 
 double DZNRM2(const f77_int *n,const dcomplex *x,const f77_int *incx)
 {
-    return dznrm2_( n, x, incx);
+    return dznrm2_blis_impl( n, x, incx);
 }
 
 double dznrm2(const f77_int *n,const dcomplex *x,const f77_int *incx)
 {
-    return dznrm2_( n, x, incx);
+    return dznrm2_blis_impl( n, x, incx);
 }
 
 double DZNRM2_(const f77_int *n,const dcomplex *x,const f77_int *incx)
 {
-    return dznrm2_( n, x, incx);
+    return dznrm2_blis_impl( n, x, incx);
 }
 
 f77_int ICAMAX(const f77_int *n,const scomplex  *cx,const f77_int *incx)
 {
-    return icamax_( n, cx, incx);
+    return icamax_blis_impl( n, cx, incx);
 }
 
 f77_int icamax(const f77_int *n,const scomplex  *cx,const f77_int *incx)
 {
-    return icamax_( n, cx, incx);
+    return icamax_blis_impl( n, cx, incx);
 }
 
 f77_int ICAMAX_(const f77_int *n,const scomplex  *cx,const f77_int *incx)
 {
-    return icamax_( n, cx, incx);
+    return icamax_blis_impl( n, cx, incx);
 }
 
 f77_int IDAMAX(const f77_int *n,const double *dx,const f77_int *incx)
 {
-    return idamax_( n, dx, incx);
+    return idamax_blis_impl( n, dx, incx);
 }
 
 f77_int idamax(const f77_int *n,const double *dx,const f77_int *incx)
 {
-    return idamax_( n, dx, incx);
+    return idamax_blis_impl( n, dx, incx);
 }
 
 f77_int IDAMAX_(const f77_int *n,const double *dx,const f77_int *incx)
 {
-    return idamax_( n, dx, incx);
+    return idamax_blis_impl( n, dx, incx);
 }
 
 f77_int ISAMAX(const f77_int *n,const float  *sx,const f77_int *incx)
 {
-    return isamax_( n, sx, incx);
+    return isamax_blis_impl( n, sx, incx);
 }
 
 f77_int isamax(const f77_int *n,const float  *sx,const f77_int *incx)
 {
-    return isamax_( n, sx, incx);
+    return isamax_blis_impl( n, sx, incx);
 }
 
 f77_int ISAMAX_(const f77_int *n,const float  *sx,const f77_int *incx)
 {
-    return isamax_( n, sx, incx);
+    return isamax_blis_impl( n, sx, incx);
 }
 
 f77_int IZAMAX(const f77_int *n,const dcomplex *zx,const f77_int *incx)
 {
-    return izamax_( n, zx, incx);
+    return izamax_blis_impl( n, zx, incx);
 }
 
 f77_int izamax(const f77_int *n,const dcomplex *zx,const f77_int *incx)
 {
-    return izamax_( n, zx, incx);
+    return izamax_blis_impl( n, zx, incx);
 }
 
 f77_int IZAMAX_(const f77_int *n,const dcomplex *zx,const f77_int *incx)
 {
-    return izamax_( n, zx, incx);
+    return izamax_blis_impl( n, zx, incx);
 }
 
 f77_int LSAME(const char   *ca,const char   *cb,const f77_int a,const f77_int b)
 {
-    return lsame_( ca, cb, a, b);
+    return lsame_blis_impl( ca, cb, a, b);
 }
 
 f77_int LSAME_(const char   *ca,const char   *cb,const f77_int a,const f77_int b)
 {
-    return lsame_( ca, cb, a, b);
+    return lsame_blis_impl( ca, cb, a, b);
 }
 
 f77_int lsame(const char   *ca,const char   *cb,const f77_int a,const f77_int b)
 {
-    return lsame_( ca, cb, a, b);
+    return lsame_blis_impl( ca, cb, a, b);
 }
 
 float SASUM(const f77_int *n,const float  *sx, const f77_int *incx)
 {
-    return sasum_( n, sx, incx);
+    return sasum_blis_impl( n, sx, incx);
 }
 
 float sasum(const f77_int *n,const float  *sx, const f77_int *incx)
 {
-    return sasum_( n, sx, incx);
+    return sasum_blis_impl( n, sx, incx);
 }
 
 float SASUM_(const f77_int *n,const float  *sx, const f77_int *incx)
 {
-    return sasum_( n, sx, incx);
+    return sasum_blis_impl( n, sx, incx);
 }
 
 void SAXPY(const f77_int *n,const float  *sa,const float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    saxpy_( n, sa, sx, incx, sy, incy);
+    saxpy_blis_impl( n, sa, sx, incx, sy, incy);
 }
 
 void saxpy(const f77_int *n,const float  *sa,const float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    saxpy_( n, sa, sx, incx, sy, incy);
+    saxpy_blis_impl( n, sa, sx, incx, sy, incy);
 }
 
 void SAXPY_(const f77_int *n,const float  *sa,const float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    saxpy_( n, sa, sx, incx, sy, incy);
+    saxpy_blis_impl( n, sa, sx, incx, sy, incy);
 }
 
 
 float SCASUM(const f77_int *n,const scomplex  *cx, const f77_int *incx)
 {
-    return scasum_( n, cx, incx);
+    return scasum_blis_impl( n, cx, incx);
 }
 
 float scasum(const f77_int *n,const scomplex  *cx, const f77_int *incx)
 {
-    return scasum_( n, cx, incx);
+    return scasum_blis_impl( n, cx, incx);
 }
 
 float SCASUM_(const f77_int *n,const scomplex  *cx, const f77_int *incx)
 {
-    return scasum_( n, cx, incx);
+    return scasum_blis_impl( n, cx, incx);
 }
 
 
 
 float SCNRM2(const f77_int *n,const scomplex  *x, const f77_int *incx)
 {
-    return scnrm2_( n, x, incx);
+    return scnrm2_blis_impl( n, x, incx);
 }
 
 float scnrm2(const f77_int *n,const scomplex  *x, const f77_int *incx)
 {
-    return scnrm2_( n, x, incx);
+    return scnrm2_blis_impl( n, x, incx);
 }
 
 float SCNRM2_(const f77_int *n,const scomplex  *x, const f77_int *incx)
 {
-    return scnrm2_( n, x, incx);
+    return scnrm2_blis_impl( n, x, incx);
 }
 
 
 void SCOPY(const f77_int *n,const float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    scopy_( n, sx, incx, sy, incy);
+    scopy_blis_impl( n, sx, incx, sy, incy);
 }
 
 void scopy(const f77_int *n,const float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    scopy_( n, sx, incx, sy, incy);
+    scopy_blis_impl( n, sx, incx, sy, incy);
 }
 
 void SCOPY_(const f77_int *n,const float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    scopy_( n, sx, incx, sy, incy);
+    scopy_blis_impl( n, sx, incx, sy, incy);
 }
 
 
 float SDOT(const f77_int *n,const float  *sx, const f77_int *incx, const float  *sy, const f77_int *incy)
 {
-    return sdot_( n, sx, incx, sy, incy);
+    return sdot_blis_impl( n, sx, incx, sy, incy);
 }
 
 float sdot(const f77_int *n,const float  *sx, const f77_int *incx, const float  *sy, const f77_int *incy)
 {
-    return sdot_( n, sx, incx, sy, incy);
+    return sdot_blis_impl( n, sx, incx, sy, incy);
 }
 
 float SDOT_(const f77_int *n,const float  *sx, const f77_int *incx, const float  *sy, const f77_int *incy)
 {
-    return sdot_( n, sx, incx, sy, incy);
+    return sdot_blis_impl( n, sx, incx, sy, incy);
 }
 
 
 float SDSDOT(const f77_int *n,const float  *sb, const float  *sx, const f77_int *incx, const float  *sy, const f77_int *incy)
 {
-    return sdsdot_( n, sb, sx, incx, sy, incy);
+    return sdsdot_blis_impl( n, sb, sx, incx, sy, incy);
 }
 
 float sdsdot(const f77_int *n,const float  *sb, const float  *sx, const f77_int *incx, const float  *sy, const f77_int *incy)
 {
-    return sdsdot_( n, sb, sx, incx, sy, incy);
+    return sdsdot_blis_impl( n, sb, sx, incx, sy, incy);
 }
 
 float SDSDOT_(const f77_int *n,const float  *sb, const float  *sx, const f77_int *incx, const float  *sy, const f77_int *incy)
 {
-    return sdsdot_( n, sb, sx, incx, sy, incy);
+    return sdsdot_blis_impl( n, sb, sx, incx, sy, incy);
 }
 
 
 void SGBMV(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    sgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void sgbmv(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    sgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SGBMV_(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    sgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SGEMM(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    sgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    sgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void sgemm(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    sgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    sgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SGEMM_(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    sgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    sgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SGEMV(const char   *trans,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    sgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void sgemv(const char   *trans,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    sgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SGEMV_(const char   *trans,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    sgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SGER(const f77_int *m,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *a,const f77_int *lda)
 {
-    sger_( m, n, alpha, x, incx, y, incy, a, lda);
+    sger_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void sger(const f77_int *m,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *a,const f77_int *lda)
 {
-    sger_( m, n, alpha, x, incx, y, incy, a, lda);
+    sger_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void SGER_(const f77_int *m,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *a,const f77_int *lda)
 {
-    sger_( m, n, alpha, x, incx, y, incy, a, lda);
+    sger_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 
 float SNRM2(const f77_int *n,const float  *x, const f77_int *incx)
 {
-    return snrm2_( n, x, incx);
+    return snrm2_blis_impl( n, x, incx);
 }
 
 float snrm2(const f77_int *n,const float  *x, const f77_int *incx)
 {
-    return snrm2_( n, x, incx);
+    return snrm2_blis_impl( n, x, incx);
 }
 
 float SNRM2_(const f77_int *n,const float  *x, const f77_int *incx)
 {
-    return snrm2_( n, x, incx);
+    return snrm2_blis_impl( n, x, incx);
 }
 
 
 void SROT(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy,const float  *c,const float  *s)
 {
-    srot_( n, sx, incx, sy, incy, c, s);
+    srot_blis_impl( n, sx, incx, sy, incy, c, s);
 }
 
 void srot(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy,const float  *c,const float  *s)
 {
-    srot_( n, sx, incx, sy, incy, c, s);
+    srot_blis_impl( n, sx, incx, sy, incy, c, s);
 }
 
 void SROT_(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy,const float  *c,const float  *s)
 {
-    srot_( n, sx, incx, sy, incy, c, s);
+    srot_blis_impl( n, sx, incx, sy, incy, c, s);
 }
 
 void SROTG(float  *sa,float  *sb,float  *c,float  *s)
 {
-    srotg_( sa, sb, c, s);
+    srotg_blis_impl( sa, sb, c, s);
 }
 
 void srotg(float  *sa,float  *sb,float  *c,float  *s)
 {
-    srotg_( sa, sb, c, s);
+    srotg_blis_impl( sa, sb, c, s);
 }
 
 void SROTG_(float  *sa,float  *sb,float  *c,float  *s)
 {
-    srotg_( sa, sb, c, s);
+    srotg_blis_impl( sa, sb, c, s);
 }
 
 void SROTM(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy,const float  *sparam)
 {
-    srotm_( n, sx, incx, sy, incy, sparam);
+    srotm_blis_impl( n, sx, incx, sy, incy, sparam);
 }
 
 void srotm(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy,const float  *sparam)
 {
-    srotm_( n, sx, incx, sy, incy, sparam);
+    srotm_blis_impl( n, sx, incx, sy, incy, sparam);
 }
 
 void SROTM_(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy,const float  *sparam)
 {
-    srotm_( n, sx, incx, sy, incy, sparam);
+    srotm_blis_impl( n, sx, incx, sy, incy, sparam);
 }
 
 void SROTMG(float  *sd1,float  *sd2,float  *sx1,const float  *sy1,float  *sparam)
 {
-    srotmg_( sd1, sd2, sx1, sy1, sparam);
+    srotmg_blis_impl( sd1, sd2, sx1, sy1, sparam);
 }
 
 void srotmg(float  *sd1,float  *sd2,float  *sx1,const float  *sy1,float  *sparam)
 {
-    srotmg_( sd1, sd2, sx1, sy1, sparam);
+    srotmg_blis_impl( sd1, sd2, sx1, sy1, sparam);
 }
 
 void SROTMG_(float  *sd1,float  *sd2,float  *sx1,const float  *sy1,float  *sparam)
 {
-    srotmg_( sd1, sd2, sx1, sy1, sparam);
+    srotmg_blis_impl( sd1, sd2, sx1, sy1, sparam);
 }
 
 void SSBMV(const char   *uplo,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    ssbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    ssbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ssbmv(const char   *uplo,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    ssbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    ssbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SSBMV_(const char   *uplo,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    ssbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    ssbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SSCAL(const f77_int *n,const float  *sa,float  *sx,const f77_int *incx)
 {
-    sscal_( n, sa, sx, incx);
+    sscal_blis_impl( n, sa, sx, incx);
 }
 
 void sscal(const f77_int *n,const float  *sa,float  *sx,const f77_int *incx)
 {
-    sscal_( n, sa, sx, incx);
+    sscal_blis_impl( n, sa, sx, incx);
 }
 
 void SSCAL_(const f77_int *n,const float  *sa,float  *sx,const f77_int *incx)
 {
-    sscal_( n, sa, sx, incx);
+    sscal_blis_impl( n, sa, sx, incx);
 }
 
 void SSPMV(const char   *uplo,const f77_int *n,const float  *alpha,const float  *ap,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sspmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    sspmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void sspmv(const char   *uplo,const f77_int *n,const float  *alpha,const float  *ap,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sspmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    sspmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void SSPMV_(const char   *uplo,const f77_int *n,const float  *alpha,const float  *ap,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    sspmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    sspmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void SSPR(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,float  *ap)
 {
-    sspr_( uplo, n, alpha, x, incx, ap);
+    sspr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void sspr(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,float  *ap)
 {
-    sspr_( uplo, n, alpha, x, incx, ap);
+    sspr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void SSPR_(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,float  *ap)
 {
-    sspr_( uplo, n, alpha, x, incx, ap);
+    sspr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void SSPR2(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *ap)
 {
-    sspr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    sspr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void sspr2(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *ap)
 {
-    sspr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    sspr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void SSPR2_(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *ap)
 {
-    sspr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    sspr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void SSWAP(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    sswap_( n, sx, incx, sy, incy);
+    sswap_blis_impl( n, sx, incx, sy, incy);
 }
 
 void sswap(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    sswap_( n, sx, incx, sy, incy);
+    sswap_blis_impl( n, sx, incx, sy, incy);
 }
 
 void SSWAP_(const f77_int *n,float  *sx,const f77_int *incx,float  *sy,const f77_int *incy)
 {
-    sswap_( n, sx, incx, sy, incy);
+    sswap_blis_impl( n, sx, incx, sy, incy);
 }
 
 void SSYMM(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    ssymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ssymm(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    ssymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SSYMM_(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    ssymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SSYMV(const char   *uplo,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    ssymv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    ssymv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ssymv(const char   *uplo,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    ssymv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    ssymv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SSYMV_(const char   *uplo,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,const float  *x,const f77_int *incx,const float  *beta,float  *y,const f77_int *incy)
 {
-    ssymv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    ssymv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void SSYR(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,float  *a,const f77_int *lda)
 {
-    ssyr_( uplo, n, alpha, x, incx, a, lda);
+    ssyr_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void ssyr(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,float  *a,const f77_int *lda)
 {
-    ssyr_( uplo, n, alpha, x, incx, a, lda);
+    ssyr_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void SSYR_(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,float  *a,const f77_int *lda)
 {
-    ssyr_( uplo, n, alpha, x, incx, a, lda);
+    ssyr_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void SSYR2(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *a,const f77_int *lda)
 {
-    ssyr2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    ssyr2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ssyr2(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *a,const f77_int *lda)
 {
-    ssyr2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    ssyr2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void SSYR2_(const char   *uplo,const f77_int *n,const float  *alpha,const float  *x,const f77_int *incx,const float  *y,const f77_int *incy,float  *a,const f77_int *lda)
 {
-    ssyr2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    ssyr2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void SSYR2K(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    ssyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ssyr2k(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    ssyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SSYR2K_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *b,const f77_int *ldb,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    ssyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SSYRK(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    ssyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void ssyrk(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    ssyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void SSYRK_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const float  *alpha,const float  *a,const f77_int *lda,const float  *beta,float  *c,const f77_int *ldc)
 {
-    ssyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    ssyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void STBMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    stbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    stbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void stbmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    stbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    stbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void STBMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    stbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    stbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void STBSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    stbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    stbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void stbsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    stbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    stbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void STBSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    stbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    stbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void STPMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *ap,float  *x,const f77_int *incx)
 {
-    stpmv_( uplo, trans, diag, n, ap, x, incx);
+    stpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void stpmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *ap,float  *x,const f77_int *incx)
 {
-    stpmv_( uplo, trans, diag, n, ap, x, incx);
+    stpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void STPMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *ap,float  *x,const f77_int *incx)
 {
-    stpmv_( uplo, trans, diag, n, ap, x, incx);
+    stpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void STPSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *ap,float  *x,const f77_int *incx)
 {
-    stpsv_( uplo, trans, diag, n, ap, x, incx);
+    stpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void stpsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *ap,float  *x,const f77_int *incx)
 {
-    stpsv_( uplo, trans, diag, n, ap, x, incx);
+    stpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void STPSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *ap,float  *x,const f77_int *incx)
 {
-    stpsv_( uplo, trans, diag, n, ap, x, incx);
+    stpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void STRMM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,float  *b,const f77_int *ldb)
 {
-    strmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    strmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void strmm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,float  *b,const f77_int *ldb)
 {
-    strmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    strmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void STRMM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,float  *b,const f77_int *ldb)
 {
-    strmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    strmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void STRMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    strmv_( uplo, trans, diag, n, a, lda, x, incx);
+    strmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void strmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    strmv_( uplo, trans, diag, n, a, lda, x, incx);
+    strmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void STRMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    strmv_( uplo, trans, diag, n, a, lda, x, incx);
+    strmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void STRSM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,float  *b,const f77_int *ldb)
 {
-    strsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    strsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void strsm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,float  *b,const f77_int *ldb)
 {
-    strsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    strsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void STRSM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const float  *alpha,const float  *a,const f77_int *lda,float  *b,const f77_int *ldb)
 {
-    strsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    strsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void STRSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    strsv_( uplo, trans, diag, n, a, lda, x, incx);
+    strsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void strsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    strsv_( uplo, trans, diag, n, a, lda, x, incx);
+    strsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void STRSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const float  *a,const f77_int *lda,float  *x,const f77_int *incx)
 {
-    strsv_( uplo, trans, diag, n, a, lda, x, incx);
+    strsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 int XERBLA(const char   *srname,const f77_int *info, ftnlen n)
 {
-    return xerbla_( srname, info, n);
+    return xerbla_blis_impl( srname, info, n);
 }
 
 int XERBLA_(const char   *srname,const f77_int *info, ftnlen n)
 {
-    return xerbla_( srname, info, n);
+    return xerbla_blis_impl( srname, info, n);
 }
 
 int xerbla(const char   *srname,const f77_int *info, ftnlen n)
 {
-    return xerbla_( srname, info, n);
+    return xerbla_blis_impl( srname, info, n);
 }
 
 void ZAXPY(const f77_int *n,const dcomplex *za,const dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zaxpy_( n, za, zx, incx, zy, incy);
+    zaxpy_blis_impl( n, za, zx, incx, zy, incy);
 }
 
 void zaxpy(const f77_int *n,const dcomplex *za,const dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zaxpy_( n, za, zx, incx, zy, incy);
+    zaxpy_blis_impl( n, za, zx, incx, zy, incy);
 }
 
 void ZAXPY_(const f77_int *n,const dcomplex *za,const dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zaxpy_( n, za, zx, incx, zy, incy);
+    zaxpy_blis_impl( n, za, zx, incx, zy, incy);
 }
 
 void ZCOPY(const f77_int *n,const dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zcopy_( n, zx, incx, zy, incy);
+    zcopy_blis_impl( n, zx, incx, zy, incy);
 }
 
 void zcopy(const f77_int *n,const dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zcopy_( n, zx, incx, zy, incy);
+    zcopy_blis_impl( n, zx, incx, zy, incy);
 }
 
 void ZCOPY_(const f77_int *n,const dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zcopy_( n, zx, incx, zy, incy);
+    zcopy_blis_impl( n, zx, incx, zy, incy);
 }
 
 void ZDROT(const f77_int *n,dcomplex *cx,const f77_int *incx,dcomplex *cy,const f77_int *incy,const double *c,const double *s)
 {
-    zdrot_( n, cx, incx, cy, incy, c, s);
+    zdrot_blis_impl( n, cx, incx, cy, incy, c, s);
 }
 
 void zdrot(const f77_int *n,dcomplex *cx,const f77_int *incx,dcomplex *cy,const f77_int *incy,const double *c,const double *s)
 {
-    zdrot_( n, cx, incx, cy, incy, c, s);
+    zdrot_blis_impl( n, cx, incx, cy, incy, c, s);
 }
 
 void ZDROT_(const f77_int *n,dcomplex *cx,const f77_int *incx,dcomplex *cy,const f77_int *incy,const double *c,const double *s)
 {
-    zdrot_( n, cx, incx, cy, incy, c, s);
+    zdrot_blis_impl( n, cx, incx, cy, incy, c, s);
 }
 
 void ZDSCAL(const f77_int *n,const double *da,dcomplex *zx,const f77_int *incx)
 {
-    zdscal_( n, da, zx, incx);
+    zdscal_blis_impl( n, da, zx, incx);
 }
 
 void zdscal(const f77_int *n,const double *da,dcomplex *zx,const f77_int *incx)
 {
-    zdscal_( n, da, zx, incx);
+    zdscal_blis_impl( n, da, zx, incx);
 }
 
 void ZDSCAL_(const f77_int *n,const double *da,dcomplex *zx,const f77_int *incx)
 {
-    zdscal_( n, da, zx, incx);
+    zdscal_blis_impl( n, da, zx, incx);
 }
 
 void ZGBMV(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    zgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void zgbmv(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    zgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZGBMV_(const char   *trans,const f77_int *m,const f77_int *n,const f77_int *kl,const f77_int *ku,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zgbmv_( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
+    zgbmv_blis_impl( trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZGEMM(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zgemm(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZGEMM_(const char   *transa,const char   *transb,const f77_int *m,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zgemm_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemm_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZGEMV(const char   *trans,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    zgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void zgemv(const char   *trans,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    zgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZGEMV_(const char   *trans,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zgemv_( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
+    zgemv_blis_impl( trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZGERC(const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zgerc_( m, n, alpha, x, incx, y, incy, a, lda);
+    zgerc_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void zgerc(const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zgerc_( m, n, alpha, x, incx, y, incy, a, lda);
+    zgerc_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ZGERC_(const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zgerc_( m, n, alpha, x, incx, y, incy, a, lda);
+    zgerc_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ZGERU(const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zgeru_( m, n, alpha, x, incx, y, incy, a, lda);
+    zgeru_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void zgeru(const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zgeru_( m, n, alpha, x, incx, y, incy, a, lda);
+    zgeru_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ZGERU_(const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zgeru_( m, n, alpha, x, incx, y, incy, a, lda);
+    zgeru_blis_impl( m, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ZHBMV(const char   *uplo,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    zhbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void zhbmv(const char   *uplo,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    zhbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZHBMV_(const char   *uplo,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhbmv_( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
+    zhbmv_blis_impl( uplo, n, k, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZHEMM(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zhemm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    zhemm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zhemm(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zhemm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    zhemm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZHEMM_(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zhemm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    zhemm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZHEMV(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhemv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    zhemv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void zhemv(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhemv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    zhemv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZHEMV_(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhemv_( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
+    zhemv_blis_impl( uplo, n, alpha, a, lda, x, incx, beta, y, incy);
 }
 
 void ZHER(const char   *uplo,const f77_int *n,const double *alpha,const dcomplex *x,const f77_int *incx,dcomplex *a,const f77_int *lda)
 {
-    zher_( uplo, n, alpha, x, incx, a, lda);
+    zher_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void zher(const char   *uplo,const f77_int *n,const double *alpha,const dcomplex *x,const f77_int *incx,dcomplex *a,const f77_int *lda)
 {
-    zher_( uplo, n, alpha, x, incx, a, lda);
+    zher_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void ZHER_(const char   *uplo,const f77_int *n,const double *alpha,const dcomplex *x,const f77_int *incx,dcomplex *a,const f77_int *lda)
 {
-    zher_( uplo, n, alpha, x, incx, a, lda);
+    zher_blis_impl( uplo, n, alpha, x, incx, a, lda);
 }
 
 void ZHER2(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zher2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    zher2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void zher2(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zher2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    zher2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ZHER2_(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *a,const f77_int *lda)
 {
-    zher2_( uplo, n, alpha, x, incx, y, incy, a, lda);
+    zher2_blis_impl( uplo, n, alpha, x, incx, y, incy, a, lda);
 }
 
 void ZHER2K(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const double *beta,dcomplex *c,const f77_int *ldc)
 {
-    zher2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zher2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zher2k(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const double *beta,dcomplex *c,const f77_int *ldc)
 {
-    zher2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zher2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZHER2K_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const double *beta,dcomplex *c,const f77_int *ldc)
 {
-    zher2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zher2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZHERK(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const dcomplex *a,const f77_int *lda,const double *beta,dcomplex *c,const f77_int *ldc)
 {
-    zherk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    zherk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void zherk(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const dcomplex *a,const f77_int *lda,const double *beta,dcomplex *c,const f77_int *ldc)
 {
-    zherk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    zherk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void ZHERK_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const double *alpha,const dcomplex *a,const f77_int *lda,const double *beta,dcomplex *c,const f77_int *ldc)
 {
-    zherk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    zherk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void ZHPMV(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *ap,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhpmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    zhpmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void zhpmv(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *ap,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhpmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    zhpmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void ZHPMV_(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *ap,const dcomplex *x,const f77_int *incx,const dcomplex *beta,dcomplex *y,const f77_int *incy)
 {
-    zhpmv_( uplo, n, alpha, ap, x, incx, beta, y, incy);
+    zhpmv_blis_impl( uplo, n, alpha, ap, x, incx, beta, y, incy);
 }
 
 void ZHPR(const char   *uplo,const f77_int *n,const bla_double *alpha,const dcomplex *x,const f77_int *incx,dcomplex *ap)
 {
-    zhpr_( uplo, n, alpha, x, incx, ap);
+    zhpr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void zhpr(const char   *uplo,const f77_int *n,const bla_double *alpha,const dcomplex *x,const f77_int *incx,dcomplex *ap)
 {
-    zhpr_( uplo, n, alpha, x, incx, ap);
+    zhpr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void ZHPR_(const char   *uplo,const f77_int *n,const bla_double *alpha,const dcomplex *x,const f77_int *incx,dcomplex *ap)
 {
-    zhpr_( uplo, n, alpha, x, incx, ap);
+    zhpr_blis_impl( uplo, n, alpha, x, incx, ap);
 }
 
 void ZHPR2(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *ap)
 {
-    zhpr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    zhpr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void zhpr2(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *ap)
 {
-    zhpr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    zhpr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void ZHPR2_(const char   *uplo,const f77_int *n,const dcomplex *alpha,const dcomplex *x,const f77_int *incx,const dcomplex *y,const f77_int *incy,dcomplex *ap)
 {
-    zhpr2_( uplo, n, alpha, x, incx, y, incy, ap);
+    zhpr2_blis_impl( uplo, n, alpha, x, incx, y, incy, ap);
 }
 
 void ZROTG(dcomplex *ca,bla_dcomplex *cb,bla_double *c,dcomplex *s)
 {
-    zrotg_( ca, cb, c, s);
+    zrotg_blis_impl( ca, cb, c, s);
 }
 
 void zrotg(dcomplex *ca,bla_dcomplex *cb,bla_double *c,dcomplex *s)
 {
-    zrotg_( ca, cb, c, s);
+    zrotg_blis_impl( ca, cb, c, s);
 }
 
 void ZROTG_(dcomplex *ca,bla_dcomplex *cb,bla_double *c,dcomplex *s)
 {
-    zrotg_( ca, cb, c, s);
+    zrotg_blis_impl( ca, cb, c, s);
 }
 
 void ZSCAL(const f77_int *n,const dcomplex *za,dcomplex *zx,const f77_int *incx)
 {
-    zscal_( n, za, zx, incx);
+    zscal_blis_impl( n, za, zx, incx);
 }
 
 void zscal(const f77_int *n,const dcomplex *za,dcomplex *zx,const f77_int *incx)
 {
-    zscal_( n, za, zx, incx);
+    zscal_blis_impl( n, za, zx, incx);
 }
 
 void ZSCAL_(const f77_int *n,const dcomplex *za,dcomplex *zx,const f77_int *incx)
 {
-    zscal_( n, za, zx, incx);
+    zscal_blis_impl( n, za, zx, incx);
 }
 
 void ZSWAP(const f77_int *n,dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zswap_( n, zx, incx, zy, incy);
+    zswap_blis_impl( n, zx, incx, zy, incy);
 }
 
 void zswap(const f77_int *n,dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zswap_( n, zx, incx, zy, incy);
+    zswap_blis_impl( n, zx, incx, zy, incy);
 }
 
 void ZSWAP_(const f77_int *n,dcomplex *zx,const f77_int *incx,dcomplex *zy,const f77_int *incy)
 {
-    zswap_( n, zx, incx, zy, incy);
+    zswap_blis_impl( n, zx, incx, zy, incy);
 }
 
 void ZSYMM(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    zsymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zsymm(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    zsymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZSYMM_(const char   *side,const char   *uplo,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsymm_( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
+    zsymm_blis_impl( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZSYR2K(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zsyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zsyr2k(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zsyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZSYR2K_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *b,const f77_int *ldb,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsyr2k_( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zsyr2k_blis_impl( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZSYRK(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    zsyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void zsyrk(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    zsyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void ZSYRK_(const char   *uplo,const char   *trans,const f77_int *n,const f77_int *k,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,const dcomplex *beta,dcomplex *c,const f77_int *ldc)
 {
-    zsyrk_( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
+    zsyrk_blis_impl( uplo, trans, n, k, alpha, a, lda, beta, c, ldc);
 }
 
 void ZTBMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ztbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ztbmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ztbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ZTBMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztbmv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ztbmv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ZTBSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ztbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ztbsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ztbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ZTBSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const f77_int *k,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztbsv_( uplo, trans, diag, n, k, a, lda, x, incx);
+    ztbsv_blis_impl( uplo, trans, diag, n, k, a, lda, x, incx);
 }
 
 void ZTPMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *ap,dcomplex *x,const f77_int *incx)
 {
-    ztpmv_( uplo, trans, diag, n, ap, x, incx);
+    ztpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ztpmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *ap,dcomplex *x,const f77_int *incx)
 {
-    ztpmv_( uplo, trans, diag, n, ap, x, incx);
+    ztpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ZTPMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *ap,dcomplex *x,const f77_int *incx)
 {
-    ztpmv_( uplo, trans, diag, n, ap, x, incx);
+    ztpmv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ZTPSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *ap,dcomplex *x,const f77_int *incx)
 {
-    ztpsv_( uplo, trans, diag, n, ap, x, incx);
+    ztpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ztpsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *ap,dcomplex *x,const f77_int *incx)
 {
-    ztpsv_( uplo, trans, diag, n, ap, x, incx);
+    ztpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ZTPSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *ap,dcomplex *x,const f77_int *incx)
 {
-    ztpsv_( uplo, trans, diag, n, ap, x, incx);
+    ztpsv_blis_impl( uplo, trans, diag, n, ap, x, incx);
 }
 
 void ZTRMM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,dcomplex *b,const f77_int *ldb)
 {
-    ztrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ztrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ztrmm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,dcomplex *b,const f77_int *ldb)
 {
-    ztrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ztrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ZTRMM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,dcomplex *b,const f77_int *ldb)
 {
-    ztrmm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ztrmm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ZTRMV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    ztrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ztrmv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    ztrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ZTRMV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztrmv_( uplo, trans, diag, n, a, lda, x, incx);
+    ztrmv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ZTRSM(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,dcomplex *b,const f77_int *ldb)
 {
-    ztrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ztrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ztrsm(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,dcomplex *b,const f77_int *ldb)
 {
-    ztrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ztrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ZTRSM_(const char   *side,const char   *uplo,const char   *transa,const char   *diag,const f77_int *m,const f77_int *n,const dcomplex *alpha,const dcomplex *a,const f77_int *lda,dcomplex *b,const f77_int *ldb)
 {
-    ztrsm_( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
+    ztrsm_blis_impl( side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb);
 }
 
 void ZTRSV(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    ztrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ztrsv(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    ztrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
 void ZTRSV_(const char   *uplo,const char   *trans,const char   *diag,const f77_int *n,const dcomplex *a,const f77_int *lda,dcomplex *x,const f77_int *incx)
 {
-    ztrsv_( uplo, trans, diag, n, a, lda, x, incx);
+    ztrsv_blis_impl( uplo, trans, diag, n, a, lda, x, incx);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
 
 void CDOTCSUB( const f77_int* n, const scomplex* x,const f77_int* incx, const scomplex* y, const f77_int* incy, scomplex* rval)
 {
-    cdotcsub_( n, x, incx, y, incy, rval);
+    cdotcsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void cdotcsub( const f77_int* n, const scomplex* x,const f77_int* incx, const scomplex* y, const f77_int* incy, scomplex* rval)
 {
-    cdotcsub_( n, x, incx, y, incy, rval);
+    cdotcsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void CDOTCSUB_( const f77_int* n, const scomplex* x,const f77_int* incx, const scomplex* y, const f77_int* incy, scomplex* rval)
 {
-    cdotcsub_( n, x, incx, y, incy, rval);
+    cdotcsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void CDOTUSUB( const f77_int* n, const scomplex* x,const f77_int* incxy, const scomplex* y, const f77_int* incy, scomplex* rval)
 {
-    cdotusub_( n, x, incxy, y, incy, rval);
+    cdotusub_blis_impl( n, x, incxy, y, incy, rval);
 }
 
 void cdotusub( const f77_int* n, const scomplex* x,const f77_int* incxy, const scomplex* y, const f77_int* incy, scomplex* rval)
 {
-    cdotusub_( n, x, incxy, y, incy, rval);
+    cdotusub_blis_impl( n, x, incxy, y, incy, rval);
 }
 
 void CDOTUSUB_( const f77_int* n, const scomplex* x,const f77_int* incxy, const scomplex* y, const f77_int* incy, scomplex* rval)
 {
-    cdotusub_( n, x, incxy, y, incy, rval);
+    cdotusub_blis_impl( n, x, incxy, y, incy, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void CGEMM3M( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const scomplex* alpha, const scomplex* a, const f77_int* lda, const scomplex* b, const f77_int* ldb, const scomplex* beta, scomplex* c, const f77_int* ldc)
 {
-    cgemm3m_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemm3m_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void cgemm3m( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const scomplex* alpha, const scomplex* a, const f77_int* lda, const scomplex* b, const f77_int* ldb, const scomplex* beta, scomplex* c, const f77_int* ldc)
 {
-    cgemm3m_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemm3m_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CGEMM3M_( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const scomplex* alpha, const scomplex* a, const f77_int* lda, const scomplex* b, const f77_int* ldb, const scomplex* beta, scomplex* c, const f77_int* ldc)
 {
-    cgemm3m_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemm3m_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CGEMM_BATCH( const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const scomplex* alpha_array, const scomplex** a_array, const  f77_int *lda_array, const scomplex** b_array, const f77_int *ldb_array, const scomplex* beta_array, scomplex** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    cgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    cgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void cgemm_batch( const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const scomplex* alpha_array, const scomplex** a_array, const  f77_int *lda_array, const scomplex** b_array, const f77_int *ldb_array, const scomplex* beta_array, scomplex** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    cgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    cgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void CGEMM_BATCH_( const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const scomplex* alpha_array, const scomplex** a_array, const  f77_int *lda_array, const scomplex** b_array, const f77_int *ldb_array, const scomplex* beta_array, scomplex** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    cgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    cgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void CGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  scomplex* alpha, const scomplex* a, const f77_int* lda, const scomplex* b, const f77_int* ldb, const scomplex* beta, scomplex* c, const f77_int* ldc)
 {
-    cgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void cgemmt( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  scomplex* alpha, const scomplex* a, const f77_int* lda, const scomplex* b, const f77_int* ldb, const scomplex* beta, scomplex* c, const f77_int* ldc)
 {
-    cgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void CGEMMT_( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  scomplex* alpha, const scomplex* a, const f77_int* lda, const scomplex* b, const f77_int* ldb, const scomplex* beta, scomplex* c, const f77_int* ldc)
 {
-    cgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    cgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
+//#ifdef BLIS_ENABLE_CBLAS
+
 void CIMATCOPY(f77_char* trans, f77_int* rows, f77_int* cols, const scomplex* alpha,scomplex* aptr, f77_int* lda, f77_int* ldb)
 {
     cimatcopy_( trans, rows, cols, alpha, aptr, lda, ldb);
@@ -2483,96 +2504,112 @@ void COMATCOPY_(f77_char* trans, f77_int* rows, f77_int* cols, const scomplex* a
     comatcopy_( trans, rows, cols, alpha, aptr, lda, bptr, ldb);
 }
 
+//#endif // BLIS_ENABLE_CBLAS
+
+#ifdef BLIS_ENABLE_CBLAS
+
 void DASUMSUB(const f77_int* n, const double* x, const f77_int* incx, double* rval)
 {
-    dasumsub_( n, x, incx, rval);
+    dasumsub_blis_impl( n, x, incx, rval);
 }
 
 void dasumsub(const f77_int* n, const double* x, const f77_int* incx, double* rval)
 {
-    dasumsub_( n, x, incx, rval);
+    dasumsub_blis_impl( n, x, incx, rval);
 }
 
 void DASUMSUB_(const f77_int* n, const double* x, const f77_int* incx, double* rval)
 {
-    dasumsub_( n, x, incx, rval);
+    dasumsub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void DAXPBY(const f77_int* n, const double* alpha, const double *x, const f77_int* incx, const double* beta, double *y, const f77_int* incy)
 {
-    daxpby_( n, alpha, x, incx, beta, y, incy);
+    daxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
 void daxpby(const f77_int* n, const double* alpha, const double *x, const f77_int* incx, const double* beta, double *y, const f77_int* incy)
 {
-    daxpby_( n, alpha, x, incx, beta, y, incy);
+    daxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
 void DAXPBY_(const f77_int* n, const double* alpha, const double *x, const f77_int* incx, const double* beta, double *y, const f77_int* incy)
 {
-    daxpby_( n, alpha, x, incx, beta, y, incy);
+    daxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void DDOTSUB(const f77_int* n, const double* x, const f77_int* incx, const double* y, const f77_int* incy, double* rval)
 {
-    ddotsub_( n, x, incx, y, incy, rval);
+    ddotsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void ddotsub(const f77_int* n, const double* x, const f77_int* incx, const double* y, const f77_int* incy, double* rval)
 {
-    ddotsub_( n, x, incx, y, incy, rval);
+    ddotsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void DDOTSUB_(const f77_int* n, const double* x, const f77_int* incx, const double* y, const f77_int* incy, double* rval)
 {
-    ddotsub_( n, x, incx, y, incy, rval);
+    ddotsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void DGEMM_BATCH( const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const double* alpha_array, const double** a_array, const  f77_int *lda_array, const double** b_array, const f77_int *ldb_array, const double* beta_array, double** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    dgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    dgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void dgemm_batch( const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const double* alpha_array, const double** a_array, const  f77_int *lda_array, const double** b_array, const f77_int *ldb_array, const double* beta_array, double** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    dgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    dgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void DGEMM_BATCH_( const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const double* alpha_array, const double** a_array, const  f77_int *lda_array, const double** b_array, const f77_int *ldb_array, const double* beta_array, double** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    dgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    dgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void DGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  double* alpha, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc)
 {
-    dgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void dgemmt( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  double* alpha, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc)
 {
-    dgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void DGEMMT_( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  double* alpha, const double* a, const f77_int* lda, const double* b, const f77_int* ldb, const double* beta, double* c, const f77_int* ldc)
 {
-    dgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    dgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void DNRM2SUB(const f77_int* n, const double* x, const f77_int* incx, double *rval)
 {
-    dnrm2sub_( n, x, incx, rval);
+    dnrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void dnrm2sub(const f77_int* n, const double* x, const f77_int* incx, double *rval)
 {
-    dnrm2sub_( n, x, incx, rval);
+    dnrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void DNRM2SUB_(const f77_int* n, const double* x, const f77_int* incx, double *rval)
 {
-    dnrm2sub_( n, x, incx, rval);
+    dnrm2sub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
+//#ifdef BLIS_ENABLE_CBLAS
+
 void DOMATADD(f77_char* transa,f77_char* transb, f77_int* m, f77_int* n, const double* alpha, const double* A, f77_int* lda, const double* beta, const double* B, f77_int* ldb, double* C, f77_int* ldc)
 {
     domatadd_( transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
@@ -2618,321 +2655,349 @@ void DOMATCOPY_(f77_char* trans, f77_int* rows, f77_int* cols, const double* alp
     domatcopy_( trans, rows, cols, alpha, aptr, lda, bptr, ldb);
 }
 
+//#endif // BLIS_ENABLE_CBLAS
+
+#ifdef BLIS_ENABLE_CBLAS
+
 void DZASUMSUB(const f77_int* n, const dcomplex* x, const f77_int* incx, double* rval)
 {
-    dzasumsub_( n, x, incx, rval);
+    dzasumsub_blis_impl( n, x, incx, rval);
 }
 
 void dzasumsub(const f77_int* n, const dcomplex* x, const f77_int* incx, double* rval)
 {
-    dzasumsub_( n, x, incx, rval);
+    dzasumsub_blis_impl( n, x, incx, rval);
 }
 
 void DZASUMSUB_(const f77_int* n, const dcomplex* x, const f77_int* incx, double* rval)
 {
-    dzasumsub_( n, x, incx, rval);
+    dzasumsub_blis_impl( n, x, incx, rval);
 }
 
 void DZNRM2SUB(const f77_int* n, const dcomplex* x, const f77_int* incx, double* rval)
 {
-    dznrm2sub_( n, x, incx, rval);
+    dznrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void dznrm2sub(const f77_int* n, const dcomplex* x, const f77_int* incx, double* rval)
 {
-    dznrm2sub_( n, x, incx, rval);
+    dznrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void DZNRM2SUB_(const f77_int* n, const dcomplex* x, const f77_int* incx, double* rval)
 {
-    dznrm2sub_( n, x, incx, rval);
+    dznrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void ICAMAXSUB(const f77_int* n, const scomplex* x, const f77_int* incx, f77_int* rval)
 {
-    icamaxsub_( n, x, incx, rval);
+    icamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void icamaxsub(const f77_int* n, const scomplex* x, const f77_int* incx, f77_int* rval)
 {
-    icamaxsub_( n, x, incx, rval);
+    icamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void ICAMAXSUB_(const f77_int* n, const scomplex* x, const f77_int* incx, f77_int* rval)
 {
-    icamaxsub_( n, x, incx, rval);
+    icamaxsub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 f77_int ICAMIN( const f77_int* n, const scomplex* x, const f77_int* incx)
 {
-    return icamin_( n, x, incx);
+    return icamin_blis_impl( n, x, incx);
 }
 
 f77_int icamin( const f77_int* n, const scomplex* x, const f77_int* incx)
 {
-    return icamin_( n, x, incx);
+    return icamin_blis_impl( n, x, incx);
 }
 
 f77_int ICAMIN_( const f77_int* n, const scomplex* x, const f77_int* incx)
 {
-    return icamin_( n, x, incx);
+    return icamin_blis_impl( n, x, incx);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void ICAMINSUB( const f77_int* n, const scomplex* x, const f77_int* incx, f77_int* rval)
 {
-    icaminsub_( n, x, incx, rval);
+    icaminsub_blis_impl( n, x, incx, rval);
 }
 
 void icaminsub( const f77_int* n, const scomplex* x, const f77_int* incx, f77_int* rval)
 {
-    icaminsub_( n, x, incx, rval);
+    icaminsub_blis_impl( n, x, incx, rval);
 }
 
 void ICAMINSUB_( const f77_int* n, const scomplex* x, const f77_int* incx, f77_int* rval)
 {
-    icaminsub_( n, x, incx, rval);
+    icaminsub_blis_impl( n, x, incx, rval);
 }
 
 void IDAMAXSUB( const f77_int* n, const double* x, const f77_int* incx, f77_int* rval)
 {
-    idamaxsub_( n, x, incx, rval);
+    idamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void idamaxsub( const f77_int* n, const double* x, const f77_int* incx, f77_int* rval)
 {
-    idamaxsub_( n, x, incx, rval);
+    idamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void IDAMAXSUB_( const f77_int* n, const double* x, const f77_int* incx, f77_int* rval)
 {
-    idamaxsub_( n, x, incx, rval);
+    idamaxsub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 f77_int IDAMIN( const f77_int* n, const double* x, const f77_int* incx)
 {
-    return idamin_( n, x, incx);
+    return idamin_blis_impl( n, x, incx);
 }
 
 f77_int idamin( const f77_int* n, const double* x, const f77_int* incx)
 {
-    return idamin_( n, x, incx);
+    return idamin_blis_impl( n, x, incx);
 }
 
 f77_int IDAMIN_( const f77_int* n, const double* x, const f77_int* incx)
 {
-    return idamin_( n, x, incx);
+    return idamin_blis_impl( n, x, incx);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void IDAMINSUB(const f77_int* n, const double* x, const f77_int* incx, f77_int* rval)
 {
-    idaminsub_( n, x, incx, rval);
+    idaminsub_blis_impl( n, x, incx, rval);
 }
 
 void idaminsub(const f77_int* n, const double* x, const f77_int* incx, f77_int* rval)
 {
-    idaminsub_( n, x, incx, rval);
+    idaminsub_blis_impl( n, x, incx, rval);
 }
 
 void IDAMINSUB_(const f77_int* n, const double* x, const f77_int* incx, f77_int* rval)
 {
-    idaminsub_( n, x, incx, rval);
+    idaminsub_blis_impl( n, x, incx, rval);
 }
 
 void ISAMAXSUB( const f77_int* n, const float* x, const f77_int* incx, f77_int* rval)
 {
-    isamaxsub_( n, x, incx, rval);
+    isamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void isamaxsub( const f77_int* n, const float* x, const f77_int* incx, f77_int* rval)
 {
-    isamaxsub_( n, x, incx, rval);
+    isamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void ISAMAXSUB_( const f77_int* n, const float* x, const f77_int* incx, f77_int* rval)
 {
-    isamaxsub_( n, x, incx, rval);
+    isamaxsub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 f77_int ISAMIN( const f77_int* n, const float* x, const f77_int* incx)
 {
-    return isamin_( n, x, incx);
+    return isamin_blis_impl( n, x, incx);
 }
 
 f77_int isamin( const f77_int* n, const float* x, const f77_int* incx)
 {
-    return isamin_( n, x, incx);
+    return isamin_blis_impl( n, x, incx);
 }
 
 f77_int ISAMIN_( const f77_int* n, const float* x, const f77_int* incx)
 {
-    return isamin_( n, x, incx);
+    return isamin_blis_impl( n, x, incx);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void ISAMINSUB( const f77_int* n, const float* x, const f77_int* incx, f77_int* rval)
 {
-    isaminsub_( n, x, incx, rval);
+    isaminsub_blis_impl( n, x, incx, rval);
 }
 
 void isaminsub( const f77_int* n, const float* x, const f77_int* incx, f77_int* rval)
 {
-    isaminsub_( n, x, incx, rval);
+    isaminsub_blis_impl( n, x, incx, rval);
 }
 
 void ISAMINSUB_( const f77_int* n, const float* x, const f77_int* incx, f77_int* rval)
 {
-    isaminsub_( n, x, incx, rval);
+    isaminsub_blis_impl( n, x, incx, rval);
 }
 
 void IZAMAXSUB( const f77_int* n, const dcomplex* x, const f77_int* incx, f77_int* rval)
 {
-    izamaxsub_( n, x, incx, rval);
+    izamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void izamaxsub( const f77_int* n, const dcomplex* x, const f77_int* incx, f77_int* rval)
 {
-    izamaxsub_( n, x, incx, rval);
+    izamaxsub_blis_impl( n, x, incx, rval);
 }
 
 void IZAMAXSUB_( const f77_int* n, const dcomplex* x, const f77_int* incx, f77_int* rval)
 {
-    izamaxsub_( n, x, incx, rval);
+    izamaxsub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 f77_int IZAMIN( const f77_int* n, const dcomplex* x, const f77_int* incx)
 {
-    return izamin_( n, x, incx);
+    return izamin_blis_impl( n, x, incx);
 }
 
 f77_int izamin( const f77_int* n, const dcomplex* x, const f77_int* incx)
 {
-    return izamin_( n, x, incx);
+    return izamin_blis_impl( n, x, incx);
 }
 
 f77_int IZAMIN_( const f77_int* n, const dcomplex* x, const f77_int* incx)
 {
-    return izamin_( n, x, incx);
+    return izamin_blis_impl( n, x, incx);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void IZAMINSUB( const f77_int* n, const dcomplex* x, const f77_int* incx, f77_int* rval)
 {
-    izaminsub_( n, x, incx, rval);
+    izaminsub_blis_impl( n, x, incx, rval);
 }
 
 void izaminsub( const f77_int* n, const dcomplex* x, const f77_int* incx, f77_int* rval)
 {
-    izaminsub_( n, x, incx, rval);
+    izaminsub_blis_impl( n, x, incx, rval);
 }
 
 void IZAMINSUB_( const f77_int* n, const dcomplex* x, const f77_int* incx, f77_int* rval)
 {
-    izaminsub_( n, x, incx, rval);
+    izaminsub_blis_impl( n, x, incx, rval);
 }
 
 void SASUMSUB( const f77_int* n, const float* x, const f77_int* incx, float* rval)
 {
-    sasumsub_( n, x, incx, rval);
+    sasumsub_blis_impl( n, x, incx, rval);
 }
 
 void sasumsub( const f77_int* n, const float* x, const f77_int* incx, float* rval)
 {
-    sasumsub_( n, x, incx, rval);
+    sasumsub_blis_impl( n, x, incx, rval);
 }
 
 void SASUMSUB_( const f77_int* n, const float* x, const f77_int* incx, float* rval)
 {
-    sasumsub_( n, x, incx, rval);
+    sasumsub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void SAXPBY( const f77_int* n, const float* alpha, const float *x, const f77_int* incx, const float* beta, float *y, const f77_int* incy)
 {
-    saxpby_( n, alpha, x, incx, beta, y, incy);
+    saxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
 void saxpby( const f77_int* n, const float* alpha, const float *x, const f77_int* incx, const float* beta, float *y, const f77_int* incy)
 {
-    saxpby_( n, alpha, x, incx, beta, y, incy);
+    saxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
 void SAXPBY_( const f77_int* n, const float* alpha, const float *x, const f77_int* incx, const float* beta, float *y, const f77_int* incy)
 {
-    saxpby_( n, alpha, x, incx, beta, y, incy);
+    saxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void SCASUMSUB( const f77_int* n, const scomplex* x, const f77_int* incx, float* rval)
 {
-    scasumsub_( n, x, incx, rval);
+    scasumsub_blis_impl( n, x, incx, rval);
 }
 
 void scasumsub( const f77_int* n, const scomplex* x, const f77_int* incx, float* rval)
 {
-    scasumsub_( n, x, incx, rval);
+    scasumsub_blis_impl( n, x, incx, rval);
 }
 
 void SCASUMSUB_( const f77_int* n, const scomplex* x, const f77_int* incx, float* rval)
 {
-    scasumsub_( n, x, incx, rval);
+    scasumsub_blis_impl( n, x, incx, rval);
 }
 
 void SCNRM2SUB( const f77_int* n, const scomplex* x, const f77_int* incx, float* rval)
 {
-    scnrm2sub_( n, x, incx, rval);
+    scnrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void scnrm2sub( const f77_int* n, const scomplex* x, const f77_int* incx, float* rval)
 {
-    scnrm2sub_( n, x, incx, rval);
+    scnrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void SCNRM2SUB_( const f77_int* n, const scomplex* x, const f77_int* incx, float* rval)
 {
-    scnrm2sub_( n, x, incx, rval);
+    scnrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void SDOTSUB( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval)
 {
-    sdotsub_( n, x, incx, y, incy, rval);
+    sdotsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void sdotsub( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval)
 {
-    sdotsub_( n, x, incx, y, incy, rval);
+    sdotsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void SDOTSUB_( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* rval)
 {
-    sdotsub_( n, x, incx, y, incy, rval);
+    sdotsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void SGEMM_BATCH(const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const float* alpha_array, const float** a_array, const  f77_int *lda_array, const float** b_array, const f77_int *ldb_array, const float* beta_array, float** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    sgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    sgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void sgemm_batch(const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const float* alpha_array, const float** a_array, const  f77_int *lda_array, const float** b_array, const f77_int *ldb_array, const float* beta_array, float** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    sgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    sgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void SGEMM_BATCH_(const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const float* alpha_array, const float** a_array, const  f77_int *lda_array, const float** b_array, const f77_int *ldb_array, const float* beta_array, float** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    sgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    sgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void SGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  float* alpha, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc)
 {
-    sgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    sgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void sgemmt( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  float* alpha, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc)
 {
-    sgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    sgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void SGEMMT_( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  float* alpha, const float* a, const f77_int* lda, const float* b, const f77_int* ldb, const float* beta, float* c, const f77_int* ldc)
 {
-    sgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    sgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
+//#ifdef BLIS_ENABLE_CBLAS
+
 void SIMATCOPY( f77_char* trans, f77_int* rows, f77_int* cols, const float* alpha,float* aptr, f77_int* lda, f77_int* ldb)
 {
     simatcopy_( trans, rows, cols, alpha, aptr, lda, ldb);
@@ -2948,21 +3013,29 @@ void SIMATCOPY_( f77_char* trans, f77_int* rows, f77_int* cols, const float* alp
     simatcopy_( trans, rows, cols, alpha, aptr, lda, ldb);
 }
 
+//#endif // BLIS_ENABLE_CBLAS
+
+#ifdef BLIS_ENABLE_CBLAS
+
 void SNRM2SUB( const f77_int* n, const float* x, const f77_int* incx, float *rval)
 {
-    snrm2sub_( n, x, incx, rval);
+    snrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void snrm2sub( const f77_int* n, const float* x, const f77_int* incx, float *rval)
 {
-    snrm2sub_( n, x, incx, rval);
+    snrm2sub_blis_impl( n, x, incx, rval);
 }
 
 void SNRM2SUB_( const f77_int* n, const float* x, const f77_int* incx, float *rval)
 {
-    snrm2sub_( n, x, incx, rval);
+    snrm2sub_blis_impl( n, x, incx, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
+//#ifdef BLIS_ENABLE_CBLAS
+
 void SOMATADD( f77_char* transa,f77_char* transb, f77_int* m, f77_int* n, const float* alpha, const float* A, f77_int* lda, const float* beta, const float* B, f77_int* ldb, float* C, f77_int* ldc)
 {
     somatadd_( transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
@@ -3008,96 +3081,104 @@ void SOMATCOPY_( f77_char* trans, f77_int* rows, f77_int* cols, const float* alp
     somatcopy_( trans, rows, cols, alpha, aptr, lda, bptr, ldb);
 }
 
+//#endif // BLIS_ENABLE_CBLAS
+
 void ZAXPBY( const f77_int* n, const dcomplex* alpha, const dcomplex *x, const f77_int* incx, const dcomplex* beta, dcomplex *y, const f77_int* incy)
 {
-    zaxpby_( n, alpha, x, incx, beta, y, incy);
+    zaxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
 void zaxpby( const f77_int* n, const dcomplex* alpha, const dcomplex *x, const f77_int* incx, const dcomplex* beta, dcomplex *y, const f77_int* incy)
 {
-    zaxpby_( n, alpha, x, incx, beta, y, incy);
+    zaxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
 void ZAXPBY_( const f77_int* n, const dcomplex* alpha, const dcomplex *x, const f77_int* incx, const dcomplex* beta, dcomplex *y, const f77_int* incy)
 {
-    zaxpby_( n, alpha, x, incx, beta, y, incy);
+    zaxpby_blis_impl( n, alpha, x, incx, beta, y, incy);
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void ZDOTCSUB( const f77_int* n, const dcomplex* x, const f77_int* incx, const dcomplex* y, const f77_int* incy, dcomplex* rval)
 {
-    zdotcsub_( n, x, incx, y, incy, rval);
+    zdotcsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void zdotcsub( const f77_int* n, const dcomplex* x, const f77_int* incx, const dcomplex* y, const f77_int* incy, dcomplex* rval)
 {
-    zdotcsub_( n, x, incx, y, incy, rval);
+    zdotcsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void ZDOTCSUB_( const f77_int* n, const dcomplex* x, const f77_int* incx, const dcomplex* y, const f77_int* incy, dcomplex* rval)
 {
-    zdotcsub_( n, x, incx, y, incy, rval);
+    zdotcsub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void ZDOTUSUB( const f77_int* n, const dcomplex* x, const f77_int* incx,const dcomplex* y, const f77_int* incy, dcomplex* rval)
 {
-    zdotusub_( n, x, incx, y, incy, rval);
+    zdotusub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void zdotusub( const f77_int* n, const dcomplex* x, const f77_int* incx,const dcomplex* y, const f77_int* incy, dcomplex* rval)
 {
-    zdotusub_( n, x, incx, y, incy, rval);
+    zdotusub_blis_impl( n, x, incx, y, incy, rval);
 }
 
 void ZDOTUSUB_( const f77_int* n, const dcomplex* x, const f77_int* incx,const dcomplex* y, const f77_int* incy, dcomplex* rval)
 {
-    zdotusub_( n, x, incx, y, incy, rval);
+    zdotusub_blis_impl( n, x, incx, y, incy, rval);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void ZGEMM3M( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const dcomplex* alpha, const dcomplex* a, const f77_int* lda, const dcomplex* b, const f77_int* ldb, const dcomplex* beta, dcomplex* c, const f77_int* ldc)
 {
-    zgemm3m_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemm3m_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zgemm3m( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const dcomplex* alpha, const dcomplex* a, const f77_int* lda, const dcomplex* b, const f77_int* ldb, const dcomplex* beta, dcomplex* c, const f77_int* ldc)
 {
-    zgemm3m_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemm3m_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZGEMM3M_( const f77_char* transa, const f77_char* transb, const f77_int* m, const f77_int* n, const f77_int* k, const dcomplex* alpha, const dcomplex* a, const f77_int* lda, const dcomplex* b, const f77_int* ldb, const dcomplex* beta, dcomplex* c, const f77_int* ldc)
 {
-    zgemm3m_( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemm3m_blis_impl( transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZGEMM_BATCH(  const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const dcomplex* alpha_array, const dcomplex** a_array, const  f77_int *lda_array, const dcomplex** b_array, const f77_int *ldb_array, const dcomplex* beta_array, dcomplex** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    zgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    zgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void zgemm_batch(  const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const dcomplex* alpha_array, const dcomplex** a_array, const  f77_int *lda_array, const dcomplex** b_array, const f77_int *ldb_array, const dcomplex* beta_array, dcomplex** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    zgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    zgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void ZGEMM_BATCH_(  const f77_char* transa_array, const f77_char* transb_array,const f77_int *m_array, const f77_int *n_array, const f77_int *k_array,const dcomplex* alpha_array, const dcomplex** a_array, const  f77_int *lda_array, const dcomplex** b_array, const f77_int *ldb_array, const dcomplex* beta_array, dcomplex** c_array, const f77_int *ldc_array, const f77_int* group_count, const f77_int *group_size)
 {
-    zgemm_batch_( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
+    zgemm_batch_blis_impl( transa_array, transb_array, m_array, n_array, k_array, alpha_array, a_array, lda_array, b_array, ldb_array, beta_array, c_array, ldc_array, group_count, group_size);
 }
 
 void ZGEMMT( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  dcomplex* alpha, const dcomplex* a, const f77_int* lda, const dcomplex* b, const f77_int* ldb, const dcomplex* beta, dcomplex* c, const f77_int* ldc)
 {
-    zgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void zgemmt( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  dcomplex* alpha, const dcomplex* a, const f77_int* lda, const dcomplex* b, const f77_int* ldb, const dcomplex* beta, dcomplex* c, const f77_int* ldc)
 {
-    zgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
 void ZGEMMT_( const f77_char* uploc, const f77_char* transa, const f77_char* transb, const f77_int* n, const f77_int* k, const  dcomplex* alpha, const dcomplex* a, const f77_int* lda, const dcomplex* b, const f77_int* ldb, const dcomplex* beta, dcomplex* c, const f77_int* ldc)
 {
-    zgemmt_( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    zgemmt_blis_impl( uploc, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
 }
 
+//#ifdef BLIS_ENABLE_CBLAS
+
 void ZIMATCOPY(f77_char* trans, f77_int* rows, f77_int* cols, const dcomplex* alpha,dcomplex* aptr, f77_int* lda, f77_int* ldb)
 {
     zimatcopy_( trans, rows, cols, alpha, aptr, lda, ldb);
@@ -3158,68 +3239,74 @@ void ZOMATCOPY_(f77_char* trans, f77_int* rows, f77_int* cols, const dcomplex* a
     zomatcopy_( trans, rows, cols, alpha, aptr, lda, bptr, ldb);
 }
 
-
+//#endif // BLIS_ENABLE_CBLAS
 
 float SCABS1(bla_scomplex* z)
 {
-    return scabs1_( z);
+    return scabs1_blis_impl( z);
 }
 
 float scabs1(bla_scomplex* z)
 {
-    return scabs1_( z);
+    return scabs1_blis_impl( z);
 }
 
 float SCABS1_(bla_scomplex* z)
 {
-    return scabs1_( z);
+    return scabs1_blis_impl( z);
 
 }
 
+#ifdef BLIS_ENABLE_CBLAS
+
 void SDSDOTSUB( const f77_int* n, float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* dot)
 {
-    sdsdotsub_( n, sb, x, incx, y, incy, dot);
+    sdsdotsub_blis_impl( n, sb, x, incx, y, incy, dot);
 }
 
 void sdsdotsub( const f77_int* n, float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* dot)
 {
-    sdsdotsub_( n, sb, x, incx, y, incy, dot);
+    sdsdotsub_blis_impl( n, sb, x, incx, y, incy, dot);
 }
 
 void SDSDOTSUB_( const f77_int* n, float* sb, const float* x, const f77_int* incx, const float* y, const f77_int* incy, float* dot)
 {
-    sdsdotsub_( n, sb, x, incx, y, incy, dot);
+    sdsdotsub_blis_impl( n, sb, x, incx, y, incy, dot);
 }
 
 void DSDOTSUB( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* dot)
 {
-    dsdotsub_( n, x, incx, y, incy, dot);
+    dsdotsub_blis_impl( n, x, incx, y, incy, dot);
 }
 
 void dsdotsub( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* dot)
 {
-    dsdotsub_( n, x, incx, y, incy, dot);
+    dsdotsub_blis_impl( n, x, incx, y, incy, dot);
 }
 
 void DSDOTSUB_( const f77_int* n, const float* x, const f77_int* incx, const float* y, const f77_int* incy, double* dot)
 {
-    dsdotsub_( n, x, incx, y, incy, dot);
+    dsdotsub_blis_impl( n, x, incx, y, incy, dot);
 }
 
+#endif // BLIS_ENABLE_CBLAS
+
 void CAXPBY( const f77_int* n,  const scomplex* alpha,  const scomplex *x,  const f77_int* incx,  const scomplex* beta,  scomplex *y,  const f77_int* incy)
 {
-    caxpby_(n, alpha, x, incx, beta, y, incy);
+    caxpby_blis_impl(n, alpha, x, incx, beta, y, incy);
 }
 
 void caxpby( const f77_int* n,  const scomplex* alpha,  const scomplex *x,  const f77_int* incx,  const scomplex* beta,  scomplex *y,  const f77_int* incy)
 {
-    caxpby_(n, alpha, x, incx, beta, y, incy);
+    caxpby_blis_impl(n, alpha, x, incx, beta, y, incy);
 }
 
 void CAXPBY_( const f77_int* n,  const scomplex* alpha,  const scomplex *x,  const f77_int* incx,  const scomplex* beta,  scomplex *y,  const f77_int* incy)
 {
-    caxpby_(n, alpha, x, incx, beta, y, incy);
+    caxpby_blis_impl(n, alpha, x, incx, beta, y, incy);
 }
 
 #endif
 #endif
+
+#endif // BLIS_ENABLE_BLAS
diff --git a/frame/util/bli_util_api_wrap.h b/frame/util/bli_util_api_wrap.h
index 86471c76f6..7f458316d0 100644
--- a/frame/util/bli_util_api_wrap.h
+++ b/frame/util/bli_util_api_wrap.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -32,6 +32,8 @@
 
 */
 
+#ifdef BLIS_ENABLE_BLAS
+
 // file define different formats of BLAS APIs- uppercase with
 // and without underscore, lowercase without underscore.
 
@@ -1087,6 +1089,14 @@ BLIS_EXPORT_BLIS void DGEMM_(const char   *transa, const char   *transb, const f
 
 
 
+BLIS_EXPORT_BLIS void DZGEMM( const f77_char *transa, const f77_char *transb, const f77_int *m, const f77_int *n, const f77_int *k, const dcomplex *alpha, const double *a, const f77_int *lda, const dcomplex *b, const f77_int *ldb, const dcomplex *beta, dcomplex *c, const f77_int *ldc );
+
+BLIS_EXPORT_BLIS void dzgemm( const f77_char *transa, const f77_char *transb, const f77_int *m, const f77_int *n, const f77_int *k, const dcomplex *alpha, const double *a, const f77_int *lda, const dcomplex *b, const f77_int *ldb, const dcomplex *beta, dcomplex *c, const f77_int *ldc );
+
+BLIS_EXPORT_BLIS void DZGEMM_( const f77_char *transa, const f77_char *transb, const f77_int *m, const f77_int *n, const f77_int *k, const dcomplex *alpha, const double *a, const f77_int *lda, const dcomplex *b, const f77_int *ldb, const dcomplex *beta, dcomplex *c, const f77_int *ldc );
+
+
+
 BLIS_EXPORT_BLIS void DSYMM(const char   *side, const char   *uplo, const f77_int *m, const f77_int *n, const double *alpha, const double *a, const f77_int *lda, const double *b, const f77_int *ldb, const double *beta, double *c, const f77_int *ldc);
 
 BLIS_EXPORT_BLIS void dsymm(const char   *side, const char   *uplo, const f77_int *m, const f77_int *n, const double *alpha, const double *a, const f77_int *lda, const double *b, const f77_int *ldb, const double *beta, double *c, const f77_int *ldc);
@@ -1272,6 +1282,9 @@ BLIS_EXPORT_BLIS void ZTRSM_(const char   *side, const char   *uplo, const char
 
 
 // Miscellaneous APIs
+
+#ifdef BLIS_ENABLE_CBLAS
+
 BLIS_EXPORT_BLIS void CDOTCSUB( const f77_int* n,  const scomplex* x, const f77_int* incx,  const scomplex* y,  const f77_int* incy,  scomplex* rval);
 
 BLIS_EXPORT_BLIS void cdotcsub( const f77_int* n,  const scomplex* x, const f77_int* incx,  const scomplex* y,  const f77_int* incy,  scomplex* rval);
@@ -1462,6 +1475,7 @@ BLIS_EXPORT_BLIS void dsdotsub( const f77_int* n,  const float* x,  const f77_in
 
 BLIS_EXPORT_BLIS void DSDOTSUB_( const f77_int* n,  const float* x,  const f77_int* incx,  const float* y,  const f77_int* incy,  double* dot);
 
+#endif // BLIS_ENABLE_CBLAS
 
 
 BLIS_EXPORT_BLIS f77_int LSAME(const char   *ca, const char   *cb, const f77_int a, const f77_int b);
@@ -1609,6 +1623,7 @@ BLIS_EXPORT_BLIS void zgemmt( const f77_char* uploc,  const f77_char* transa,  c
 BLIS_EXPORT_BLIS void ZGEMMT_( const f77_char* uploc,  const f77_char* transa,  const f77_char* transb,  const f77_int* n,  const f77_int* k,  const  dcomplex* alpha,  const dcomplex* a,  const f77_int* lda,  const dcomplex* b,  const f77_int* ldb,  const dcomplex* beta,  dcomplex* c,  const f77_int* ldc);
 
 
+//#ifdef BLIS_ENABLE_CBLAS
 
 BLIS_EXPORT_BLIS void CIMATCOPY(f77_char* trans,  f77_int* rows,  f77_int* cols,  const scomplex* alpha, scomplex* aptr,  f77_int* lda,  f77_int* ldb);
 
@@ -1728,6 +1743,9 @@ BLIS_EXPORT_BLIS void zomatcopy(f77_char* trans,  f77_int* rows,  f77_int* cols,
 
 BLIS_EXPORT_BLIS void ZOMATCOPY_(f77_char* trans,  f77_int* rows,  f77_int* cols,  const dcomplex* alpha,  const dcomplex* aptr,  f77_int* lda,  dcomplex* bptr,  f77_int* ldb);
 
+//#endif // BLIS_ENABLE_CBLAS
 
 #endif
 #endif
+
+#endif // BLIS_ENABLE_BLAS
diff --git a/frame/util/bli_util_progress.c b/frame/util/bli_util_progress.c
index 4097eb1126..77374b4c3d 100644
--- a/frame/util/bli_util_progress.c
+++ b/frame/util/bli_util_progress.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -44,13 +44,14 @@ BLIS_TLS_TYPE dim_t tls_aoclprogress_counter;
 // update freqency.
 BLIS_TLS_TYPE dim_t tls_aoclprogress_last_update;
 
-
 // AOCL_progress_ptr contains the pointer to the callback function
-// By default it is set to NULL, which effectivly disabled the 
-// progress feature. 
-AOCL_progress_callback AOCL_progress_ptr = NULL;
+// By default it is set to NULL, which effectively disables the
+// progress feature.
+// AOCL_progress_ptr can be updated by any thread outside BLIS,
+// hence it is declared volatile so the compiler does not optimise accesses.
+volatile AOCL_progress_callback AOCL_progress_ptr = NULL;
 
 void AOCL_BLIS_set_progress(AOCL_progress_callback func)
 {
     AOCL_progress_ptr = func;
-}
\ No newline at end of file
+}
diff --git a/frame/util/bli_util_progress.h b/frame/util/bli_util_progress.h
index ed7a79cb66..ea094739d9 100644
--- a/frame/util/bli_util_progress.h
+++ b/frame/util/bli_util_progress.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -47,7 +47,9 @@ BLIS_EXPORT_BLIS void AOCL_BLIS_set_progress(AOCL_progress_callback func);
 
 // Private interfaces for internal use
 
-extern AOCL_progress_callback AOCL_progress_ptr;
+// AOCL_progress_ptr can be updated by any thread outside BLIS,
+// hence it is declared volatile so the compiler does not optimise accesses.
+extern volatile AOCL_progress_callback AOCL_progress_ptr;
 
 extern BLIS_TLS_TYPE dim_t tls_aoclprogress_counter;
 extern BLIS_TLS_TYPE dim_t tls_aoclprogress_last_update;
@@ -57,18 +59,4 @@ extern BLIS_TLS_TYPE dim_t tls_aoclprogress_last_update;
 // elements are processed in the current thread.
 #define AOCL_PROGRESS_FREQUENCY 1e+9
 
-#define MAX_API_NAME_LEN 20
-
-// Macro to send update using datatype character and the api name
-#define AOCL_PROGRESS_DT(dt, api, progress, tid, nt) \
-        char buf[MAX_API_NAME_LEN]; \
-        snprintf(buf, MAX_API_NAME_LEN, "%c%s", dt, api); \
-        (*AOCL_progress_ptr) (buf, strlen(buf), progress, tid, nt); \
-
-// Macro to send update using api name alone.
-#define AOCL_PROGRESS_NAME(api, progress, tid, nt) \
-        char buf[MAX_API_NAME_LEN]; \
-        snprintf(buf, MAX_API_NAME_LEN, "%s", dt, api); \
-        (*AOCL_progress_ptr) (buf, strlen(buf), progress, tid, nt); \
-
 #endif // BLI_UTIL_PROGRESS_H
diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c
index 78c4c9198d..e090eb090e 100644
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -307,55 +307,113 @@ void PASTEMAC(ch,varname) \
 }
 
 //INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
-GENTFUNCR( scomplex, float,  c, s, normfv_unb_var1, sumsqv_unb_var1 )
+//GENTFUNCR( scomplex, float,  c, s, normfv_unb_var1, sumsqv_unb_var1 )
+void bli_cnormfv_unb_var1
+    (
+        dim_t    n,
+        scomplex*   x,
+        inc_t incx,
+        float* norm,
+        cntx_t*  cntx,
+        rntm_t*  rntm
+    )
+{
+    arch_t id = bli_arch_query_id();
+    switch (id)
+    {
+        case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN:
+#ifdef BLIS_KERNELS_ZEN
+            bli_scnorm2fv_unb_var1_avx2( n, x, incx, norm, cntx );
+            break;
+#endif
+        default:;
+            float* zero       = bli_s0;
+            float* one        = bli_s1;
+            float  scale;
+            float  sumsq;
+            float  sqrt_sumsq;
+
+            // Initialize scale and sumsq to begin the summation.
+            bli_scopys( *zero, scale );
+            bli_scopys( *one,  sumsq );
+
+            // Compute the sum of the squares of the vector.
+
+            bli_csumsqv_unb_var1
+            (
+                n,
+                x,
+                incx,
+                &scale,
+                &sumsq,
+                cntx,
+                rntm
+            );
+
+            // Compute: norm = scale * sqrt( sumsq )
+            bli_ssqrt2s( sumsq, sqrt_sumsq );
+            bli_sscals( scale, sqrt_sumsq );
+
+            // Store the final value to the output variable.
+            bli_scopys( sqrt_sumsq, *norm );
+    }
+}
 
 void bli_znormfv_unb_var1
-    ( 
-        dim_t    n, 
-        dcomplex*   x, 
-        inc_t incx, 
-        double* norm, 
-        cntx_t*  cntx, 
-        rntm_t*  rntm  
-    ) 
-{ 
-   
-   if ( bli_cpuid_is_avx_supported() == TRUE )
-   {
-        bli_dznorm2fv_unb_var1_avx2( n, x, incx, norm, cntx );
-   }
-   else
-   {
-        double* zero       = bli_d0;
-        double* one        = bli_d1;
-        double  scale; 
-        double  sumsq; 
-        double  sqrt_sumsq; 
-
-        // Initialize scale and sumsq to begin the summation.
-        bli_dcopys( *zero, scale ); 
-        bli_dcopys( *one,  sumsq ); 
-
-        // Compute the sum of the squares of the vector.
-
-        bli_zsumsqv_unb_var1 
-        ( 
-            n,
-            x, 
-            incx,
-            &scale,
-            &sumsq,
-            cntx,
-            rntm
-        );
-
-        // Compute: norm = scale * sqrt( sumsq ) 
-        bli_dsqrt2s( sumsq, sqrt_sumsq ); 
-        bli_dscals( scale, sqrt_sumsq ); 
-
-        // Store the final value to the output variable.
-        bli_dcopys( sqrt_sumsq, *norm );
-   }
+    (
+        dim_t    n,
+        dcomplex*   x,
+        inc_t incx,
+        double* norm,
+        cntx_t*  cntx,
+        rntm_t*  rntm
+    )
+{
+    arch_t id = bli_arch_query_id();
+    switch (id)
+    {
+        case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN:
+#ifdef BLIS_KERNELS_ZEN
+            bli_dznorm2fv_unb_var1_avx2( n, x, incx, norm, cntx );
+            break;
+#endif
+        default:;
+            double* zero       = bli_d0;
+            double* one        = bli_d1;
+            double  scale;
+            double  sumsq;
+            double  sqrt_sumsq;
+
+            // Initialize scale and sumsq to begin the summation.
+            bli_dcopys( *zero, scale );
+            bli_dcopys( *one,  sumsq );
+
+            // Compute the sum of the squares of the vector.
+
+            bli_zsumsqv_unb_var1
+            (
+                n,
+                x,
+                incx,
+                &scale,
+                &sumsq,
+                cntx,
+                rntm
+            );
+
+            // Compute: norm = scale * sqrt( sumsq )
+            bli_dsqrt2s( sumsq, sqrt_sumsq );
+            bli_dscals( scale, sqrt_sumsq );
+
+            // Store the final value to the output variable.
+            bli_dcopys( sqrt_sumsq, *norm );
+    }
 }
 
 #undef  GENTFUNCR
@@ -487,55 +545,113 @@ void PASTEMAC(ch,varname) \
     PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
 }
 #endif
-GENTFUNCR( float,   float,  s, s, normfv_unb_var1, sumsqv_unb_var1 )
-
-void bli_dnormfv_unb_var1
-    ( 
-        dim_t    n, 
-        double*   x, 
-        inc_t incx, 
-        double* norm, 
-        cntx_t*  cntx, 
-        rntm_t*  rntm  
-    ) 
-{ 
-   
-   if( bli_cpuid_is_avx_supported() == TRUE )
-   {
-        bli_dnorm2fv_unb_var1_avx2( n, x, incx, norm, cntx );
-   }
-   else
-   {
-        double* zero       = bli_d0;
-        double* one        = bli_d1;
-        double  scale; 
-        double  sumsq; 
-        double  sqrt_sumsq; 
+//GENTFUNCR( float,   float,  s, s, normfv_unb_var1, sumsqv_unb_var1 )
+
+void bli_snormfv_unb_var1
+    (
+        dim_t    n,
+        float*   x,
+        inc_t incx,
+        float* norm,
+        cntx_t*  cntx,
+        rntm_t*  rntm
+    )
+{
+    // Early return if n=1.
+    if ( n == 1 )
+    {
+        *norm = bli_fabs(*x);
+        return;
+    }
+    /* Disable AVX2 codepath.
+    if( bli_cpuid_is_avx2fma3_supported() == TRUE )
+    {
+        bli_snorm2fv_unb_var1_avx2( n, x, incx, norm, cntx );
+    }
+    else*/
+    {
+        float* zero       = bli_s0;
+        float* one        = bli_s1;
+        float  scale;
+        float  sumsq;
+        float  sqrt_sumsq;
 
         // Initialize scale and sumsq to begin the summation.
-        bli_ddcopys( *zero, scale ); 
-        bli_ddcopys( *one,  sumsq ); 
+        bli_sscopys( *zero, scale );
+        bli_sscopys( *one,  sumsq );
 
         // Compute the sum of the squares of the vector.
-
-        bli_dsumsqv_unb_var1 
-        ( 
-        n,
-        x, 
-        incx,
-        &scale,
-        &sumsq,
-        cntx,
-        rntm
+        bli_ssumsqv_unb_var1
+        (
+            n,
+            x,
+            incx,
+            &scale,
+            &sumsq,
+            cntx,
+            rntm
         );
 
-        // Compute: norm = scale * sqrt( sumsq ) 
-        bli_dsqrt2s( sumsq, sqrt_sumsq ); 
-        bli_dscals( scale, sqrt_sumsq ); 
+        // Compute: norm = scale * sqrt( sumsq )
+        bli_ssqrt2s( sumsq, sqrt_sumsq );
+        bli_sscals( scale, sqrt_sumsq );
 
         // Store the final value to the output variable.
-        bli_dcopys( sqrt_sumsq, *norm );
-   }
+        bli_scopys( sqrt_sumsq, *norm );
+    }
+}
+
+void bli_dnormfv_unb_var1
+    (
+        dim_t    n,
+        double*   x,
+        inc_t incx,
+        double* norm,
+        cntx_t*  cntx,
+        rntm_t*  rntm
+    )
+{
+    arch_t id = bli_arch_query_id();
+    switch (id)
+    {
+        case BLIS_ARCH_ZEN4:
+        case BLIS_ARCH_ZEN3:
+        case BLIS_ARCH_ZEN2:
+        case BLIS_ARCH_ZEN:
+#ifdef BLIS_KERNELS_ZEN
+            bli_dnorm2fv_unb_var1_avx2( n, x, incx, norm, cntx );
+            break;
+#endif
+        default:;
+            double* zero       = bli_d0;
+            double* one        = bli_d1;
+            double  scale;
+            double  sumsq;
+            double  sqrt_sumsq;
+
+            // Initialize scale and sumsq to begin the summation.
+            bli_ddcopys( *zero, scale );
+            bli_ddcopys( *one,  sumsq );
+
+            // Compute the sum of the squares of the vector.
+            bli_dsumsqv_unb_var1 
+            (
+                n,
+                x,
+                incx,
+                &scale,
+                &sumsq,
+                cntx,
+                rntm
+            );
+
+            // Compute: norm = scale * sqrt( sumsq )
+            bli_dsqrt2s( sumsq, sqrt_sumsq );
+            bli_dscals( scale, sqrt_sumsq );
+
+            // Store the final value to the output variable.
+            bli_dcopys( sqrt_sumsq, *norm );
+    }
 }
 
 #undef  GENTFUNCR
@@ -1233,79 +1349,115 @@ void PASTEMAC(ch,varname) \
        rntm_t*  rntm  \
      ) \
 { \
-    const ctype_r zero_r = *PASTEMAC(chr,0); \
-    const ctype_r one_r  = *PASTEMAC(chr,1); \
-\
-    ctype*        chi1; \
-    ctype_r       chi1_r; \
-    ctype_r       chi1_i; \
-    ctype_r       scale_r; \
-    ctype_r       sumsq_r; \
-    ctype_r       abs_chi1_r; \
-    dim_t         i; \
-\
-    /* NOTE: This function attempts to mimic the algorithm for computing
-       the Frobenius norm in netlib LAPACK's ?lassq(). */ \
-\
-    /* Copy scale and sumsq to local variables. */ \
-    PASTEMAC(chr,copys)( *scale, scale_r ); \
-    PASTEMAC(chr,copys)( *sumsq, sumsq_r ); \
-\
-    chi1 = x; \
-\
-    for ( i = 0; i < n; ++i ) \
-    { \
-        /* Get the real and imaginary components of chi1. */ \
-        PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
-\
-        abs_chi1_r = bli_fabs( chi1_r ); \
-\
-        /* Accumulate real component into sumsq, adjusting scale if
-           needed. */ \
-        if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \
-        { \
-            if ( scale_r < abs_chi1_r ) \
-            { \
-                sumsq_r = one_r + \
-                          sumsq_r * ( scale_r / abs_chi1_r ) * \
-                                    ( scale_r / abs_chi1_r );  \
-\
-                PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \
-            } \
-            else \
-            { \
-                sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \
-                                    ( abs_chi1_r / scale_r );  \
-            } \
-        } \
-\
-        abs_chi1_r = bli_fabs( chi1_i ); \
-\
-        /* Accumulate imaginary component into sumsq, adjusting scale if
-           needed. */ \
-        if ( abs_chi1_r > zero_r || bli_isnan( abs_chi1_r) ) \
-        { \
-            if ( scale_r < abs_chi1_r ) \
-            { \
-                sumsq_r = one_r + \
-                          sumsq_r * ( scale_r / abs_chi1_r ) * \
-                                    ( scale_r / abs_chi1_r );  \
-\
-                PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \
-            } \
-            else \
-            { \
-                sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \
-                                    ( abs_chi1_r / scale_r );  \
-            } \
-        } \
-\
-        chi1 += incx; \
-    } \
-\
-    /* Store final values of scale and sumsq to output variables. */ \
-    PASTEMAC(chr,copys)( scale_r, *scale ); \
-    PASTEMAC(chr,copys)( sumsq_r, *sumsq ); \
+	ctype_r zero_r = *PASTEMAC(chr,0); \
+	ctype_r one_r  = *PASTEMAC(chr,1); \
+\
+	ctype*  chi1; \
+	ctype_r chi1_r; \
+	ctype_r chi1_i; \
+	ctype_r scale_r; \
+	ctype_r sumsq_r; \
+	ctype_r abs_chi1_r; \
+	ctype_r abs_chi1_i; \
+	dim_t   i; \
+\
+	/* NOTE: This function attempts to mimic the algorithm for computing
+	   the Frobenius norm in netlib LAPACK's ?lassq(). */ \
+\
+	/* Copy scale and sumsq to local variables. */ \
+	PASTEMAC(chr,copys)( *scale, scale_r ); \
+	PASTEMAC(chr,copys)( *sumsq, sumsq_r ); \
+\
+	chi1 = x; \
+\
+	for ( i = 0; i < n; ++i ) \
+	{ \
+		/* Get the real and imaginary components of chi1. */ \
+		PASTEMAC2(ch,chr,gets)( *chi1, chi1_r, chi1_i ); \
+\
+		abs_chi1_r = bli_fabs( chi1_r ); \
+		abs_chi1_i = bli_fabs( chi1_i ); \
+\
+		if ( bli_isnan( abs_chi1_r ) ) \
+		{ \
+			sumsq_r = abs_chi1_r; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isnan( abs_chi1_i ) ) \
+		{ \
+			sumsq_r = abs_chi1_i; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isnan( sumsq_r ) ) \
+		{ \
+			chi1 += incx; \
+			continue; \
+		} \
+\
+		if ( bli_isinf( abs_chi1_r ) ) \
+		{ \
+			sumsq_r = abs_chi1_r; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isinf( abs_chi1_i ) ) \
+		{ \
+			sumsq_r = abs_chi1_i; \
+			scale_r = one_r; \
+		} \
+\
+		if ( bli_isinf( sumsq_r ) ) \
+		{ \
+			chi1 += incx; \
+			continue; \
+		} \
+\
+		/* Accumulate real component into sumsq, adjusting scale if
+		   needed. */ \
+		if ( abs_chi1_r > zero_r ) \
+		{ \
+			if ( scale_r < abs_chi1_r ) \
+			{ \
+				sumsq_r = one_r + \
+				          sumsq_r * ( scale_r / abs_chi1_r ) * \
+				                    ( scale_r / abs_chi1_r );  \
+\
+				PASTEMAC(chr,copys)( abs_chi1_r, scale_r ); \
+			} \
+			else \
+			{ \
+				sumsq_r = sumsq_r + ( abs_chi1_r / scale_r ) * \
+				                    ( abs_chi1_r / scale_r );  \
+			} \
+		} \
+\
+		/* Accumulate imaginary component into sumsq, adjusting scale if
+		   needed. */ \
+		if ( abs_chi1_i > zero_r ) \
+		{ \
+			if ( scale_r < abs_chi1_i ) \
+			{ \
+				sumsq_r = one_r + \
+				          sumsq_r * ( scale_r / abs_chi1_i ) * \
+				                    ( scale_r / abs_chi1_i );  \
+\
+				PASTEMAC(chr,copys)( abs_chi1_i, scale_r ); \
+			} \
+			else \
+			{ \
+				sumsq_r = sumsq_r + ( abs_chi1_i / scale_r ) * \
+				                    ( abs_chi1_i / scale_r );  \
+			} \
+		} \
+\
+		chi1 += incx; \
+	} \
+\
+	/* Store final values of scale and sumsq to output variables. */ \
+	PASTEMAC(chr,copys)( scale_r, *scale ); \
+	PASTEMAC(chr,copys)( sumsq_r, *sumsq ); \
 }
 
 INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 )
diff --git a/gtestsuite/CMakeLists.txt b/gtestsuite/CMakeLists.txt
new file mode 100644
index 0000000000..9f1b132a7d
--- /dev/null
+++ b/gtestsuite/CMakeLists.txt
@@ -0,0 +1,253 @@
+#[=[
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+]=]
+
+cmake_minimum_required(VERSION 3.14.0)
+set(CMAKE_CXX_COMPILER ${CXX_COMPILER})
+set(CMAKE_CXX_STANDARD 17)
+# NOTE(review): CXX_COMPILER is not defined in this file; presumably passed as -DCXX_COMPILER=... - confirm.
+project(BLIS_GtestSuite)
+# Turn on CTest support for this directory and the subdirectories added below.
+enable_testing()
+
+# BLIS_PATH must be supplied by the user; configuration stops here when it is missing.
+if(NOT BLIS_PATH)
+    message(FATAL_ERROR "Need to provide a BLIS installation path during CMake invocation. Please use \
+    $ cmake .. -DBLIS_PATH=/home/username/blis_installation")
+endif()
+# BLIS headers are taken from the include/blis subdirectory of that installation.
+set(BLIS_INCLUDE "${BLIS_PATH}/include/blis")
+
+# Set OpenMP as the default option
+set(ENABLE_THREADING "openmp" CACHE STRING "Setting OpenMP as the threading library")
+# Set the possible values of threading libraries for cmake-gui
+set_property(CACHE ENABLE_THREADING PROPERTY STRINGS "openmp" "pthreads" "no")
+
+# Set static BLIS as the default library we build against.
+set(BLIS_LINKING_TYPE "static" CACHE STRING "Linking to a static BLIS library")
+# Set the possible values of BLIS linking type for cmake-gui
+set_property(CACHE BLIS_LINKING_TYPE PROPERTY STRINGS "static" "shared")
+
+option(ENABLE_ASAN "Run tests using Address Sanitizer" OFF)
+
+option(ENABLE_COVERAGE "Run tests for Code Coverage" OFF)
+
+# Apple platforms are rejected up front; FATAL_ERROR stops processing here.
+if(APPLE)
+    message(FATAL_ERROR "Build system does not support Apple platform.")
+endif()
+
+# Any UNIX host that reaches this point is not Apple, so flag it as Linux.
+if(UNIX)
+    set(LINUX TRUE)
+endif()
+
+# Use INT_SIZE to set the int type used for testing.
+set(INT_SIZE "32" CACHE STRING "Integer size (in bits) used for testing.")
+# Set the possible values of INT_SIZE for cmake-gui
+set_property(CACHE INT_SIZE PROPERTY STRINGS "32" "64")
+if( NOT ((INT_SIZE STREQUAL "32") OR (INT_SIZE STREQUAL "64")) )
+    message(FATAL_ERROR "INT_SIZE option ${INT_SIZE} is not supported. Must be 32 or 64.")
+endif()
+
+# REF_CBLAS names the library that will be used for reference results (empty by default).
+set(REF_CBLAS "" CACHE STRING "Library used to compute reference results.")
+# Values of REF_CBLAS offered in cmake-gui
+set_property(CACHE REF_CBLAS PROPERTY STRINGS "OpenBLAS" "Netlib" "MKL")
+# Resolve the reference library: an explicit REF_LIB takes precedence, otherwise
+# REF_LIB is derived from REF_CBLAS plus the matching <name>_PATH variable.
+if(REF_LIB)
+    set(REFLIB_PATH ${REF_LIB}/..)
+    find_library(reflib NAMES openblas cblas mkl_intel_lp64 mkl_intel_ilp64 PATHS ${REFLIB_PATH})
+    if(reflib)
+        message(STATUS "Found Reference Library : " ${reflib})
+    else()
+        message(FATAL_ERROR "Reference Library not found : " ${REF_LIB})
+    endif()
+    message( "Setting REF_LIB to ${REF_LIB}")
+else()
+    if(REF_CBLAS STREQUAL "OpenBLAS")
+        if(NOT OPENBLAS_PATH)
+            message(FATAL_ERROR "Need to provide an OpenBLAS installation path \
+            during CMake invokation when OpenBLAS is used for reference results. Please use \
+            $ cmake .. -DOPENBLAS_PATH=/home/username/openblas_installation")
+        endif()
+        find_library(reflib NAMES openblas PATHS ${OPENBLAS_PATH})
+        if(reflib)
+            message(STATUS "Found OpenBLAS Reference Library : " ${reflib})
+        else()
+            message(FATAL_ERROR "OpenBLAS Reference Library not found : " ${OPENBLAS_PATH})
+        endif()
+        set(REF_LIB ${reflib})
+    elseif(REF_CBLAS STREQUAL "Netlib")
+        if(NOT NETLIB_PATH)
+            message(FATAL_ERROR "Need to provide a Netlib installation path \
+            during CMake invokation when Netlib is used for reference results. Please use \
+            $ cmake .. -DNETLIB_PATH=/home/username/netlib_installation")
+        endif()
+        if(NOT INT_SIZE STREQUAL "32")
+            find_library(netlib NAMES cblas64 PATHS ${NETLIB_PATH})
+        else()
+            find_library(netlib NAMES cblas PATHS ${NETLIB_PATH})
+        endif()
+        if(netlib)
+            message(STATUS "Found Netlib Reference Library : "  ${netlib})
+        else()
+            message(FATAL_ERROR "Netlib Reference Library not found : "  ${NETLIB_PATH})
+        endif()
+        set(REF_LIB ${netlib})
+    elseif(REF_CBLAS STREQUAL "MKL")
+        set(MKL_PATH $ENV{MKLROOT}/lib/intel64
+                CACHE STRING "The path to MKL.")
+        if(NOT INT_SIZE STREQUAL "32")
+            find_library(mkllib NAMES mkl_intel_ilp64 PATHS ${MKL_PATH})
+        else()
+            find_library(mkllib NAMES mkl_intel_lp64 PATHS ${MKL_PATH})
+        endif()
+        if(mkllib)
+            message(STATUS "Found MKL Reference Library  : " ${mkllib})
+        else()
+            message(FATAL_ERROR "MKL Reference Library not found : " ${MKL_PATH})
+        endif()
+        set(REF_LIB ${mkllib})
+        find_library(mklcore NAMES mkl_core PATHS ${MKL_PATH})
+        if(mklcore)
+            message(STATUS "Found MKL_CORE Library       : " ${mklcore})
+        else()
+            message(FATAL_ERROR "MKL_CORE Library not found : " ${MKL_PATH})
+        endif()
+        set(MKL_CORE_PATH ${mklcore})
+        find_library(mklthread NAMES mkl_gnu_thread PATHS ${MKL_PATH})
+        if(mklthread)
+            message(STATUS "Found MKL_GNU_THREAD Library : " ${mklthread})
+        else()
+            message(FATAL_ERROR "MKL_GNU_THREAD Library not found : " ${MKL_PATH})
+        endif()
+        set(MKL_GNU_THREAD_PATH ${mklthread})
+    else()
+        message(FATAL_ERROR "Need to set up a reference library. Please use on of the following options \
+                during CMake invokation: -DREF_CBLAS=Netlib or -DREF_CBLAS=OpenBLAS or -DREF_CBLAS=MKL")
+    endif()
+endif()
+
+# TEST_INTERFACE selects which of the interfaces supported by BLIS the tests exercise.
+set(TEST_INTERFACE "BLAS" CACHE STRING "Interface that is being tested.")
+# Interface choices offered in cmake-gui
+set_property(CACHE TEST_INTERFACE PROPERTY STRINGS "BLAS" "CBLAS" "BLIS_TYPED")
+if( (NOT TEST_INTERFACE STREQUAL "BLAS") AND (NOT TEST_INTERFACE STREQUAL "CBLAS") AND (NOT TEST_INTERFACE STREQUAL "BLIS_TYPED") )
+    message(FATAL_ERROR "TEST_INTERFACE option ${TEST_INTERFACE} is not supported. Please use on of the following options \
+            during CMake invokation: -DTEST_INTERFACE=BLAS or -DTEST_INTERFACE=CBLAS or -DTEST_INTERFACE=BLIS_TYPED")
+endif()
+
+# ELEMENT_TYPE selects whether tested matrix/vector elements are floating point ("f") or integer ("i") values.
+set(ELEMENT_TYPE "f" CACHE STRING "Type of elements of matrices/vectors")
+# Element type choices offered in cmake-gui
+set_property(CACHE ELEMENT_TYPE PROPERTY STRINGS "f" "i")
+if( (NOT ELEMENT_TYPE STREQUAL "f") AND (NOT ELEMENT_TYPE STREQUAL "i") )
+    message(FATAL_ERROR "ELEMENT_TYPE option ${ELEMENT_TYPE} is not supported. Please use on of the following options \
+            during CMake invokation: -DELEMENT_TYPE=f or -DELEMENT_TYPE=i")
+endif()
+
+# Set common libraries.
+set(COMMON_LIBS pthread m dl)
+
+# Set compiler options and BLIS library for Linux.
+if(LINUX)
+    # Compile options applied to every target defined in this directory and below.
+    add_compile_options(-g -Wall -Wno-unused-function -Wfatal-errors -fPIC )
+
+    if(ENABLE_ASAN)
+       add_compile_options(-fsanitize=address)
+       add_definitions(-DENABLE_ASAN)
+    endif()
+    # Append the coverage flags instead of overwriting, so user-supplied CMAKE_CXX_FLAGS are kept.
+    if(ENABLE_COVERAGE)
+        string(APPEND CMAKE_CXX_FLAGS " -O0 --coverage")
+    endif()
+
+    # Set GNU OpenMP library as the default option
+    set(OpenMP_LIBRARY "GNU" CACHE STRING "Using GNU OpenMP library")
+    # Set the possible values of OpenMP runtimes
+    set_property(CACHE OpenMP_LIBRARY PROPERTY STRINGS "GNU" "Intel")
+
+    if(ENABLE_THREADING STREQUAL "no")
+        if(BLIS_LINKING_TYPE STREQUAL "static")
+            set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.a" CACHE STRING "blis library path")
+        else()
+            set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis.so" CACHE STRING "blis library path")
+        endif()
+        find_library(libblis NAMES blis PATHS ${BLIS_PATH}/lib)
+    else()
+        if(BLIS_LINKING_TYPE STREQUAL "static")
+            set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.a" CACHE STRING "blis library path")
+        else()
+            set(Blis_LIBRARY "${BLIS_PATH}/lib/libblis-mt.so" CACHE STRING "blis library path")
+        endif()
+        find_library(libblis NAMES blis-mt PATHS ${BLIS_PATH}/lib)
+    endif()
+    if(NOT libblis)
+        message(FATAL_ERROR "Blis Library not found : " ${BLIS_PATH})
+    else()
+        message(STATUS "Found BLIS Library : " ${Blis_LIBRARY})
+    endif()
+endif()
+
+# Pick the BLIS library file for Windows from the threading and linking choices.
+if(WIN32)
+    # The base file name depends only on whether threading is disabled.
+    if(ENABLE_THREADING STREQUAL "no")
+        set(blis_win_stem "AOCL-LibBlis-Win")
+    else()
+        set(blis_win_stem "AOCL-LibBlis-Win-MT")
+    endif()
+    # Static linking uses the .a file; any other linking type uses the -dll.lib file.
+    if(BLIS_LINKING_TYPE STREQUAL "static")
+        set(Blis_LIBRARY "${BLIS_PATH}/bin/${blis_win_stem}.a" CACHE STRING "blis library path")
+    else()
+        set(Blis_LIBRARY "${BLIS_PATH}/bin/${blis_win_stem}-dll.lib" CACHE STRING "blis library path")
+    endif()
+    # Remove the temporary so no extra variable is visible to the subdirectories.
+    unset(blis_win_stem)
+endif()
+
+add_subdirectory(testinghelpers)
+add_subdirectory(testsuite)
+# distclean: run the build tool's clean step, then delete generated files from the build directory with rm.
+add_custom_target(distclean
+    COMMAND ${CMAKE_MAKE_PROGRAM} clean
+    COMMAND rm ${CMAKE_BINARY_DIR}/*.txt
+    COMMAND rm ${CMAKE_BINARY_DIR}/*.cmake
+    COMMAND rm ${CMAKE_BINARY_DIR}/Makefile
+    COMMAND rm -rf ${CMAKE_BINARY_DIR}/CMakeFiles
+    COMMAND rm -rf ${CMAKE_BINARY_DIR}/bin
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+    COMMENT "Remove cmake_generated files and executables"
+)
diff --git a/gtestsuite/README.md b/gtestsuite/README.md
new file mode 100644
index 0000000000..0cb220fbfe
--- /dev/null
+++ b/gtestsuite/README.md
@@ -0,0 +1,320 @@
+# Dependencies
+* GoogleTest is used as the tool for testing. The library is fetched and built at configuration time. No installation necessary.
+* An installation location for BLIS needs to be passed in as an argument during cmake invocation.
+* A path to a reference library needs to be passed during cmake invocation. Currently, MKL, OpenBLAS and Netlib are supported.
+
+# Layout
+The repo is organized as follows
+* testinghelpers
+    * inc
+        * common
+        * level1
+        * level2
+        * level3
+        * util
+    * src
+        * common
+        * level1
+        * level2
+        * level3
+        * util
+* testsuite
+    * inc
+    * level1
+        * addv
+        * axpyv
+    * level2
+    * level3
+    * utils
+
+where the code architecture is separated into two parts.
+
+First, from testinghelpers directory a library testinghelpers.a is created. This library holds helper functionality for the tests and functions to compute reference results using the CBLAS interface from the library that was provided as a reference during the CMake invocation.
+
+Testsuite includes headers from testinghelpers/inc and links testinghelpers.a.
+* `inc` directory has the comparison functionality to determine SUCCESS or FAILURE of tests.
+* The subdirectories are named after the BLAS level or util (as described in BLIS documentation) and each level holds directories depending on the functionality. Then each of those, for example axpyv, consists of cpp files. This is the main directory where developers add their unit tests - on the corresponding functionality directory. There is one executable per functionality directory. For example, the executable with all tests for axpy, is named as testsuite/level1/axpy. There is the option to build and/or run the tests only for a specific API, or a specific level.
+
+# Basic CMake Configuration
+First create a `build` directory using
+```console
+$ mkdir build
+$ cd build
+```
+
+## Configure BLIS GTestSuite with OpenBLAS as reference
+```console
+$ cmake .. -DBLIS_PATH=/path_to_blis_installation -DREF_CBLAS=OpenBLAS -DOPENBLAS_PATH=/path_to_openblas_lib
+```
+## Configure BLIS GTestSuite with Netlib as reference
+```console
+$ cmake .. -DBLIS_PATH=/path_to_blis_installation -DREF_CBLAS=Netlib -DNETLIB_PATH=/path_to_netlib_lib
+```
+## Configure BLIS GTestSuite with MKL as reference
+```console
+$ cmake .. -DBLIS_PATH=/path_to_blis_installation -DREF_CBLAS=MKL [-DMKL_PATH=/path_to_mkl_lib]
+```
+MKL_PATH is an optional argument. The default is `$ENV{MKLROOT}/lib/intel64`.
+
+## Configure BLIS GTestSuite with any dynamic BLAS as reference
+```console
+$ cmake .. -DBLIS_PATH=/path_to_blis_installation -DREF_LIB=/path_to_blas_lib/anyblas.so
+```
+
+# Additional CMake Configuration Options
+There are multiple configuration options to choose from when invoking CMake. Those can be used in addition to the basic configuration above.
+## Compiler Options
+* `-DCMAKE_CXX_COMPILER=path_to_preferred_compiler` can be used to specify the compiler.
+* For example, to compile with Clang, use `-DCMAKE_CXX_COMPILER=clang++`.
+## Threading Options (Linux Only)
+* For single threaded BLIS, use `-DENABLE_THREADING=no`.
+* For multithreaded BLIS that uses pthreads, use `-DENABLE_THREADING=pthreads`.
+* For multithreaded BLIS that uses OpenMP, use `-DENABLE_THREADING=openmp`. [**Default**]
+    * In addition, to use Intel OpenMP runtime, use `-DOpenMP_LIBRARY=Intel`.
+    * For GNU OpenMP runtime, use `-DOpenMP_LIBRARY=GNU`. [**Default**]
+## BLIS Linking Type (Linux Only)
+* To link static BLIS, use `-DBLIS_LINKING_TYPE=static`. [**Default**]
+* To link shared BLIS, use `-DBLIS_LINKING_TYPE=shared`.
+## Integer Size
+* For testing a 32-bit integer BLIS library, use `-DINT_SIZE=32`. [**Default**]
+* For testing a 64-bit integer BLIS library, use `-DINT_SIZE=64`.
+## Address Sanitizer
+* To build using address sanitizer, configure using `-DENABLE_ASAN=ON`. [**OFF by default**]
+* An installation of BLIS which was built with ASAN flags[CFLAGS="-O0 -g -fsanitize=address"] needs to be provided.
+## Code Coverage[Only GCC Compiler]
+* BLIS : Configure BLIS Library with code coverage flags[CFLAGS="-O0 -fprofile-arcs -ftest-coverage"], compile and install.
+* Gtestsuite : To build for code coverage, configure cmake with `-DENABLE_COVERAGE=ON`. [**OFF by default**] and then compile and run the executable.
+* CodeCoverage : in gtestsuite folder, run the below mentioned steps or bash script - to generate html LCOV-code coverage report.
+                 Run the bash script : bash codecov.sh <blis_obj_path> <out_dir_name>
+                                      or
+                 Steps to generate html LCOV-code coverage report.
+                 1. lcov --capture --directory <obj_path> --output-file <out_dir>.info
+                 2. lcov --remove <out_dir>.info -o <out_dir>_filtered.info '/usr/*' '/*/_deps/*'
+                 3. genhtml <out_dir>_filtered.info --output-directory <out_dir>
+                 4. In <out_dir>, open index.html file
+## BLIS Library Interface to be Tested
+* To build the testsuite using BLAS interface, configure using `-DTEST_INTERFACE=BLAS`. [**Default**]
+* To build the testsuite using CBLAS interface, configure using `-DTEST_INTERFACE=CBLAS`.
+* To build the testsuite using BLIS-typed interface, configure using `-DTEST_INTERFACE=BLIS_TYPED`. Note that more tests are built for this option, due to the extended APIs.
+
+# Building the Tests
+After the successful configuration of CMake, we can build the tests. The following steps are taken by the building process:
+1. Building testinghelpers.a.
+2. Getting and building GoogleTest libraries.
+3. Building the tests in testsuite.
+The code is modular and we can build and run all of the executables at once or only specific parts of the testsuite. The targets are organized recursively, so that only tests related to specific functionality can be built and run.
+### To build everything use
+```console
+$ make -j
+```
+### To build all tests for a specific level use:
+```console
+$ make -j testsuite.level1
+```
+### To build all tests for a specific API use:
+```console
+$ make -j testsuite.level1.axpyv
+```
+## To build only the testing library use:
+```console
+$ make -j testinghelpers
+```
+This can be helpful if you are looking to understand how things are set up in testinghelpers.a.
+
+# Running Tests
+## Using CTest
+CTest is a test driver program; an executable that comes with CMake and handles running the tests for the project. CTest views each executable as a test. In reality, each executable has many GoogleTest tests implemented and if any of those fails, CTest will give a failure as well. To get a detailed report from CTest, please look into the log files in build/Testing/Temporary directory, which is created automatically.
+
+### To run all tests use:
+```console
+$ ctest
+```
+### To run all tests in parallel use:
+```console
+$ ctest -j3
+```
+The above command will run all tests using 3 threads.
+### To run tests of a specific level use:
+```console
+$ ctest -R level1
+```
+The above command will run only the level1 tests.
+### To run tests of a specific API use:
+```console
+$ ctest -R gemm
+```
+The above command will run only the gemm tests.
+## Other CTest options
+There are several other options that can be used when running CTest.
+One good example is using --test-load which is particularly helpful when the code being tested is parallel and the option of running tests in parallel (e.g., `-j12`) has been used as well.
+To see what is available use:
+```console
+ctest --help
+```
+You can also find more details in [CMake Documentation](https://cmake.org/cmake/help/latest/manual/ctest.1.html).
+
+## Using the Executables
+As we mentioned earlier, all cpp files of each API directory are compiled into one executable. This executable can be run separately which can be very useful while developing or debugging.
+### To run all addv tests use:
+```console
+$ ./testsuite.level1.addv
+```
+### To run a more specific tests, say the snrm2 tests of nrm2, use:
+```console
+$ ./testsuite.util.nrm2 --gtest_filter="*snrm2*"
+```
+## Running tests using Valgrind
+We can run any executable using valgrind as usual. For example, use the following command
+```console
+$ OMP_NUM_THREADS=1 valgrind ./testsuite.level3.gemm
+```
+
+## Clean cmake generated files
+```console
+$ make distclean
+```
+
+## Other GoogleTest options
+A list of useful options:
+### Test Selection
+--gtest_list_tests
+      List the names of all tests instead of running them. The name of
+      TEST(Foo, Bar) is "Foo.Bar".
+--gtest_filter=POSITIVE_PATTERNS[-NEGATIVE_PATTERNS]
+      Run only the tests whose name matches one of the positive patterns but
+      none of the negative patterns. '?' matches any single character; '*'
+      matches any substring; ':' separates two patterns.
+
+### Test Execution
+--gtest_repeat=[COUNT]
+      Run the tests repeatedly; use a negative count to repeat forever.
+
+### Test Output
+--gtest_brief=1
+      Only print test failures.
+--gtest_output=(json|xml)[:DIRECTORY_PATH/|:FILE_PATH]
+      Generate a JSON or XML report in the given directory or with the given
+      file name. FILE_PATH defaults to test_detail.xml.
+### Assertion Behavior
+--gtest_break_on_failure
+      Turn assertion failures into debugger break-points.
+--gtest_throw_on_failure
+      Turn assertion failures into C++ exceptions for use by an external
+      test framework.
+
+There are several other options that can be used when running an executable which has GoogleTests implemented. To see what is available use:
+```console
+./testsuite.util.nrm2 --help
+```
+
+# How to Add New Tests
+There are two ways to add new tests.
+### Modify an existing cpp file
+* Add any of the GoogleTest testing API, e.g., `TEST()` in any of the existing cpp files.
+* Rebuild.
+* Rerun.
+
+### Add a new cpp file
+* Add a cpp file which has any of the GoogleTest testing API, e.g., `TYPED_TEST()` calls in it.
+* Reconfigure cmake as mentioned above.
+* Rebuild.
+* Rerun.
+
+# Wrong Input Testing
+When testing for wrong input parameter values, then the code is meant to return early and the data of vectors and/or matrices should not be accessed. Therefore, the values of the elements of vectors and/or matrices are not important and thus we follow the methodology of typed-testing. Since there are no error codes returned by the APIs, we write the tests doing the following:
+* check return value to be as expected. For example, in nrm2, the default returned value is 0. For gemm, C should not be modified so we check against a copy of C, before calling into gemm. Note that for APIs where there is pure output (e.g., norm from nrm2), since the default initialization of scalars is zero, it's a better practice to initialize the output to a nonzero value prior calling the API. That way we can ensure that the returned value was modified correctly by the function and it's not using the default value.
+* since the checks are expected to be done before any computation, use nullptr as inputs for the other F.P. data. For example, in nrm2, pass nullptr as x and in gemm, pass nullptr as A and B. If those get accessed, then the code would crash so that would show bugs.
+
+Currently, we have the following behaviour in the different interfaces:
+* BLIS-typed prints and aborts.
+* BLAS prints and returns.
+* CBLAS prints and exits.
+For that reason, we currently test only for BLAS APIs, so ensure to add the #ifdef's as appropriate. Note that printing seems to be inconsistent.
+
+A test program would be looking like the following:
+```cpp
+#include <gtest/gtest.h>
+#include "common/testing_helpers.h"
+#include "gemm.h"
+#include "inc/check_error.h"
+#include "common/wrong_inputs_helpers.h"
+
+/**
+ * Testing invalid/incorrect input parameters.
+ *
+ * storage : 'c', 'r', note BLAS is 'c' only.
+ * transa, transb : 'n', 't', 'c'
+ * m, n, k >= 0
+ * lda, ldb, ldc >= max(m/n/k, 1)
+*/
+template <typename T>
+class gemm_IIT : public ::testing::Test {};
+typedef ::testing::Types<float, double, scomplex, dcomplex> TypeParam;
+TYPED_TEST_SUITE(gemm_IIT, TypeParam);
+
+// Adding namespace to get default parameters from testinghelpers/common/wrong_input_helpers.h.
+using namespace testinghelpers::IIT;
+
+#ifdef TEST_BLAS
+TYPED_TEST(gemm_IIT, wrong_transa)
+{
+  using T = TypeParam;
+  // Create a vector with some default value.
+  std::vector<T> c(M*N, T{2.0});
+  // Copy so that we check that the elements of C are not modified.
+  std::vector<T> c_ref(c);
+  // Call BLIS gemm with a wrong value for transa.
+  gemm<T>( STORAGE, 'k', TRANS, M, N, K, nullptr, nullptr, LDA,
+                              nullptr, LDB, nullptr, c.data(), LDC );
+  // Use bitwise comparison (no threshold).
+  computediff<T>( STORAGE, M, N, c.data(), c_ref.data(), LDC);
+}
+#endif
+```
+
+# A short explanation on how lda increments for matrices work
+Say we have an m-by-n matrix A, which is stored in column major order. Then to access the elements we go through the matrix column by column. In this case, to store the full matrix in an array we need to store m * n elements and to access an element A(i,j), we need to know the size of the column, which is in this case m. Now let's assume that the matrix A is part of a bigger k-by-n matrix B. In this case, A is part of an array with k * n and to access an element A(i,j), we need to know the size of the column of B, which is in this case k. The leading dimension shows to us how many elements we need to go through, to be able to access the next column of a matrix. So, the requirement is to have lda >= max(1, m).
+
+     __________________                         __________________
+A = | |  |             |                  B =  | |  |             |
+    | |  |             |                       | |  |             |
+  m | |  |             |                    k  | |  |             |
+    | |  V             |                       | |  V             |
+    | |                |                       | |                |
+    | |  a             |                       | |  a             |
+    |_V________________|                       |_|________________|
+            n                                  | |      n         |
+                                               | |                |
+                                               | |                |
+                                               |_V________________|
+
+For an m-by-n matrix A, stored in row major order, we traverse the matrix row by row. We need to store full matrix A into an array of size m * n (as before) but now we use the number of columns to move to the next row as we iterate through the elements, which is n. Similarly to above, if A is part of a bigger matrix B of size m-by-l, in order to access an element A(i,j), we need to know the number of columns, l. In this case, the leading dimension shows to us how many elements we need to go through while traversing the matrix to access the next row of A. So, the requirement in this case is lda >= max(1, n).
+     __________________                         _________________________
+A = |                  |                  B =  |                  |      |
+    |                  |                       |                  |      |
+  m |                  |                    m  |                  |      |
+    |                  |                       |                  |      |
+    | -------------->  |                       |  ---------------------> |
+    | ---> a           |                       |  ---> a          |      |
+    |__________________|                       |__________________|______|
+            n                                               l
+
+Since in generic testing we generate tests for matrices with arbitrary sizes m and n and we need to check for column-major and row-major order in a generic way, using a fixed value for lda is not trivial. Consider the case where m and n take values from ::testing::Values(30, 40), and storage takes values from ::testing::Values('c','r').
+Then, we generate the following test combinations:
+1. m = 30, n = 30, storage = 'c'
+2. m = 30, n = 30, storage = 'r'
+3. m = 40, n = 40, storage = 'c'
+4. m = 40, n = 40, storage = 'r'
+5. m = 30, n = 40, storage = 'c'
+6. m = 30, n = 40, storage = 'r'
+7. m = 40, n = 30, storage = 'c'
+8. m = 40, n = 30, storage = 'r'
+
+If we want to test for different lda combinations as well, especially for the case where m != n, this would cause a problem as follows:
+If lda = 30, for the cases 3, 4, 7, 8 above, lda < max(1, m), so the requirement is not satisfied. Another issue is that lda depends on the storage type and on whether we test for non transpose, transpose or conjugate transpose matrices.
+
+To overcome this issue and generate tests which fulfill the requirements for the correct value of the leading dimension of a matrix we use **lda increments** and do the lda computation as follows:
+* Depending on the parameters storage and trans, compute lda = max(1, k), where k is m or n, depending on the requirements.
+* Add the lda_inc parameter: lda += lda_inc
+
+To test an m-by-n matrix A (column-major), stored in an array a, use lda_inc = 0 as a parameter to the test generator. To test for the case where A is a submatrix of k-by-n matrix B, use lda_inc = k-m.
diff --git a/gtestsuite/codecov.sh b/gtestsuite/codecov.sh
new file mode 100755
index 0000000000..da8cff3022
--- /dev/null
+++ b/gtestsuite/codecov.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+echo "Code Coverage for BLIS"
+echo "obj_dir_path : $1"
+echo "out_dir_name : $2"
+
+# $1 : obj_dir_path - directory handed to lcov --directory
+# $2 : out_dir_name - base name of the .info files and of the genhtml output directory
+# Arguments are quoted so that paths containing spaces or glob characters are passed intact.
+lcov --capture --directory "$1" --output-file "${2}.info"
+lcov --remove "${2}.info" -o "${2}_filtered.info" '/usr/*' '/*/_deps/*'
+genhtml "${2}_filtered.info" --output-directory "$2"
diff --git a/gtestsuite/testinghelpers/CMakeLists.txt b/gtestsuite/testinghelpers/CMakeLists.txt
new file mode 100644
index 0000000000..ab120e52da
--- /dev/null
+++ b/gtestsuite/testinghelpers/CMakeLists.txt
@@ -0,0 +1,56 @@
+#[=[
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+]=]
+
+file(GLOB_RECURSE SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} CONFIGURE_DEPENDS "src/*/*.cpp")
+add_library(testinghelpers STATIC ${SOURCES})
+target_compile_definitions(testinghelpers PUBLIC REFERENCE_BLAS="${REF_LIB}")
+if(REF_CBLAS STREQUAL "MKL")
+    target_compile_definitions(testinghelpers PUBLIC MKL_CORE="${MKL_CORE_PATH}" MKL_GNU_THREAD="${MKL_GNU_THREAD_PATH}" REF_IS_MKL)
+elseif(REF_CBLAS STREQUAL "Netlib")
+    target_compile_definitions(testinghelpers PUBLIC REF_IS_NETLIB)
+elseif(REF_CBLAS STREQUAL "OpenBLAS")
+    target_compile_definitions(testinghelpers PUBLIC REF_IS_OPENBLAS)
+endif()
+if(TEST_INTERFACE STREQUAL "BLAS")
+    target_compile_definitions(testinghelpers PUBLIC TEST_BLAS)
+elseif(TEST_INTERFACE STREQUAL "CBLAS")
+    target_compile_definitions(testinghelpers PUBLIC TEST_CBLAS)
+else() # BLIS_TYPED option
+    target_compile_definitions(testinghelpers PUBLIC TEST_BLIS_TYPED)
+endif()
+if(INT_SIZE STREQUAL "32")
+    target_compile_definitions(testinghelpers PUBLIC INT_SIZE=32)
+else()
+    target_compile_definitions(testinghelpers PUBLIC INT_SIZE=64)
+endif()
+target_include_directories(testinghelpers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc ${BLIS_INCLUDE})
+target_link_libraries(testinghelpers PUBLIC pthread)
diff --git a/gtestsuite/testinghelpers/inc/common/complex_helpers.h b/gtestsuite/testinghelpers/inc/common/complex_helpers.h
new file mode 100644
index 0000000000..588144f7f5
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/complex_helpers.h
@@ -0,0 +1,82 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include <iostream>
+#include "common/type_info.h"
+
+namespace std {
+    // Declares abs overloads for scomplex and dcomplex (taken by value, returning float and double). NOTE(review): the C++ standard does not permit adding overloads to namespace std - confirm this is intentional.
+    float abs(const scomplex x);
+    double abs(const dcomplex x);
+    // Declares stream insertion operators so that scomplex and dcomplex values can be printed, e.g. in error messages.
+    ostream& operator<<(ostream& os, const scomplex& x);
+    ostream& operator<<(ostream& os, const dcomplex& x);
+}
+
+// Arithmetic (+, -, *) and comparison (==, !=) operators for scomplex and dcomplex; declarations only, operands are taken by value.
+scomplex operator+(const scomplex x, const scomplex y);
+dcomplex operator+(const dcomplex x, const dcomplex y);
+
+scomplex operator-(const scomplex x, const scomplex y);
+dcomplex operator-(const dcomplex x, const dcomplex y);
+
+scomplex operator*(const scomplex x, const scomplex y);
+dcomplex operator*(const dcomplex x, const dcomplex y);
+
+bool operator== (const scomplex x, const scomplex y);
+bool operator== (const dcomplex x, const dcomplex y);
+
+bool operator!= (const scomplex x, const scomplex y);
+bool operator!= (const dcomplex x, const dcomplex y);
+
+// std::to_string is an ordinary function rather than a template, so it
+// cannot be specialised for our custom types inside namespace std;
+// the generic to_string therefore lives in the testinghelpers namespace.
+namespace testinghelpers {
+template<typename T>
+std::string to_string(const T& x) {
+    if constexpr (!testinghelpers::type_info<T>::is_complex)
+    {
+        return std::to_string(x); // non-complex types: defer to std::to_string
+    }
+    else
+    {
+        const std::string re_part = std::to_string(x.real);
+        const std::string im_part = std::to_string(x.imag);
+        return "(" + re_part + ", " + im_part + ")"; // complex types print as "(real, imag)"
+    }
+}
+} // end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/common/data_generators.h b/gtestsuite/testinghelpers/inc/common/data_generators.h
new file mode 100644
index 0000000000..01bae20650
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/data_generators.h
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include <random>
+#include "common/type_info.h"
+
+namespace testinghelpers {
+namespace datagenerators {
+
+/***************************************************
+ *              Random Generators
+****************************************************/
+/**
+ * @brief Returns a random int/float converted to an fp type (float, double, scomplex, dcomplex)
+ *        that lies in the range [from, to].
+ *
+ * @param[in, out] alpha pointer to the scalar that holds the random fp value
+ */
+template<typename T>
+void randomgenerators(int from, int to, T* alpha, char fp);
+
+/**
+ * @brief Returns a random vector (float, double, scomplex, dcomplex)
+ *        with elements that are integers or floats, depending on fp, and follow a uniform distribution in the range [from, to].
+ * @param[in] n length of vector x
+ * @param[in] incx increments of vector x
+ * @param[in, out] x the random fp vector
+ * @param[in] fp if fp=='i' the elements will have random integer values.
+ *               if fp=='f' the elements will have random float values.
+ */
+template<typename T>
+void randomgenerators(int from, int to, gtint_t n, gtint_t incx, T* x, char fp);
+
+template<typename T>
+void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a, gtint_t lda, char fp); // matrix overload; NOTE(review): presumably fills the m-by-n matrix a like the vector version - confirm
+
+template<typename T>
+void randomgenerators(int from, int to, char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda, char fp); // as above, with an additional transa argument
+
+template<typename T>
+void randomgenerators(int from, int to, char storage, char uplo, gtint_t m,
+                    T* a, gtint_t lda, char datatype); // overload taking uplo and a single dimension m
+} //end of namespace datagenerators
+
+template<typename T>
+std::vector<T> get_random_matrix(int from, int to, char storage, char trans, gtint_t m, gtint_t n,
+                    gtint_t lda, char datatype); // NOTE(review): presumably returns the buffer filled by datagenerators::randomgenerators - confirm
+
+template<typename T>
+std::vector<T> get_random_matrix(int from, int to, char storage, char uplo, gtint_t k,
+                    gtint_t lda, char datatype); // overload taking uplo and a single dimension k
+
+template<typename T>
+std::vector<T> get_random_vector(int from, int to, gtint_t n, gtint_t incx, char datatype);
+
+template<typename T>
+std::vector<T> get_vector( gtint_t n, gtint_t incx, T value ); // NOTE(review): presumably a length-n vector with increment incx set to value - confirm
+
+template<typename T>
+std::vector<T> get_matrix( char storage, char trans, gtint_t m, gtint_t n, gtint_t lda, T value ); // NOTE(review): presumably a matrix with every element set to value - confirm
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/common/error_helpers.h b/gtestsuite/testinghelpers/inc/common/error_helpers.h
new file mode 100644
index 0000000000..c61714d707
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/error_helpers.h
@@ -0,0 +1,101 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include <limits>
+#include "common/type_info.h"
+
+namespace testinghelpers {
+
+/**
+ * @brief Machine epsilon of the real type underlying T, returned as a double.
+ *        For scomplex and dcomplex this is the machine epsilon of float
+ *        and double, respectively.
+ *        The threshold that decides SUCCESS or FAILURE of a test is
+ *        based on this value.
+ */
+template<typename T>
+double getEpsilon()
+{
+    // real_type is T itself unless T is scomplex (float) or dcomplex (double).
+    using real_t = typename testinghelpers::type_info<T>::real_type;
+    return static_cast<double>( std::numeric_limits<real_t>::epsilon() );
+}
+
+/**
+ * @brief Returns the relative error |exact - approx| / |exact|. It is used in
+ *        most cases because it accounts for the magnitude of the values.
+ *        It is unsuitable when exact is (approximately) zero: dividing by
+ *        zero produces inf/NaN.
+ *        For example, exact=0 and approx=0 makes getRelativeError() return NaN.
+ */
+template<typename T>
+double getRelativeError(T exact, T approx)
+{
+    const auto diff_norm  = std::abs(exact - approx);
+    const auto exact_norm = std::abs(exact);
+    return diff_norm / exact_norm;
+}
+
+/**
+ * @brief Returns the absolute error |exact - approx|. It is meant for
+ *        comparisons of very small values, where the relative error
+ *        cannot be used; with exact=0 and approx=0 the result is 0.
+ *
+ *        The magnitude of the values is not taken into account, so for
+ *        large values the result can look misleadingly big.
+ *        For example, exact=598320.943 and approx=598320.9431 differ by
+ *        about 0.0001 in absolute terms but only by about 2e-10 relatively.
+ */
+template<typename T>
+double getAbsoluteError(T exact, T approx)
+{
+    const auto deviation = exact - approx;
+    const double abs_err = std::abs(deviation);
+    return abs_err;
+}
+
+template<typename T>
+double getError(T exact, T approx)
+{
+    // Relative error when |exact| > 1, absolute error otherwise.
+    const bool use_relative = ( std::abs(exact) > 1 );
+    return use_relative ? getRelativeError(exact, approx)
+                        : getAbsoluteError(exact, approx);
+}
+
+
+} // end of testinghelpers namespace
diff --git a/gtestsuite/testinghelpers/inc/common/refCBLAS.h b/gtestsuite/testinghelpers/inc/common/refCBLAS.h
new file mode 100644
index 0000000000..fbdfe76e6d
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/refCBLAS.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include <dlfcn.h>
+#include <stdexcept>
+
+namespace testinghelpers {
+class refCBLAS // holds the opaque (void*) module handle(s) of the reference CBLAS library
+{
+  private:
+#ifdef REF_IS_MKL
+    void *MKLCoreModule = nullptr;      // only present when REF_IS_MKL is defined
+    void *MKLGNUThreadModule = nullptr; // only present when REF_IS_MKL is defined
+#endif
+    void *refCBLASModule = nullptr;
+  public:
+    refCBLAS();
+    ~refCBLAS();
+    void* get(); // NOTE(review): presumably returns refCBLASModule - confirm in the implementation
+};
+} //end of testinghelpers namespace
+extern thread_local testinghelpers::refCBLAS refCBLASModule; // one instance per thread, defined in another translation unit
diff --git a/gtestsuite/testinghelpers/inc/common/testing_basics.h b/gtestsuite/testinghelpers/inc/common/testing_basics.h
new file mode 100644
index 0000000000..3176562a72
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/testing_basics.h
@@ -0,0 +1,278 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include <stdio.h>
+#include <iostream>
+#include "cblas.h"
+#include "common/type_info.h"
+
+namespace testinghelpers {
+
+void char_to_blis_trans( char trans, trans_t* blis_trans ); // char flag -> BLIS enum type; result presumably written through the pointer (confirm in the implementation)
+void char_to_blis_conj( char conj, conj_t* blis_conj );
+void char_to_blis_side( char side, side_t* blis_side );
+void char_to_blis_uplo( char uplo, uplo_t* blis_uplo );
+void char_to_blis_diag( char diag, diag_t* blis_diag );
+
+void char_to_cblas_order( char order, CBLAS_ORDER* cblas_order ); // same pattern, targeting the CBLAS enum types
+void char_to_cblas_trans( char trans, CBLAS_TRANSPOSE* cblas_trans );
+void char_to_cblas_uplo( char uplo, CBLAS_UPLO* cblas_uplo );
+void char_to_cblas_diag( char diag, CBLAS_DIAG* cblas_diag );
+void char_to_cblas_side( char side, CBLAS_SIDE* cblas_side );
+
+/**
+ * @brief Returns the size of a buffer which has strides.
+ *
+ * @param n length of vector
+ * @param incx increment
+ * @return gtint_t dimension of the buffer that stores a vector with length n and increment incx
+ */
+gtint_t buff_dim(gtint_t n, gtint_t incx);
+
+/**
+ * @brief Returns the size of matrix.
+ *
+ * @param storage specifies the storage format of matrix in memory.
+ * @param trans   specifies the form of given matrix.
+ * @param m       specifies the number of rows of given matrix.
+ * @param n       specifies the number of columns of given matrix.
+ * @param ldm     specifies the leading dimension of given matrix.
+ * @return gtint_t  Size of the matrix for dimension (m,n) and leading dimension ldm.
+ */
+gtint_t matsize(char storage, char trans, gtint_t m, gtint_t n, gtint_t ldm );
+
+/**
+ * @brief Returns the leading dimension of a matrix depending on the storage type,
+ *        whether it is transpose or not, and the size of rows and columns.
+ *
+ * @param storage specifies the storage format of matrix in memory.
+ * @param trans   specifies the form of given matrix.
+ * @param m       specifies the number of rows of given matrix.
+ * @param n       specifies the number of columns of given matrix.
+ * @param inc     specifies the increment of the leading dimension.
+ */
+gtint_t get_leading_dimension(char storage, char trans, gtint_t m, gtint_t n, gtint_t inc);
+
+/**
+ * @brief If T is real, returns NaN.
+ *        If T is complex, returns {NaN, 0.0}.
+ */
+template<typename T>
+T getNaN();
+
+/**
+ * @brief If T is real, returns inf.
+ *        If T is complex, returns {inf, 0.0}.
+ */
+template<typename T>
+T getInf();
+
+/**
+ * @brief Computes the conjugate of the scalar x.
+ *
+ * @tparam T float, double, scomplex, dcomplex
+ * @param x scalar of type T
+ * @return T x itself when T is real, {x.real, -x.imag} otherwise
+ */
+template<typename T>
+static T conj(T &x){
+    if constexpr (!testinghelpers::type_info<T>::is_real)
+        return {x.real, -x.imag};
+    else
+        return x;
+}
+
+template <typename T>
+void conj(T* x, gtint_t len, gtint_t inx)
+{
+    // Conjugates, in place, the len elements of x found at offsets
+    // 0, inx, 2*inx, ... by applying the scalar conj<T> overload
+    // to each of them.
+    gtint_t offset = 0;
+    for( gtint_t k = 0 ; k < len ; ++k, offset += inx )
+    {
+        x[offset] = conj<T>(x[offset]);
+    }
+}
+
+template <typename T>
+void conj(char storage, T* X, gtint_t m, gtint_t n, gtint_t ldm )
+{
+    // Conjugates every element of the m-by-n matrix X in place.
+    // Storage 'c' or 'C' uses a unit row stride and a column stride
+    // of ldm; any other storage value uses a row stride of ldm and a
+    // unit column stride.
+    const bool col_major = ( storage == 'c' ) || ( storage == 'C' );
+    const gtint_t row_stride = col_major ? 1 : ldm;
+    const gtint_t col_stride = col_major ? ldm : 1;
+
+    for( gtint_t row = 0 ; row < m ; ++row )
+    {
+        for( gtint_t col = 0 ; col < n ; ++col )
+        {
+            T& entry = X[row*row_stride + col*col_stride];
+            entry = conj<T>( entry );
+        }
+    }
+}
+
+template<typename T>
+static void initone(T &x) {
+    if constexpr (!testinghelpers::type_info<T>::is_real)
+        x = {1.0, 0.0};   // complex one: real part 1, imaginary part 0
+    else
+        x = 1.0;          // real one
+}
+
+template<typename T>
+static void initzero(T &x) {
+    if constexpr (!testinghelpers::type_info<T>::is_real)
+        x = {0.0, 0.0};   // complex zero: both parts 0
+    else
+        x = 0.0;          // real zero
+}
+
+template<typename T>
+static void alphax( gtint_t n, T alpha, T *xp, gtint_t incx )
+{
+    // Scales, in place, the n elements of xp found at offsets
+    // 0, incx, 2*incx, ... by alpha.
+    for( gtint_t k = 0, pos = 0 ; k < n ; ++k, pos += incx ) {
+        T& elem = xp[pos];
+        elem = (alpha * elem);
+    }
+}
+
+/**
+ * @brief Predicates on a trans character.
+ *
+ * @param trans specifies the form of matrix stored in memory.
+ * @return bool result of the check named by the function. NOTE(review): the accepted characters are not visible in this header - confirm in the implementation.
+ */
+bool chktrans( char trans );
+bool chknotrans( char trans );
+bool chkconjtrans( char trans );
+bool chktransconj( char trans );
+bool chkconj( char trans );
+
+
+/**
+ * @brief Predicates on the triangular form (uplo) of a matrix.
+ *
+ * @param uplo specifies whether matrix is upper or lower triangular stored in memory.
+ * @return bool result of the check named by the function (upper / lower).
+ */
+bool chkupper( char uplo );
+bool chklower( char uplo );
+
+/**
+ * @brief Predicates on the unit/non-unit diagonal form of a matrix.
+ *
+ * @param diag specifies whether matrix is unit or non-unit diagonal form.
+ * @return bool result of the check named by the function (unit / non-unit diagonal).
+ */
+bool chkunitdiag( char diag );
+bool chknonunitdiag( char diag );
+
+/**
+ * @brief Predicates on the left/right side of a matrix.
+ *
+ * @param side specifies whether matrix is left or right side form.
+ * @return bool result of the check named by the function (left / right side).
+ */
+bool chksideleft( char side );
+bool chksideright( char side );
+
+/**
+ * @brief swap the dimensions and strides of the matrix based on trans
+ *
+ * @param trans specifies the form of matrix stored in memory.
+ * @param m       specifies the number of rows of given matrix.
+ * @param n       specifies the number of columns of given matrix.
+ * @param rs      specifies the row stride of given matrix.
+ * @param cs      specifies the column stride of given matrix.
+ * @param mt      pointer to the row number of given matrix (presumably an output - confirm).
+ * @param nt      pointer to the column number of given matrix (presumably an output - confirm).
+ * @param rst     pointer to the row stride of given matrix (presumably an output - confirm).
+ * @param cst     pointer to the column stride of given matrix (presumably an output - confirm).
+ */
+void swap_dims_with_trans( char trans,
+                           gtint_t  m,  gtint_t  n,  gtint_t  rs,  gtint_t  cs,
+                           gtint_t* mt, gtint_t* nt, gtint_t* rst, gtint_t* cst );
+/**
+ * @brief swap the strides of the matrix based on trans
+ *
+ * @param trans specifies the form of matrix stored in memory.
+ * @param rs      specifies the row stride of given matrix.
+ * @param cs      specifies the column stride of given matrix.
+ * @param rst     pointer to the row stride of given matrix (presumably an output - confirm).
+ * @param cst     pointer to the column stride of given matrix (presumably an output - confirm).
+ */
+void swap_strides_with_trans( char trans,
+                                     gtint_t  rs,  gtint_t  cs,
+                                     gtint_t* rst, gtint_t* cst );
+
+/**
+ * @brief swap the dimensions
+ *
+ * Takes no trans argument: both dimensions are passed by pointer.
+ * @param x     pointer to the dimension of given vector/matrix.
+ * @param y     pointer to the dimension of given vector/matrix.
+ */
+void swap_dims( gtint_t* x, gtint_t* y );
+
+/**
+ * @brief set the dimension of the matrix based on trans
+ *
+ * @param trans specifies the form of matrix stored in memory.
+ * @param m       specifies the number of rows of given matrix.
+ * @param n       specifies the number of columns of given matrix.
+ * @param mt      pointer to the row number of given matrix (presumably an output - confirm).
+ * @param nt      pointer to the column number of given matrix (presumably an output - confirm).
+ */
+void set_dims( char trans, gtint_t m, gtint_t n, gtint_t* mt, gtint_t* nt );
+
+/**
+ * @brief set the dimension of the matrix based on side
+ *
+ * @param side    specifies the side of matrix selected in memory.
+ * @param m       specifies the number of rows of given matrix.
+ * @param n       specifies the number of columns of given matrix.
+ * @param dim     pointer to the dimension based on side (presumably an output - confirm).
+ */
+void set_dim_with_side( char side, gtint_t m, gtint_t n, gtint_t* dim );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/common/testing_helpers.h b/gtestsuite/testinghelpers/inc/common/testing_helpers.h
new file mode 100644
index 0000000000..3720109148
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/testing_helpers.h
@@ -0,0 +1,42 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "type_info.h"
+#include "testing_basics.h"
+#include "complex_helpers.h"
+#include "data_generators.h"
+#include "error_helpers.h"
+#include "refCBLAS.h"
diff --git a/gtestsuite/testinghelpers/inc/common/type_info.h b/gtestsuite/testinghelpers/inc/common/type_info.h
new file mode 100644
index 0000000000..05cb0d1f76
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/type_info.h
@@ -0,0 +1,72 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+
+// Set the integer type that we use in testing depending on the CMake option.
+#if INT_SIZE == 32
+    using gtint_t = int32_t;
+    using ugtint_t = uint32_t;
+#elif INT_SIZE == 64
+    using gtint_t = int64_t;
+    using ugtint_t = uint64_t;
+#endif
+
+namespace testinghelpers {
+    // type_info<T> describes the scalar type T.
+    // real_type is T itself in the general case (e.g. float or double), and
+    // float or double for scomplex or dcomplex respectively; is_complex and is_real flag the kind of T.
+    template<typename T>
+    struct type_info {
+        using real_type = T;
+        static constexpr bool is_complex = false;
+        static constexpr bool is_real = !is_complex;
+    };
+
+    template<>
+    struct type_info<scomplex> {
+        using real_type = float;
+        static constexpr bool is_complex = true;
+        static constexpr bool is_real = !is_complex;
+    };
+
+    template<>
+    struct type_info<dcomplex> {
+        using real_type = double;
+        static constexpr bool is_complex = true;
+        static constexpr bool is_real = !is_complex;
+    };
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h b/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h
new file mode 100644
index 0000000000..e66f4f3168
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/common/wrong_inputs_helpers.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+#include "common/testing_helpers.h"
+
+/**
+ * Valid BLAS parameter values, to be used in wrong-input testing.
+*/
+namespace testinghelpers {
+namespace IIT {
+  static constexpr char STORAGE = 'c';
+  static constexpr char TRANS = 'n';
+  static constexpr char SIDE = 'l';
+  static constexpr char UPLO = 'u';
+  static constexpr char DIAG = 'u';
+  static constexpr gtint_t M = 4;
+  static constexpr gtint_t N = 4;
+  static constexpr gtint_t K = 4;
+  static constexpr gtint_t INC = 1;
+  static constexpr gtint_t LDA = 4;
+  static constexpr gtint_t LDB = 4;
+  static constexpr gtint_t LDC = 4;
+} // end of namespace IIT
+} // end of namespace testinghelpers
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_f32_kern_macros.h b/gtestsuite/testinghelpers/inc/level1/ref_addv.h
similarity index 59%
rename from addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_f32_kern_macros.h
rename to gtestsuite/testinghelpers/inc/level1/ref_addv.h
index c8c2a04c91..c693369b90 100644
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_f32_kern_macros.h
+++ b/gtestsuite/testinghelpers/inc/level1/ref_addv.h
@@ -1,66 +1,52 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-	- Redistributions of source code must retain the above copyright
-	  notice, this list of conditions and the following disclaimer.
-	- Redistributions in binary form must reproduce the above copyright
-	  notice, this list of conditions and the following disclaimer in the
-	  documentation and/or other materials provided with the distribution.
-	- Neither the name(s) of the copyright holder(s) nor the names of its
-	  contributors may be used to endorse or promote products derived
-	  from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include "aocl_bf16_type.h"
-
-#ifndef LPGEMM_F32_KERN_MACROS_H
-#define LPGEMM_F32_KERN_MACROS_H
-
-#define RELU_SCALE_OP_F32_AVX512(reg) \
-	/* Generate indenx of elements <= 0.*/ \
-	relu_cmp_mask = _mm512_cmple_ps_mask( reg, selector1 ); \
- \
-	/* Apply scaling on for <= 0 elements.*/ \
-	reg = _mm512_mask_mul_ps( reg, relu_cmp_mask, reg, selector2 ); \
-
-#define CVT_F32_BF16(reg,m_ind,n_ind) \
-	_mm256_storeu_epi16 \
-	( \
-	  ( bfloat16* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + m_ind ) ) + post_op_c_j + ( n_ind * 16 ), \
-	  (__m256i) \
-		_mm512_cvtneps_pbh( reg ) \
-	) \
-
-#define CVT_F32_BF16_LT16(reg,m_ind,n_ind) \
-	_mm256_storeu_epi16 \
-	( \
-	  buf0, \
-		(__m256i) \
-		_mm512_cvtneps_pbh( reg ) \
-	); \
-	memcpy( ( bfloat16* )post_ops_list_temp->op_args3 + \
-	  ( rs_c_downscale * ( post_op_c_i + m_ind ) ) + post_op_c_j + \
-	  ( n_ind * 16 ) , buf0, ( n0_rem * sizeof( bfloat16 ) ) ); \
-
-#endif // LPGEMM_F32_KERN_MACROS_H
\ No newline at end of file
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  ADDV performs vector operations
+ *     y := y + conjx(x)
+ *     where x and y are vectors of length n
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_addv(char conjx, gtint_t len, const T* X, gtint_t incx, T* Y, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_amaxv.h b/gtestsuite/testinghelpers/inc/level1/ref_amaxv.h
new file mode 100644
index 0000000000..a4d2e7fe40
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_amaxv.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  Given a vector of length n, return the zero-based index of
+ *  the element of vector x that contains the largest absolute value
+ *  (or, in the complex domain, the largest complex modulus).
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+gtint_t ref_amaxv(gtint_t n, const T* x, gtint_t incx);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_axpbyv.h b/gtestsuite/testinghelpers/inc/level1/ref_axpbyv.h
new file mode 100644
index 0000000000..893583638d
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_axpbyv.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  AXPBYV performs vector operations
+ *     y := beta * y + alpha * conjx(x)
+ *     where x and y are vectors of length n, and alpha, beta are scalars
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_axpbyv(char conjx, gtint_t len, const T alpha,
+                        const T* xp, gtint_t incx, const T beta, T* yp, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_axpyv.h b/gtestsuite/testinghelpers/inc/level1/ref_axpyv.h
new file mode 100644
index 0000000000..d0cbbbbf5f
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_axpyv.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  AXPYV performs vector operations
+ *     y := y + alpha * conjx(x)
+ *     where x and y are vectors of length n, and alpha is a scalar
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_axpyv(char conjx, gtint_t len, const T alpha,
+                        const T* xp, gtint_t incx, T* yp, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_copyv.h b/gtestsuite/testinghelpers/inc/level1/ref_copyv.h
new file mode 100644
index 0000000000..5342ea3526
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_copyv.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  COPYV performs vector operations
+ *     y := conjx(x)
+ *     where x and y are vectors of length n.
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_copyv(char conjx, gtint_t n, const T* x, gtint_t incx,
+                                            T* y, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_dotv.h b/gtestsuite/testinghelpers/inc/level1/ref_dotv.h
new file mode 100644
index 0000000000..2b1f0b4a4d
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_dotv.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  DOTV performs vector operations
+ *     rho := conjx(x)^T * conjy(y)
+ *     where x and y are vectors of length n, and rho is a scalar.
+ *  ==========================================================================
+ */
+
+namespace testinghelpers {
+template<typename T>
+void ref_dotv(gtint_t n, const T* x, gtint_t incx, const T* y,
+                                          gtint_t incy, T* rho);
+
+template<typename T>
+void ref_dotv(char conjx, char conjy, gtint_t n, const T* x,
+                  gtint_t incx, const T* y, gtint_t incy, T* rho);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_dotxv.h b/gtestsuite/testinghelpers/inc/level1/ref_dotxv.h
new file mode 100644
index 0000000000..8b662a05db
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_dotxv.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  DOTXV performs vector operations
+ *     rho := beta * rho + alpha * conjx(x)^T * conjy(y)
+ *     where x and y are vectors of length n, and alpha, beta, and rho are scalars.
+ *  ==========================================================================
+ */
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_dotxv(char conjx, char conjy, gtint_t n, const T alpha,
+  const T* x, gtint_t incx, const T* y, gtint_t incy, const T beta, T* rho);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_scal2v.h b/gtestsuite/testinghelpers/inc/level1/ref_scal2v.h
new file mode 100644
index 0000000000..88a933d6f4
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_scal2v.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  SCAL2V performs a vector operation
+ *     y := alpha * conjx(x) (BLIS interface only)
+ *  where x and y are vectors of length n, and alpha is a scalar
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_scal2v(char conjx, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_scalv.h b/gtestsuite/testinghelpers/inc/level1/ref_scalv.h
new file mode 100644
index 0000000000..6e52878835
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_scalv.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  SCALV performs a vector operation
+ *     x := alpha * x
+ *  or x := conjalpha(alpha)*x (BLIS interface only)
+ *  where x is a vector of length n, and alpha is a scalar
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_scalv(char conjalpha, gtint_t len, T alpha, T* x, gtint_t incx);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_subv.h b/gtestsuite/testinghelpers/inc/level1/ref_subv.h
new file mode 100644
index 0000000000..dd49b2571a
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_subv.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  SUBV performs vector operations
+ *     y := y - conjx(x)
+ *     where x and y are vectors of length n
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_subv(char conjx, gtint_t len, const T* X, gtint_t incx, T* Y, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level1/ref_xpbyv.h b/gtestsuite/testinghelpers/inc/level1/ref_xpbyv.h
new file mode 100644
index 0000000000..92afc208ee
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level1/ref_xpbyv.h
@@ -0,0 +1,53 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  XPBYV performs vector operations
+ *     y := beta * y + conjx(x)
+ *     where x and y are vectors of length n, and beta is a scalar
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template<typename T>
+void ref_xpbyv(char conjx, gtint_t len,
+                        const T* xp, gtint_t incx, const T beta, T* yp, gtint_t incy);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_gemv.h b/gtestsuite/testinghelpers/inc/level2/ref_gemv.h
new file mode 100644
index 0000000000..6f9a7c88de
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_gemv.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * GEMV performs one of the matrix-vector operations
+ *    y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+ *    y := alpha*A**H*x + beta*y,
+ * ==========================================================================
+*/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_gemv(  char storage, char trans, char conjx, gtint_t m, gtint_t n,
+    T alpha, T *ap, gtint_t lda, T *xp, gtint_t incx, T beta,
+    T *yp, gtint_t incy );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_ger.h b/gtestsuite/testinghelpers/inc/level2/ref_ger.h
new file mode 100644
index 0000000000..d104c17659
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_ger.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * GER performs the rank 1 operation
+ *    A := alpha*x*y**T + A,
+ * where alpha is a scalar, x is an m element vector, y is an n element
+ * vector and A is an m by n matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n,
+    T alpha, T *xp, gtint_t incx, T *yp, gtint_t incy, T *ap, gtint_t lda );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_hemv.h b/gtestsuite/testinghelpers/inc/level2/ref_hemv.h
new file mode 100644
index 0000000000..52100da1f6
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_hemv.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * HEMV performs the matrix-vector  operation
+ *    y := alpha*A*x + beta*y
+ * where alpha and beta are scalars, x and y are n element vectors and
+ * A is an n by n hermitian matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_hemv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T* alpha, T *ap, gtint_t lda, T *xp, gtint_t incx, T* beta,
+    T *yp, gtint_t incy );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_her.h b/gtestsuite/testinghelpers/inc/level2/ref_her.h
new file mode 100644
index 0000000000..0c403f5e12
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_her.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * HER performs the hermitian rank 1 operation
+ *    A := alpha*x*x**H + A
+ *  where alpha is a real scalar, x is an n element vector and A is an
+ *  n by n hermitian matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T, typename Tr>
+void ref_her( char storage, char uploa, char conjx, gtint_t n,
+    Tr alpha, T *xp, gtint_t incx, T *ap, gtint_t lda );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_her2.h b/gtestsuite/testinghelpers/inc/level2/ref_her2.h
new file mode 100644
index 0000000000..ee56f84abb
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_her2.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * HER2  performs the hermitian rank 2 operation
+ *    A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+ * where alpha is a scalar, x and y are n element vectors and A is an n
+ * by n hermitian matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_her2( char storage, char uploa, char conjx, char conjy, gtint_t n,
+    T* alpha, T *xp, gtint_t incx, T *yp, gtint_t incy, T *ap, gtint_t lda );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_symv.h b/gtestsuite/testinghelpers/inc/level2/ref_symv.h
new file mode 100644
index 0000000000..7d324e99cb
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_symv.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * SYMV performs the matrix-vector  operation
+ *    y := alpha*A*x + beta*y
+ * where alpha and beta are scalars, x and y are n element vectors and
+ * A is an n by n symmetric matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_symv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T* alpha, T *ap, gtint_t lda, T *xp, gtint_t incx, T* beta,
+    T *yp, gtint_t incy );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_syr.h b/gtestsuite/testinghelpers/inc/level2/ref_syr.h
new file mode 100644
index 0000000000..3727ec1aa9
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_syr.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * SYR performs the symmetric rank 1 operation
+ *    A := alpha*x*x**T + A,
+ *  where alpha is a real scalar, x is an n element vector and A is an
+ *  n by n symmetric matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_syr( char storage, char uploa, char conjx, gtint_t n,
+             T alpha, T *xp, gtint_t incx, T *ap, gtint_t lda );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_syr2.h b/gtestsuite/testinghelpers/inc/level2/ref_syr2.h
new file mode 100644
index 0000000000..232171de28
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_syr2.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * SYR2  performs the symmetric rank 2 operation
+ *    A := alpha*x*y**T + alpha*y*x**T + A,
+ * where alpha is a scalar, x and y are n element vectors and A is an n
+ * by n symmetric matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n,
+    T alpha, T *xp, gtint_t incx, T *yp, gtint_t incy, T *ap, gtint_t lda );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_trmv.h b/gtestsuite/testinghelpers/inc/level2/ref_trmv.h
new file mode 100644
index 0000000000..b7d8f1020f
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_trmv.h
@@ -0,0 +1,54 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * TRMV  performs one of the matrix-vector operations
+ *    x := alpha * transa(A) * x
+ * where x is an n element vector and  A is an n by n unit, or non-unit,
+ * upper or lower triangular matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_trmv( char storage, char uploa, char transa, char diaga,
+    gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level2/ref_trsv.h b/gtestsuite/testinghelpers/inc/level2/ref_trsv.h
new file mode 100644
index 0000000000..268b7f381e
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level2/ref_trsv.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * TRSV solves a triangular system of equations with a single right-hand
+ *        side vector
+ *    b := alpha * inv(transa(A)) * x_orig
+ * where b and x are n element vectors and A is an n by n unit, or non-unit,
+ * upper or lower triangular matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_trsv( char storage, char uploa, char transa, char diaga,
+    gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_gemm.h b/gtestsuite/testinghelpers/inc/level3/ref_gemm.h
new file mode 100644
index 0000000000..569726cdf9
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_gemm.h
@@ -0,0 +1,63 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ *  GEMM performs one of the matrix-matrix operations
+ *     C := alpha*op( A )*op( B ) + beta*C,
+ *  where  op( A ) is one of
+ *     op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H,
+ *  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ *  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+ * ==========================================================================
+*/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_gemm (
+    char storage, char trnsa, char trnsb,
+    gtint_t m, gtint_t n, gtint_t k,
+    T alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    T beta,
+    T* cp, gtint_t ldc
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_gemmt.h b/gtestsuite/testinghelpers/inc/level3/ref_gemmt.h
new file mode 100644
index 0000000000..6c2f58ca3f
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_gemmt.h
@@ -0,0 +1,64 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * GEMMT performs one of the matrix-matrix operations
+ *    C := alpha*op( A )*op( B ) + beta*C,
+ * where  op( X ) is one of
+ *    op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+ * alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ * an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+ * Only accesses and updates the upper or the lower triangular part.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+template <typename T>
+void ref_gemmt (
+    char storage, char uplo, char trnsa, char trnsb,
+    gtint_t n, gtint_t k,
+    T alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    T beta,
+    T* cp, gtint_t ldc
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_hemm.h b/gtestsuite/testinghelpers/inc/level3/ref_hemm.h
new file mode 100644
index 0000000000..40d4178239
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_hemm.h
@@ -0,0 +1,68 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * For BLIS-typed interface HEMM performs one of the matrix-matrix operations
+ *    C := alpha*conj( A )*trans( B ) + beta*C, if side is left
+ * or C := alpha*trans( B )*conj( A ) + beta*C, if side is right
+ * alpha and beta are scalars, and A is Hermitian, B and C are matrices, with conj( A )
+ * an m by m matrix, and trans( B ) and C m by n matrices.
+ *
+ * For BLAS/CBLAS interface HEMM performs one of the matrix-matrix operations
+ *    C := alpha*A*B + beta*C, if side is left
+ * or C := alpha*B*A + beta*C, if side is right
+ * alpha and beta are scalars, and A is Hermitian, B and C are matrices, with A
+ * an m by m matrix, and B and C m by n matrices.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference HEMM routine (operation summarized in the comment block above); the definition is not in this header.
+template <typename T>
+void ref_hemm(
+    char storage, char side, char uplo, char conja, char transb, // side picks the left/right variant; conja/transb presumably select conj( A )/trans( B ) -- confirm in the definition
+    gtint_t m, gtint_t n, // C is m by n per the comment above
+    T alpha, // scalar, passed by value
+    T* ap, gtint_t lda, // A (Hermitian per the comment above) and, presumably, its leading dimension
+    T* bp, gtint_t ldb, // B and, presumably, its leading dimension
+    T beta, // scalar applied to C, passed by value
+    T* cp, gtint_t ldc // C, the matrix that is updated
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_her2k.h b/gtestsuite/testinghelpers/inc/level3/ref_her2k.h
new file mode 100644
index 0000000000..3827625036
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_her2k.h
@@ -0,0 +1,64 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  HER2K  performs one of the hermitian rank 2k operations
+ *     C := alpha*A*B**H + conjg( alpha )*B*A**H + beta*C,
+ *  or
+ *     C := alpha*A**H*B + conjg( alpha )*B**H*A + beta*C,
+ *  where  alpha and beta  are scalars, C is an  n by n  hermitian matrix
+ *  and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+ *  matrices in the second case.
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference HER2K routine; RT defaults to the real type that type_info<T> associates with T.
+template <typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+void ref_her2k(
+    char storage, char uplo, char transa, char transb, // uplo presumably selects the stored triangle of C -- confirm in the definition
+    gtint_t m, gtint_t k, // problem sizes; NOTE(review): the comment above calls C "n by n", presumably m here -- confirm
+    T* alpha, // unlike beta, alpha is passed by pointer
+    T* ap, gtint_t lda, // A and, presumably, its leading dimension
+    T* bp, gtint_t ldb, // B and, presumably, its leading dimension
+    RT beta, // real-typed scalar, passed by value
+    T* cp, gtint_t ldc // C, the matrix that is updated
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_herk.h b/gtestsuite/testinghelpers/inc/level3/ref_herk.h
new file mode 100644
index 0000000000..ca29a1217d
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_herk.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  C := alpha*A*A**H + beta*C,
+ *       or
+ *  C := alpha*A**H*A + beta*C,
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference HERK routine (formulas in the comment block above); RT defaults to type_info<T>::real_type.
+template <typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+void ref_herk(
+    char storage, char uplo, char transa, // transa presumably chooses between the A*A**H and A**H*A forms -- confirm in the definition
+    gtint_t m, gtint_t k, // problem sizes
+    RT alpha, // real-typed scalar, passed by value
+    T* ap, gtint_t lda, // A and, presumably, its leading dimension
+    RT beta, // real-typed scalar applied to C, passed by value
+    T* cp, gtint_t ldc // C, the matrix that is updated
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_symm.h b/gtestsuite/testinghelpers/inc/level3/ref_symm.h
new file mode 100644
index 0000000000..fef81db386
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_symm.h
@@ -0,0 +1,68 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * For BLIS-typed interface SYMM performs one of the matrix-matrix operations
+ *     C := alpha*conj( A )*trans( B ) + beta*C, if side is left
+ *  or C := alpha*trans( B )*conj( A ) + beta*C, if side is right
+ *  alpha and beta are scalars, and A is symmetric, B and C are matrices, with conj( A )
+ *  an m by m matrix, and trans( B ) and C m by n matrices.
+ *
+ *  For BLAS/CBLAS interface SYMM performs one of the matrix-matrix operations
+ *     C := alpha*A*B + beta*C, if side is left
+ *  or C := alpha*B*A + beta*C, if side is right
+ *  alpha and beta are scalars, and A is symmetric, B and C are matrices, with A
+ *  an m by m matrix, and B and C m by n matrices.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference SYMM routine (operation summarized in the comment block above); the definition is not in this header.
+template <typename T>
+void ref_symm(
+    char storage, char side, char uplo, char conja, char transb, // side picks the left/right variant; conja/transb presumably select conj( A )/trans( B ) -- confirm in the definition
+    gtint_t m, gtint_t n, // C is m by n per the comment above
+    T alpha, // scalar, passed by value
+    T* ap, gtint_t lda, // A (symmetric per the comment above) and, presumably, its leading dimension
+    T* bp, gtint_t ldb, // B and, presumably, its leading dimension
+    T beta, // scalar applied to C, passed by value
+    T* cp, gtint_t ldc // C, the matrix that is updated
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_syr2k.h b/gtestsuite/testinghelpers/inc/level3/ref_syr2k.h
new file mode 100644
index 0000000000..4b170d70a8
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_syr2k.h
@@ -0,0 +1,64 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ *  SYR2K  performs one of the symmetric rank 2k operations
+ *     C := alpha*A*B**T + alpha*B*A**T + beta*C,
+ *  or
+ *     C := alpha*A**T*B + alpha*B**T*A + beta*C,
+ *  where  alpha and beta  are scalars, C is an  n by n  symmetric matrix
+ *  and  A and B  are  n by k  matrices  in the  first  case  and  k by n
+ *  matrices in the second case.
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference SYR2K routine (operation summarized in the comment block above); the definition is not in this header.
+template <typename T>
+void ref_syr2k(
+    char storage, char uplo, char transa, char transb, // uplo presumably selects the stored triangle of C -- confirm in the definition
+    gtint_t m, gtint_t k, // problem sizes; NOTE(review): the comment above calls C "n by n", presumably m here -- confirm
+    T alpha, // scalar, passed by value
+    T* ap, gtint_t lda, // A and, presumably, its leading dimension
+    T* bp, gtint_t ldb, // B and, presumably, its leading dimension
+    T beta, // scalar applied to C, passed by value
+    T* cp, gtint_t ldc // C, the matrix that is updated
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_syrk.h b/gtestsuite/testinghelpers/inc/level3/ref_syrk.h
new file mode 100644
index 0000000000..3d3b8765ae
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_syrk.h
@@ -0,0 +1,59 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ *  ==========================================================================
+ *  C := alpha*A*A**T + beta*C,
+ *       or
+ *  C := alpha*A**T*A + beta*C,
+ *  ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference SYRK routine (formulas in the comment block above); the definition is not in this header.
+template <typename T>
+void ref_syrk(
+    char storage, char uplo, char transa, // transa presumably chooses between the A*A**T and A**T*A forms -- confirm in the definition
+    gtint_t m, gtint_t k, // problem sizes
+    T alpha, // scalar, passed by value
+    T* ap, gtint_t lda, // A and, presumably, its leading dimension
+    T beta, // scalar applied to C, passed by value
+    T* cp, gtint_t ldc // C, the matrix that is updated
+);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_trmm.h b/gtestsuite/testinghelpers/inc/level3/ref_trmm.h
new file mode 100644
index 0000000000..f75b2356bc
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_trmm.h
@@ -0,0 +1,55 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * TRMM  performs one of the matrix-matrix operations
+ *    B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+ * where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+ * non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+ *    op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference TRMM routine (operation summarized in the comment block above); B is the matrix being overwritten.
+template <typename T>
+void ref_trmm( char storage, char side, char uploa, char transa, char diaga, // side/uploa/transa/diaga presumably map to left-right, upper-lower, op( A ) and unit/non-unit -- confirm in the definition
+    gtint_t m, gtint_t n, T alpha, T *ap, gtint_t lda, T *bp, gtint_t ldb ); // B is m by n per the comment above; lda/ldb presumably leading dimensions
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_trmm3.h b/gtestsuite/testinghelpers/inc/level3/ref_trmm3.h
new file mode 100644
index 0000000000..975238050a
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_trmm3.h
@@ -0,0 +1,57 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * TRMM3  performs one of the matrix-matrix operations
+ *    C := beta * C_orig + alpha * transa(A) * transb(B)
+ * or
+ *    C := beta * C_orig + alpha * transb(B) * transa(A)
+ * where alpha and beta are scalars, A is a triangular matrix
+ * and  B and C are m by n matrices.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference TRMM3 routine (operation summarized in the comment block above); the definition is not in this header.
+template <typename T>
+void ref_trmm3( char storage, char side, char uploa, char transa, char diaga, // side presumably chooses which of the two forms above is computed -- confirm in the definition
+                char transb, gtint_t m, gtint_t n, T alpha, T *ap, gtint_t lda, // B and C are m by n per the comment above
+                T *bp, gtint_t ldb, T beta, T *c, gtint_t ldc ); // c is the matrix that is updated; note it is named c, not cp as in sibling headers
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/level3/ref_trsm.h b/gtestsuite/testinghelpers/inc/level3/ref_trsm.h
new file mode 100644
index 0000000000..df57786f69
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/level3/ref_trsm.h
@@ -0,0 +1,56 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ *  TRSM  solves one of the matrix equations
+ *     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+ *  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+ *  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+ *     op( A ) = A   or   op( A ) = A**T.
+ *  The matrix X is overwritten on B.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+// Declaration only: reference TRSM routine (equations in the comment block above); the solution X is written over B.
+template <typename T>
+void ref_trsm( char storage, char side, char uploa, char transa, char diaga, // side/uploa/transa/diaga presumably map to left-right, upper-lower, op( A ) and unit/non-unit -- confirm in the definition
+    gtint_t m, gtint_t n, T alpha, T *ap, gtint_t lda, T *bp, gtint_t ldb ); // X and B are m by n per the comment above; lda/ldb presumably leading dimensions
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/inc/util/ref_nrm2.h b/gtestsuite/testinghelpers/inc/util/ref_nrm2.h
new file mode 100644
index 0000000000..44c506b715
--- /dev/null
+++ b/gtestsuite/testinghelpers/inc/util/ref_nrm2.h
@@ -0,0 +1,52 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * NRM2 returns the euclidean norm of a vector via the function
+ * name, so that
+ *    NRM2 := sqrt( x'*x ).
+ * ==========================================================================
+ */
+
+namespace testinghelpers {
+// Declaration only: reference NRM2 routine; Tf is the element type of x and T is the type of the returned norm.
+template <typename Tf, typename T>
+T ref_nrm2(gtint_t n, Tf* x, gtint_t incx); // n presumably the element count and incx the stride between elements of x -- confirm in the definition
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/common/complex_helpers.cpp b/gtestsuite/testinghelpers/src/common/complex_helpers.cpp
new file mode 100644
index 0000000000..90158270dc
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/common/complex_helpers.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include <complex>
+#include "common/complex_helpers.h"
+
+namespace std {
+    // abs for scomplex/dcomplex. NOTE(review): the C++ standard does not sanction new overloads in std; kept so std::abs(x) call sites keep working.
+    float abs(const scomplex x)
+    {
+        // Build the matching std::complex<float> from the two members and defer to the library abs.
+        return std::abs(std::complex<float>{x.real, x.imag});
+    }
+    double abs(const dcomplex x)
+    {
+        // Same approach through std::complex<double>.
+        return std::abs(std::complex<double>{x.real, x.imag});
+    }
+    // Stream insertion so scomplex/dcomplex values can be printed as "(real, imag)", e.g. in error messages.
+    ostream& operator<<(ostream& os, const scomplex& x)
+    {
+        // The insertion chain yields the stream itself, so it is returned directly.
+        return os << "(" << x.real << ", " << x.imag <<")";
+    }
+    ostream& operator<<(ostream& os, const dcomplex& x)
+    {
+        // Identical layout to the scomplex overload.
+        return os << "(" << x.real << ", " << x.imag <<")";
+    }
+}
+
+// Operator overloading for scomplex and dcomplex types.
+scomplex operator+(const scomplex x, const scomplex y)
+{   // Component-wise sum; the braced list initializes the scomplex return value.
+    return {x.real + y.real, x.imag + y.imag};
+}
+dcomplex operator+(const dcomplex x, const dcomplex y)
+{   // Component-wise sum for the dcomplex type.
+    return {x.real + y.real, x.imag + y.imag};
+}
+
+scomplex operator-(const scomplex x, const scomplex y)
+{   // Component-wise difference (x minus y); the braced list initializes the return value.
+    return {x.real - y.real, x.imag - y.imag};
+}
+dcomplex operator-(const dcomplex x, const dcomplex y)
+{   // Component-wise difference for the dcomplex type.
+    return {x.real - y.real, x.imag - y.imag};
+}
+
+scomplex operator*(const scomplex x, const scomplex y)
+{   // (a+bi)(c+di) = (ac - bd) + (ad + bc)i, with operands in the same order as before.
+    return {x.real * y.real - x.imag * y.imag, x.real * y.imag + x.imag * y.real};
+}
+dcomplex operator*(const dcomplex x, const dcomplex y)
+{   // Same complex product for the dcomplex type.
+    return {x.real * y.real - x.imag * y.imag, x.real * y.imag + x.imag * y.real};
+}
+
+bool operator== (const scomplex x, const scomplex y)
+{   // Equal only when both the real and the imaginary members compare equal.
+    return x.real == y.real && x.imag == y.imag;
+}
+bool operator== (const dcomplex x, const dcomplex y)
+{   // Same member-wise comparison for the dcomplex type.
+    return x.real == y.real && x.imag == y.imag;
+}
+
+bool operator!= (const scomplex x, const scomplex y)
+{   // De Morgan form of !(real equal && imag equal): any differing member makes the values unequal.
+    return x.real != y.real || x.imag != y.imag;
+}
+bool operator!= (const dcomplex x, const dcomplex y)
+{   // Same member-wise test for the dcomplex type.
+    return x.real != y.real || x.imag != y.imag;
+}
diff --git a/gtestsuite/testinghelpers/src/common/data_generators.cpp b/gtestsuite/testinghelpers/src/common/data_generators.cpp
new file mode 100644
index 0000000000..c77af67cd5
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/common/data_generators.cpp
@@ -0,0 +1,496 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <random>
+#include "common/testing_helpers.h"
+
+namespace testinghelpers {
+namespace datagenerators {
+
+/***************************************************
+ *             Floating Point Generators
+****************************************************/
+/**
+ * @brief Draws one fp value (float, double, scomplex, dcomplex) uniformly from
+ *        [from, to); the seed is fixed, so repeated calls give the same result.
+ *        A complex value takes two draws: real part first, then imaginary part.
+ * @param[out] alpha receives the generated value
+ */
+template<typename T>
+void getfp(int from, int to, T* alpha)
+{
+    using info_t = testinghelpers::type_info<T>;
+    std::mt19937 rng(94);
+    std::uniform_real_distribution<typename info_t::real_type> uniform(from, to);
+    if constexpr (info_t::is_real)
+        *alpha = uniform(rng);
+    else
+        *alpha = {uniform(rng), uniform(rng)};
+}
+
+/**
+ * @brief Fills a strided vector (float, double, scomplex, dcomplex) with values
+ *        drawn uniformly from [from, to) using a fixed seed (reproducible).
+ * @param[in] n number of logical elements of x
+ * @param[in] incx stride between elements of x; only its magnitude is used
+ * @param[out] x buffer receiving the generated values
+ */
+template<typename T>
+void getfp(int from, int to, gtint_t n, gtint_t incx, T* x)
+{
+    using info_t = testinghelpers::type_info<T>;
+    const gtint_t stride = std::abs(incx);
+    std::mt19937 rng(94);
+    std::uniform_real_distribution<typename info_t::real_type> uniform(from, to);
+    for ( gtint_t idx = 0; idx < n; ++idx )
+    {
+        T& elem = x[idx*stride];
+        if constexpr (info_t::is_real)
+            elem = uniform(rng);
+        else
+            elem = {uniform(rng), uniform(rng)};
+    }
+}
+
+template<typename T>
+void getfp(int from, int to, char storage, gtint_t m, gtint_t n, T* a, gtint_t lda )
+{
+    // Fills the m-by-n matrix a (leading dimension lda) one column at a time
+    // through the strided vector overload of getfp.
+    // NOTE: that overload reseeds its generator on every call, so all columns
+    // receive the same sequence of values.
+
+    // Column-major default: n columns of m elements; elements 1 apart, columns lda apart.
+    gtint_t inca   = 1;
+    gtint_t n_iter = n;
+    gtint_t n_elem = m;
+
+    // Row-major keeps element (i,j) at a[i*lda + j]: column j starts at a + j
+    // and its m elements are lda apart, so only the two strides are exchanged.
+    // Exchanging m and n as well would overrun the buffer whenever n > m.
+    if( (storage == 'r') || (storage == 'R') )
+    {
+        swap_dims( &lda, &inca );
+    }
+
+    // From here on lda is the distance between the starts of two columns.
+    for ( gtint_t j = 0; j < n_iter; j++ )
+    {
+        T* a_begin = a + j*lda;
+        getfp<T>( from, to, n_elem, inca, a_begin );
+    }
+}
+
+// Fills the matrix a (leading dimension lda) with values drawn uniformly from
+// [from, to) using a fixed seed. m and n are exchanged first whenever
+// chktrans( transa ) holds; storage other than 'c'/'C'/'r'/'R' leaves a untouched.
+template<typename T>
+void getfp(int from, int to, char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda )
+{
+    using info_t = testinghelpers::type_info<T>;
+    using real_T = typename info_t::real_type;
+    std::mt19937                              rng(1994);
+    std::uniform_real_distribution<real_T>    uniform(from, to);
+
+    if( chktrans( transa ) )
+        swap_dims( &m, &n );
+
+    const bool col_major = (storage == 'c') || (storage == 'C');
+    const bool row_major = (storage == 'r') || (storage == 'R');
+    if( col_major )
+    {
+        // The unit-stride index runs outermost; the draw order fixes which value lands where.
+        for( gtint_t r = 0; r < m; ++r )
+            for( gtint_t c = 0; c < n; ++c )
+            {
+                T& elem = a[r + c*lda];
+                if constexpr (info_t::is_real) elem = real_T(uniform(rng));
+                else                           elem = {real_T(uniform(rng)), real_T(uniform(rng))};
+            }
+    }
+    else if( row_major )
+    {
+        for( gtint_t c = 0; c < n; ++c )
+            for( gtint_t r = 0; r < m; ++r )
+            {
+                T& elem = a[c + r*lda];
+                if constexpr (info_t::is_real) elem = real_T(uniform(rng));
+                else                           elem = {real_T(uniform(rng)), real_T(uniform(rng))};
+            }
+    }
+}
+
+/***************************************************
+ *              Integer Generators
+****************************************************/
+/**
+ * @brief Draws one integer uniformly from [from, to] and stores it converted to
+ *        an fp type (float, double, scomplex, dcomplex); the seed is fixed.
+ * @param[out] alpha receives the generated value
+ */
+template<typename T>
+void getint(int from, int to, T* alpha)
+{
+    using info_t = testinghelpers::type_info<T>;
+    using real_T = typename info_t::real_type;
+    std::mt19937 rng(94);
+    std::uniform_int_distribution<int> uniform(from, to);
+    if constexpr (info_t::is_real)
+        *alpha = real_T(uniform(rng));
+    else
+        *alpha = {real_T(uniform(rng)), real_T(uniform(rng))};
+}
+/**
+ * @brief Fills a strided fp vector (float, double, scomplex, dcomplex) with
+ *        integers drawn uniformly from [from, to] using a fixed seed.
+ * @param[in] n number of logical elements of x
+ * @param[in] incx stride between elements of x; only its magnitude is used
+ * @param[out] x buffer receiving the generated values
+ */
+template<typename T>
+void getint(int from, int to, gtint_t n, gtint_t incx, T* x)
+{
+    using info_t = testinghelpers::type_info<T>;
+    using real_T = typename info_t::real_type;
+    std::mt19937 rng(94);
+    std::uniform_int_distribution<int> uniform(from, to);
+    const gtint_t stride = std::abs(incx);
+    for ( gtint_t idx = 0; idx < n; ++idx )
+    {
+        // A complex element takes two draws: real part first, then imaginary part.
+        T& elem = x[idx*stride];
+        if constexpr (info_t::is_real) elem = real_T(uniform(rng));
+        else                           elem = {real_T(uniform(rng)), real_T(uniform(rng))};
+    }
+}
+
+template<typename T>
+void getint(int from, int to, char storage, gtint_t m, gtint_t n, T* a, gtint_t lda )
+{
+    // Fills the m-by-n matrix a (leading dimension lda) one column at a time
+    // through the strided vector overload of getint.
+    // NOTE: that overload reseeds its generator on every call, so all columns
+    // receive the same sequence of values.
+
+    // Column-major default: n columns of m elements; elements 1 apart, columns lda apart.
+    gtint_t inca   = 1;
+    gtint_t n_iter = n;
+    gtint_t n_elem = m;
+
+    // Row-major keeps element (i,j) at a[i*lda + j]: column j starts at a + j
+    // and its m elements are lda apart, so only the two strides are exchanged.
+    // Exchanging m and n as well would overrun the buffer whenever n > m.
+    if( (storage == 'r') || (storage == 'R') )
+    {
+        swap_dims( &lda, &inca );
+    }
+
+    // From here on lda is the distance between the starts of two columns.
+    for ( gtint_t j = 0; j < n_iter; j++ )
+    {
+        T* a_begin = a + j*lda;
+        getint<T>( from, to, n_elem, inca, a_begin );
+    }
+}
+
+/// @brief Fills a matrix with integers drawn uniformly from [from, to], stored as T.
+/// @tparam T element type of the matrix (float, double, scomplex, dcomplex)
+/// @param from smallest integer that can be generated
+/// @param to largest integer that can be generated
+/// @param storage 'c'/'C' for column-major, 'r'/'R' for row-major; anything else leaves a untouched
+/// @param m extent along the unit-stride (column-major) or lda-stride (row-major) direction; exchanged with n when chktrans( transa ) holds
+/// @param n the other extent; exchanged with m when chktrans( transa ) holds
+/// @param a buffer receiving the generated values
+/// @param transa transposition character, only evaluated through chktrans()
+/// @param lda leading dimension of a
+template<typename T>
+void getint(int from, int to, char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda )
+{
+    using info_t = testinghelpers::type_info<T>;
+    using real_T = typename info_t::real_type;
+    // Fixed seed, so the generated sequence is reproducible.
+    std::mt19937                          rng(1994);
+    std::uniform_int_distribution<int>    uniform(from, to);
+
+    if( chktrans( transa ) )
+        swap_dims( &m, &n );
+
+    const bool col_major = (storage == 'c') || (storage == 'C');
+    const bool row_major = (storage == 'r') || (storage == 'R');
+
+    if( col_major )
+    {
+        // The unit-stride index runs outermost; the draw order fixes which value lands where.
+        for( gtint_t r = 0; r < m; ++r )
+            for( gtint_t c = 0; c < n; ++c )
+            {
+                T& elem = a[r + c*lda];
+                if constexpr (info_t::is_real) elem = real_T(uniform(rng));
+                else                           elem = {real_T(uniform(rng)), real_T(uniform(rng))};
+            }
+    }
+    else if( row_major )
+    {
+        // Here the unit-stride index (bounded by n) runs outermost.
+        for( gtint_t c = 0; c < n; ++c )
+            for( gtint_t r = 0; r < m; ++r )
+            {
+                T& elem = a[c + r*lda];
+                if constexpr (info_t::is_real) elem = real_T(uniform(rng));
+                else                           elem = {real_T(uniform(rng)), real_T(uniform(rng))};
+            }
+    }
+}
+
+template<typename T>
+void randomgenerators( int from, int to, T* alpha, char datatype ) {
+    // 'i'/'I' selects integer-valued data; every other datatype selects fp data.
+    if( (datatype != 'i') && (datatype != 'I') )
+        getfp<T>( from, to, alpha );
+    else
+        getint<T>( from, to, alpha );
+}
+
+template<typename T>
+void randomgenerators(int from, int to, gtint_t n, gtint_t incx, T* x, char datatype ) {
+    // 'i'/'I' selects integer-valued data; every other datatype selects fp data.
+    if( (datatype != 'i') && (datatype != 'I') )
+        getfp<T>( from, to, n, incx, x );
+    else
+        getint<T>( from, to, n, incx, x );
+}
+
+template<typename T>
+void randomgenerators( int from, int to, char storage, gtint_t m, gtint_t n,
+     T* a, gtint_t lda, char datatype ) {
+    // 'i'/'I' selects integer-valued data; every other datatype selects fp data.
+    if( (datatype != 'i') && (datatype != 'I') )
+        getfp<T>( from, to, storage, m, n, a, lda );
+    else
+        getint<T>( from, to, storage, m, n, a, lda );
+}
+
+template<typename T>
+void randomgenerators( int from, int to, char storage, gtint_t m, gtint_t n,
+     T* a, char transa, gtint_t lda, char datatype ) {
+    // 'i'/'I' selects integer-valued data; every other datatype selects fp data.
+    if( (datatype != 'i') && (datatype != 'I') )
+        getfp<T>( from, to, storage, m, n, a, transa, lda );
+    else
+        getint<T>( from, to, storage, m, n, a, transa, lda );
+}
+
+template<typename T>
+void randomgenerators(int from, int to, char storage, char uplo, gtint_t k,
+                    T* a, gtint_t lda, char datatype) {
+    // Generates a k-by-k triangular matrix with leading dimension lda: the
+    // full matrix is filled with random data first and the triangle opposite
+    // to uplo is then set to zero.
+    //   uplo 'u'/'U': entries below the diagonal are zeroed,
+    //   uplo 'l'/'L': entries above the diagonal are zeroed,
+    //   anything else: std::runtime_error (raised only when k > 0).
+    // The zeroing step treats every storage other than 'c'/'C' as row-major.
+    randomgenerators<T>(from, to, storage, k, k, a, lda, datatype);
+    const bool upper = (uplo=='u')||(uplo=='U');
+    const bool lower = (uplo=='l')||(uplo=='L');
+
+    if( (storage=='c')||(storage=='C') )
+    {
+        for(gtint_t j=0; j<k; j++)
+        {
+            for(gtint_t i=0; i<k; i++)
+            {
+                // Column-major: element (i,j) is a[i+j*lda].
+                if( upper )      { if(i>j) a[i+j*lda] = T{0}; }
+                else if( lower ) { if(i<j) a[i+j*lda] = T{0}; }
+                else
+                    throw std::runtime_error("Error in common/data_generators.cpp: uplo must be 'u' or 'l'.");
+            }
+        }
+    }
+    else
+    {
+        for(gtint_t i=0; i<k; i++)
+        {
+            for(gtint_t j=0; j<k; j++)
+            {
+                // Row-major: element (i,j) is a[j+i*lda].
+                if( upper )      { if(i>j) a[j+i*lda] = T{0}; }
+                else if( lower ) { if(i<j) a[j+i*lda] = T{0}; }
+                else
+                    throw std::runtime_error("Error in common/data_generators.cpp: uplo must be 'u' or 'l'.");
+            }
+        }
+    }
+}
+
+} //end of namespace datagenerators
+
+template<typename T>
+std::vector<T> get_random_matrix(int from, int to, char storage, char trans, gtint_t m, gtint_t n,
+                    gtint_t lda, char datatype)
+{   // Allocate matsize() elements, then let randomgenerators() fill them in place.
+    std::vector<T> mat( matsize( storage, trans, m, n, lda ) );
+    testinghelpers::datagenerators::randomgenerators<T>( from, to, storage, m, n, mat.data(), trans, lda, datatype );
+    return mat;
+}
+template<typename T>
+std::vector<T> get_random_matrix(int from, int to, char storage, char uplo, gtint_t k, gtint_t lda, char datatype)
+{
+    // Storage for an untransposed k-by-k matrix; the generator zeroes the triangle opposite to uplo.
+    std::vector<T> tri( testinghelpers::matsize( storage, 'n', k, k, lda ) );
+    testinghelpers::datagenerators::randomgenerators( from, to, storage, uplo, k, tri.data(), lda, datatype );
+    return tri;
+}
+
+template<typename T>
+std::vector<T> get_random_vector(int from, int to, gtint_t n, gtint_t incx, char datatype)
+{
+    // buff_dim() sizes the strided buffer; the generator then fills its n logical elements.
+    std::vector<T> vec( testinghelpers::buff_dim(n, incx) );
+    testinghelpers::datagenerators::randomgenerators( from, to, n, incx, vec.data(), datatype );
+    return vec;
+}
+
+
+
+template<typename T>
+void set_vector( gtint_t n, gtint_t incx, T* x, T value )
+{
+    // Writes value to the n logical elements of x, which sit |incx| apart.
+    const gtint_t stride = std::abs(incx);
+    for ( gtint_t idx = 0; idx < n; ++idx )
+    {
+        x[idx*stride] = value ;
+    }
+}
+
+template<typename T>
+void set_matrix( char storage, gtint_t m, gtint_t n, T* a, char transa, gtint_t lda, T value )
+{
+    // Assigns value to every element of the matrix stored in a (leading
+    // dimension lda). m and n are exchanged first when chktrans( transa )
+    // holds. Storage other than 'c'/'C'/'r'/'R' leaves a untouched.
+
+    if( chktrans( transa ) )
+        swap_dims( &m, &n );
+
+    const bool col_major = (storage == 'c') || (storage == 'C');
+    const bool row_major = (storage == 'r') || (storage == 'R');
+
+    if( col_major )
+    {
+        // m indexes the unit-stride direction, n the lda-stride direction.
+        for( gtint_t i = 0 ; i < m ; ++i )
+            for( gtint_t j = 0 ; j < n ; ++j )
+                a[i + j*lda] = value ;
+    }
+    else if( row_major )
+    {
+        // n indexes the unit-stride direction, m the lda-stride direction.
+        for( gtint_t j = 0 ; j < n ; ++j )
+            for( gtint_t i = 0 ; i < m ; ++i )
+                a[j + i*lda] = value ;
+    }
+}
+
+template<typename T>
+std::vector<T> get_vector( gtint_t n, gtint_t incx, T value )
+{
+    // Value-initialized strided buffer; only the n logical elements receive value.
+    std::vector<T> vec( testinghelpers::buff_dim(n, incx) );
+    testinghelpers::set_vector( n, incx, vec.data(), value );
+    return vec;
+}
+
+template<typename T>
+std::vector<T> get_matrix( char storage, char trans, gtint_t m, gtint_t n, gtint_t lda, T value )
+{   // Allocate matsize() elements, then let set_matrix() assign value to the matrix entries.
+    std::vector<T> mat( matsize( storage, trans, m, n, lda ) );
+    testinghelpers::set_matrix<T>( storage, m, n, mat.data(), trans, lda, value );
+    return mat;
+}
+
+} //end of namespace testinghelpers
+
+// Explicit template instantiations
+template void testinghelpers::datagenerators::randomgenerators<float>(int, int, float*, char);
+template void testinghelpers::datagenerators::randomgenerators<double>(int, int, double*, char);
+template void testinghelpers::datagenerators::randomgenerators<scomplex>(int, int, scomplex*, char);
+template void testinghelpers::datagenerators::randomgenerators<dcomplex>(int, int, dcomplex*, char);
+
+template void testinghelpers::datagenerators::randomgenerators<float>(int, int, gtint_t, gtint_t, float*, char);
+template void testinghelpers::datagenerators::randomgenerators<double>(int, int, gtint_t, gtint_t, double*, char);
+template void testinghelpers::datagenerators::randomgenerators<scomplex>(int, int, gtint_t, gtint_t, scomplex*, char);
+template void testinghelpers::datagenerators::randomgenerators<dcomplex>(int, int, gtint_t, gtint_t, dcomplex*, char);
+
+template void testinghelpers::datagenerators::randomgenerators<float>(int, int, char, gtint_t, gtint_t, float*, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<double>(int, int, char, gtint_t, gtint_t, double*, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<scomplex>(int, int, char, gtint_t, gtint_t, scomplex*, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<dcomplex>(int, int, char, gtint_t, gtint_t, dcomplex*, gtint_t, char);
+
+template void testinghelpers::datagenerators::randomgenerators<float>(int, int, char, gtint_t, gtint_t, float*, char, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<double>(int, int, char, gtint_t, gtint_t, double*, char, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<scomplex>(int, int, char, gtint_t, gtint_t, scomplex*, char, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<dcomplex>(int, int, char, gtint_t, gtint_t, dcomplex*, char, gtint_t, char);
+
+template void testinghelpers::datagenerators::randomgenerators<float>(int, int, char, char, gtint_t, float*, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<double>(int, int, char, char, gtint_t, double*, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<scomplex>(int, int, char, char, gtint_t, scomplex*, gtint_t, char);
+template void testinghelpers::datagenerators::randomgenerators<dcomplex>(int, int, char, char, gtint_t, dcomplex*, gtint_t, char);
+
+template std::vector<float> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, gtint_t, char);
+template std::vector<double> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, gtint_t, char);
+template std::vector<scomplex> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, gtint_t, char);
+template std::vector<dcomplex> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, gtint_t, char);
+
+template std::vector<float> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, char);
+template std::vector<double> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, char);
+template std::vector<scomplex> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, char);
+template std::vector<dcomplex> testinghelpers::get_random_matrix(int, int, char, char, gtint_t, gtint_t, char);
+
+template std::vector<float> testinghelpers::get_random_vector(int, int, gtint_t, gtint_t, char);
+template std::vector<double> testinghelpers::get_random_vector(int, int, gtint_t, gtint_t, char);
+template std::vector<scomplex> testinghelpers::get_random_vector(int, int, gtint_t, gtint_t, char);
+template std::vector<dcomplex> testinghelpers::get_random_vector(int, int, gtint_t, gtint_t, char);
+
+template std::vector<float> testinghelpers::get_vector(gtint_t, gtint_t, float);
+template std::vector<double> testinghelpers::get_vector(gtint_t, gtint_t, double);
+template std::vector<scomplex> testinghelpers::get_vector(gtint_t, gtint_t, scomplex);
+template std::vector<dcomplex> testinghelpers::get_vector(gtint_t, gtint_t, dcomplex);
+
+template std::vector<float> testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, float );
+template std::vector<double> testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, double );
+template std::vector<scomplex> testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, scomplex );
+template std::vector<dcomplex> testinghelpers::get_matrix( char, char, gtint_t, gtint_t, gtint_t, dcomplex );
diff --git a/gtestsuite/testinghelpers/src/common/refCBLAS.cpp b/gtestsuite/testinghelpers/src/common/refCBLAS.cpp
new file mode 100644
index 0000000000..533fd2e356
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/common/refCBLAS.cpp
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <iostream>
+#ifdef REF_IS_MKL
+#include <omp.h>
+#endif
+#include "common/refCBLAS.h"
+
+namespace testinghelpers {
+refCBLAS::refCBLAS() { // Opens the reference BLAS/CBLAS shared library with dlopen().
+    std::cout << "refCBLAS constructor\n";
+    if (!refCBLASModule) // Load only when no handle is held yet.
+    {
+#ifdef REF_IS_MKL
+        // Dummy OpenMP call so that the linker links the OpenMP library when MKL is used.
+        omp_get_num_threads();
+        MKLCoreModule = dlopen(MKL_CORE, RTLD_GLOBAL | RTLD_LAZY); // NOTE(review): result is never checked
+        MKLGNUThreadModule = dlopen(MKL_GNU_THREAD, RTLD_GLOBAL | RTLD_LAZY); // NOTE(review): result is never checked
+#endif
+#ifdef ENABLE_ASAN
+        refCBLASModule = dlopen(REFERENCE_BLAS, RTLD_LOCAL | RTLD_LAZY); // presumably ASAN cannot cope with RTLD_DEEPBIND - TODO confirm
+#else
+        refCBLASModule = dlopen(REFERENCE_BLAS, RTLD_DEEPBIND | RTLD_LAZY);
+#endif
+    }
+    // A missing reference library is fatal: print the dlerror() text, then throw.
+    if (refCBLASModule == nullptr)
+    {
+      std::cout<<dlerror();
+      throw std::runtime_error("Reference Library cannot be found. LIB_PATH=" REFERENCE_BLAS );
+    }
+}
+
+refCBLAS::~refCBLAS() { // Releases every library handle acquired by the constructor.
+    std::cout << "refCBLAS destructor\n" <<std::endl;
+#ifdef REF_IS_MKL
+    if (MKLCoreModule != nullptr) dlclose(MKLCoreModule); // the constructor never checks these dlopen() results
+    if (MKLGNUThreadModule != nullptr) dlclose(MKLGNUThreadModule);
+#endif
+    if (refCBLASModule != nullptr) dlclose(refCBLASModule);
+}
+void* refCBLAS::get() { return refCBLASModule; } // Raw dlopen() handle of the reference library.
+} //end of testinghelpers namespace
+
+thread_local testinghelpers::refCBLAS refCBLASModule; // thread_local: each thread constructs its own loader object.
diff --git a/gtestsuite/testinghelpers/src/common/testing_basics.cpp b/gtestsuite/testinghelpers/src/common/testing_basics.cpp
new file mode 100644
index 0000000000..2d07072716
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/common/testing_basics.cpp
@@ -0,0 +1,329 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "common/testing_basics.h"
+#include "common/type_info.h"
+
+namespace testinghelpers {
+
+/** Compile-time check that gtint_t is as wide as the integer type of the tested interface. */
+void int_compatibility(){
+    // dim_t when TEST_BLIS_TYPED is set, f77_int otherwise.
+#if TEST_BLIS_TYPED
+    using lib_int_t = dim_t;
+#else
+    using lib_int_t = f77_int;
+#endif
+    static_assert(sizeof(gtint_t)==sizeof(lib_int_t),"Mismatch of integer types.");
+}
+
+void char_to_blis_trans( char trans, trans_t* blis_trans )
+{
+    switch ( trans ) {   // any character not listed leaves *blis_trans untouched
+    case 'n': case 'N': *blis_trans = BLIS_NO_TRANSPOSE;   break;
+    case 't': case 'T': *blis_trans = BLIS_TRANSPOSE;      break;
+    case 'c': case 'C': *blis_trans = BLIS_CONJ_TRANSPOSE; break;
+    case 'h': case 'H':
+        throw std::invalid_argument("Error in file src/common/testing_basics.cpp in function char_to_blis_trans(): trans == 'h'. "
+                    "To test BLIS-typed interface for this parameter be aware that this is not "
+                    "a BLAS/CBLAS option. In BLAS/CBLAS interface 'c' is conjugate transpose (Hermitian), "
+                    "while in BLIS_typed this would be 'h'. "
+                    "To implement this option, please modify ref_*.cpp to use the correct matrix.");
+    }
+}
+
+void char_to_blis_conj( char conj, conj_t* blis_conj )
+{   // Any other character leaves *blis_conj untouched.
+    switch ( conj ) { case 'n': case 'N': *blis_conj = BLIS_NO_CONJUGATE; break;
+                      case 'c': case 'C': *blis_conj = BLIS_CONJUGATE;    break; }
+}
+
+void char_to_blis_side( char side, side_t* blis_side )
+{   // Any other character leaves *blis_side untouched.
+    switch ( side ) { case 'l': case 'L': *blis_side = BLIS_LEFT;  break;
+                      case 'r': case 'R': *blis_side = BLIS_RIGHT; break; }
+}
+
+void char_to_blis_uplo( char uplo, uplo_t* blis_uplo )
+{   // Any other character leaves *blis_uplo untouched.
+    switch ( uplo ) { case 'l': case 'L': *blis_uplo = BLIS_LOWER; break;
+                      case 'u': case 'U': *blis_uplo = BLIS_UPPER; break; }
+}
+
+void char_to_blis_diag( char diag, diag_t* blis_diag )
+{   // Any other character leaves *blis_diag untouched.
+    switch ( diag ) { case 'n': case 'N': *blis_diag = BLIS_NONUNIT_DIAG; break;
+                      case 'u': case 'U': *blis_diag = BLIS_UNIT_DIAG;    break; }
+}
+
+void char_to_cblas_order( char order, CBLAS_ORDER *cblas_order )
+{
+    // Any other character leaves *cblas_order untouched.
+    switch ( order ) { case 'c': case 'C': *cblas_order = CblasColMajor; break;
+                       case 'r': case 'R': *cblas_order = CblasRowMajor; break; }
+}
+
+void char_to_cblas_trans( char trans, CBLAS_TRANSPOSE *cblas_trans )
+{   // Any other character leaves *cblas_trans untouched.
+    switch ( trans ) { case 'n': case 'N': *cblas_trans = CBLAS_TRANSPOSE::CblasNoTrans;   break;
+                       case 't': case 'T': *cblas_trans = CBLAS_TRANSPOSE::CblasTrans;     break;
+                       case 'c': case 'C': *cblas_trans = CBLAS_TRANSPOSE::CblasConjTrans; break; }
+}
+
+void char_to_cblas_uplo( char uplo, CBLAS_UPLO *cblas_uplo )
+{   // Any other character leaves *cblas_uplo untouched.
+    switch ( uplo ) { case 'l': case 'L': *cblas_uplo = CblasLower; break;
+                      case 'u': case 'U': *cblas_uplo = CblasUpper; break; }
+}
+
+void char_to_cblas_diag( char diag, CBLAS_DIAG *cblas_diag )
+{   // Any other character leaves *cblas_diag untouched.
+    switch ( diag ) { case 'n': case 'N': *cblas_diag = CblasNonUnit; break;
+                      case 'u': case 'U': *cblas_diag = CblasUnit;    break; }
+}
+
+void char_to_cblas_side( char side, CBLAS_SIDE *cblas_side )
+{   // Any other character leaves *cblas_side untouched.
+    switch ( side ) { case 'l': case 'L': *cblas_side = CblasLeft;  break;
+                      case 'r': case 'R': *cblas_side = CblasRight; break; }
+}
+
+/**
+ * @brief Returns the size of a buffer which has strides.
+ * @param n length of vector; n <= 0 denotes an empty vector
+ * @param incx increment; only its magnitude is used
+ * @return gtint_t number of elements needed to store a vector with length n and
+ *         increment incx, i.e. 1 + (n-1)*|incx|, or 0 when the vector is empty
+ *         (the plain formula would turn negative there and break allocations)
+ */
+gtint_t buff_dim( gtint_t n, gtint_t incx ) {
+    return (n > 0) ? (n*std::abs(incx) - (std::abs(incx)-1)) : gtint_t(0);
+}
+
+gtint_t matsize( char storage, char trans, gtint_t m, gtint_t n, gtint_t ldm )
+{
+    // Number of elements of the buffer holding the matrix: the leading
+    // dimension ldm times the number of columns (or rows) that are stored.
+    const bool col_major  = (storage == 'c') || (storage == 'C');
+    const bool transposed = chktrans( trans );
+
+    // Column-major counts n (m when transposed); every other storage value
+    // is treated as row-major and counts m (n when transposed). Hence m is
+    // picked exactly when the two flags agree.
+    const gtint_t km = ( col_major == transposed ) ? m : n;
+    return (km*ldm);
+}
+
+/**
+ * Returns the leading dimension of a matrix depending on the storage type,
+ * whether it is transposed or not, and the number of rows and columns.
+ *
+ * @param storage specifies the storage format of matrix in memory ('c'/'C' is column-major, anything else row-major).
+ * @param trans   specifies the form of given matrix ('n'/'N' is not transposed, anything else transposed).
+ * @param m       specifies the number of rows of given matrix.
+ * @param n       specifies the number of columns of given matrix.
+ * @param inc     specifies the increment added to the minimal leading dimension.
+*/
+gtint_t get_leading_dimension( char storage, char trans, gtint_t m, gtint_t n, gtint_t inc )
+{
+    const bool col_major = (storage == 'c') || (storage == 'C');
+    const bool no_trans  = (trans == 'n') || (trans == 'N');
+
+    // m is selected for an untransposed column-major matrix and for a
+    // transposed row-major one; n is selected in the two remaining
+    // combinations.
+    gtint_t extent;
+    if ( col_major )
+        extent = no_trans ? m : n;
+    else
+        extent = no_trans ? n : m;
+
+    // The result is never based on less than 1, even for a zero extent;
+    // inc is added on top of that minimum.
+    const gtint_t min_ld = std::max(gtint_t(1), extent);
+    return min_ld + inc;
+}
+
+/**
+ * If T is real, returns a quiet NaN.
+ * If T is complex, returns {NaN, 0.0} (only the real part is NaN).
+*/
+template<typename T>
+T getNaN()
+{
+    using info_t = testinghelpers::type_info<T>;
+    // The special value always has the real type underlying T.
+    const auto nan = std::numeric_limits<typename info_t::real_type>::quiet_NaN();
+    if constexpr (info_t::is_real) return nan;
+    else                           return T{nan, 0};
+}
+template float getNaN<float>();
+template double getNaN<double>();
+template scomplex getNaN<scomplex>();
+template dcomplex getNaN<dcomplex>();
+
+/**
+ * If T is real, returns positive infinity.
+ * If T is complex, returns {inf, 0.0} (only the real part is infinite).
+*/
+template<typename T>
+T getInf()
+{
+    using info_t = testinghelpers::type_info<T>;
+    // The special value always has the real type underlying T.
+    const auto inf = std::numeric_limits<typename info_t::real_type>::infinity();
+    if constexpr (info_t::is_real) return inf;
+    else                           return T{inf, 0};
+}
+template float getInf<float>();
+template double getInf<double>();
+template scomplex getInf<scomplex>();
+template dcomplex getInf<dcomplex>();
+
+
+
+bool chktrans( char trns )
+{   // Transposed unless trns is 'n' or 'N'; both cases are accepted, as elsewhere in this file.
+    return !( (trns=='n') || (trns=='N') );
+}
+
+bool chknotrans( char trns )
+{
+    // True when masking the converted value with BLIS_TRANS_BIT yields BLIS_BITVAL_NO_TRANS.
+    trans_t blis_trans;
+    char_to_blis_trans( trns, &blis_trans );
+    return ( blis_trans & BLIS_TRANS_BIT ) == BLIS_BITVAL_NO_TRANS;
+}
+
+bool chkconjtrans( char trns )
+{
+    // Mask with the union of both bits: AND-ing the two single-bit masks with each other always gives 0.
+    trans_t trans;
+    char_to_blis_trans( trns, &trans );
+    return ( trans & ( BLIS_CONJ_BIT | BLIS_TRANS_BIT ) ) == BLIS_BITVAL_CONJ_TRANS;
+}
+
+bool chktransconj( char trns )
+{
+    // True when masking the converted value with BLIS_CONJ_BIT yields BLIS_BITVAL_CONJ.
+    trans_t blis_trans;
+    char_to_blis_trans( trns, &blis_trans );
+    return ( blis_trans & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ;
+}
+
+bool chkconj( char conjx )
+{
+    // True when masking the converted value with BLIS_CONJ_BIT yields BLIS_BITVAL_CONJ.
+    conj_t blis_conj;
+    char_to_blis_conj( conjx, &blis_conj );
+    return ( blis_conj & BLIS_CONJ_BIT ) == BLIS_BITVAL_CONJ;
+}
+
+bool chkupper( char uplo )
+{   // Convert to the BLIS enum first, then compare with BLIS_UPPER.
+    uplo_t blis_uplo;
+    char_to_blis_uplo( uplo, &blis_uplo );
+    return blis_uplo == BLIS_UPPER;
+}
+
+bool chklower( char uplo )
+{   // Convert to the BLIS enum first, then compare with BLIS_LOWER.
+    uplo_t blis_uplo;
+    char_to_blis_uplo( uplo, &blis_uplo );
+    return blis_uplo == BLIS_LOWER;
+}
+
+bool chkunitdiag( char diag )
+{   // Convert to the BLIS enum first, then compare with BLIS_BITVAL_UNIT_DIAG.
+    diag_t blis_diag;
+    char_to_blis_diag( diag, &blis_diag );
+    return blis_diag == BLIS_BITVAL_UNIT_DIAG;
+}
+
+bool chknonunitdiag( char diag )
+{   // Convert to the BLIS enum first, then compare with BLIS_BITVAL_NONUNIT_DIAG.
+    diag_t blis_diag;
+    char_to_blis_diag( diag, &blis_diag );
+    return blis_diag == BLIS_BITVAL_NONUNIT_DIAG;
+}
+
+bool chksideleft( char mside )
+{   // Convert to the BLIS enum first, then compare with BLIS_LEFT.
+    side_t blis_side;
+    char_to_blis_side( mside, &blis_side );
+    return blis_side == BLIS_LEFT;
+}
+
+bool chksideright( char mside )
+{   // Convert to the BLIS enum first, then compare with BLIS_RIGHT.
+    side_t blis_side;
+    char_to_blis_side( mside, &blis_side );
+    return blis_side == BLIS_RIGHT;
+}
+
+void swap_dims_with_trans( char trans,
+                           gtint_t  m,  gtint_t  n,  gtint_t  rs,  gtint_t  cs,
+                           gtint_t* mt, gtint_t* nt, gtint_t* rst, gtint_t* cst )
+{
+    const bool t = chktrans( trans );   // exchange dimensions and strides pairwise when true
+    *mt = t ? n : m;  *nt = t ? m : n;  *rst = t ? cs : rs;  *cst = t ? rs : cs;
+}
+
+void swap_strides_with_trans( char trans,
+                              gtint_t  rs,  gtint_t  cs,
+                              gtint_t* rst, gtint_t* cst )
+{
+    const bool t = chktrans( trans );   // exchange the two strides when true
+    *rst = t ? cs : rs;  *cst = t ? rs : cs;
+}
+
+void swap_dims( gtint_t* x, gtint_t* y )
+{   // Exchange the two pointed-to values through a saved copy of *y.
+    const gtint_t saved = *y;
+    *y = *x;
+    *x = saved;
+}
+
+void set_dims( char trans, gtint_t m, gtint_t n, gtint_t* mt, gtint_t* nt )
+{  // (mt, nt) = (n, m) when chktrans( trans ) holds, (m, n) otherwise.
+   const bool t = chktrans( trans );
+   *mt = t ? n : m;  *nt = t ? m : n;
+}
+
+void set_dim_with_side( char side, gtint_t m, gtint_t n, gtint_t* dim )
+{
+    // m when chksideleft( side ) holds, n in every other case.
+    *dim = chksideleft( side ) ? m : n;
+}
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_addv.cpp b/gtestsuite/testinghelpers/src/level1/ref_addv.cpp
new file mode 100644
index 0000000000..90351b0ec2
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_addv.cpp
@@ -0,0 +1,99 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_addv.h"
+
+namespace testinghelpers {
+
+// Reference for addv. addv has no BLAS/CBLAS counterpart, so the result is
+// produced by the reference library's cblas_?axpy with the scalar set to one.
+//   conj_x : when chkconj( conj_x ) holds, a conjugated copy of x is used
+//   n      : element count forwarded to axpy
+//   x,incx : input vector and its increment
+//   y,incy : vector updated in place by axpy, and its increment
+// Throws std::runtime_error for an unsupported T or when the symbol cannot
+// be resolved.
+template<typename T>
+void ref_addv( char conj_x, gtint_t n, const T* x, gtint_t incx,
+                                             T* y, gtint_t incy ) {
+    // Complex scalars are handed to the function pointer by reference, real ones by value.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_axpy)( f77_int, scalar_t , const T *, f77_int , T *, f77_int );
+
+    // Pick the axpy symbol matching T; any other type is rejected.
+    const char* symbol_name = nullptr;
+    if      (typeid(T) == typeid(float))    symbol_name = "cblas_saxpy";
+    else if (typeid(T) == typeid(double))   symbol_name = "cblas_daxpy";
+    else if (typeid(T) == typeid(scomplex)) symbol_name = "cblas_caxpy";
+    else if (typeid(T) == typeid(dcomplex)) symbol_name = "cblas_zaxpy";
+    else
+    {
+        throw std::runtime_error("Error in ref_addv.cpp: Invalid typename is passed function template.");
+    }
+
+    // Resolve the chosen symbol through the reference library handle.
+    Fptr_ref_cblas_axpy ref_cblas_axpy =
+        (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), symbol_name);
+    if (!ref_cblas_axpy) {
+        throw std::runtime_error("Error in ref_addv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    T one;
+    testinghelpers::initone(one);
+
+    if( !chkconj( conj_x ) )
+    {
+        ref_cblas_axpy( n, one, x, incx, y, incy );
+        return;
+    }
+
+    // CBLAS axpy takes no conjugation flag: copy x, conjugate the copy with
+    // testinghelpers::conj, and hand the copy to axpy instead.
+    const auto buffer_len = testinghelpers::buff_dim(n, incx);
+    std::vector<T> X( buffer_len );
+    memcpy( X.data(), x, buffer_len*sizeof(T) );
+    testinghelpers::conj<T>( X.data(), n, incx );
+    ref_cblas_axpy( n, one, X.data(), incx, y, incy );
+}
+
+
+// Explicit template instantiations
+template void ref_addv<float>(char, gtint_t, const float*, gtint_t, float*, gtint_t);
+template void ref_addv<double>(char, gtint_t, const double*, gtint_t, double*, gtint_t);
+template void ref_addv<scomplex>(char, gtint_t, const scomplex*, gtint_t, scomplex*, gtint_t);
+template void ref_addv<dcomplex>(char, gtint_t, const dcomplex*, gtint_t, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp b/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp
new file mode 100644
index 0000000000..4ad4610eb9
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_amaxv.cpp
@@ -0,0 +1,85 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_amaxv.h"
+
+namespace testinghelpers {
+
+// Reference for amaxv: forwards to the reference library's cblas_i?amax
+// routine selected by T.
+//   n       : element count forwarded to the routine
+//   x,incx  : input vector and its increment
+//   returns : the value produced by the selected routine
+// Throws std::runtime_error for an unsupported T or when the symbol cannot
+// be resolved.
+template<typename T>
+gtint_t ref_amaxv( gtint_t n, const T* x, gtint_t incx ) {
+    // Function-pointer type used to call the resolved routine.
+    typedef gtint_t (*Fptr_ref_cblas_amaxv)( f77_int, const T *, f77_int );
+
+    // Pick the i?amax symbol matching T; any other type is rejected.
+    const char* symbol_name = nullptr;
+    if      (typeid(T) == typeid(float))    symbol_name = "cblas_isamax";
+    else if (typeid(T) == typeid(double))   symbol_name = "cblas_idamax";
+    else if (typeid(T) == typeid(scomplex)) symbol_name = "cblas_icamax";
+    else if (typeid(T) == typeid(dcomplex)) symbol_name = "cblas_izamax";
+    else
+    {
+        throw std::runtime_error("Error in ref_amaxv.cpp: Invalid typename is passed function template.");
+    }
+
+    // Resolve the chosen symbol through the reference library handle.
+    Fptr_ref_cblas_amaxv ref_cblas_amaxv =
+        (Fptr_ref_cblas_amaxv)dlsym(refCBLASModule.get(), symbol_name);
+    if (!ref_cblas_amaxv)
+    {
+        throw std::runtime_error("Error in ref_amaxv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // NOTE(review): the pointer type declares a gtint_t return; confirm this
+    // matches the index type the reference library's i?amax really returns.
+    const gtint_t max_index = ref_cblas_amaxv( n, x, incx );
+    return max_index;
+}
+
+
+// Explicit template instantiations
+template gtint_t ref_amaxv<float>(gtint_t, const float*, gtint_t);
+template gtint_t ref_amaxv<double>(gtint_t, const double*, gtint_t);
+template gtint_t ref_amaxv<scomplex>(gtint_t, const scomplex*, gtint_t);
+template gtint_t ref_amaxv<dcomplex>(gtint_t, const dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp
new file mode 100644
index 0000000000..2f6f64ec60
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_axpbyv.cpp
@@ -0,0 +1,176 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_axpbyv.h"
+
+namespace testinghelpers {
+
+#if !defined(REF_IS_OPENBLAS) && !defined(REF_IS_MKL)
+// Reference for axpbyv when the reference library is neither OpenBLAS nor
+// MKL: cblas_?axpby is not assumed to be available, so y is first scaled by
+// beta with cblas_?scal and alpha*x is then added with cblas_?axpy.
+// Throws std::runtime_error for an unsupported T or an unresolved symbol.
+template<typename T>
+void ref_axpbyv( char conj_x, gtint_t n, T alpha, const T* x,
+                    gtint_t incx, T beta, T* y, gtint_t incy )
+{
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_scal)( f77_int, scalar_t , const T *, f77_int);
+    Fptr_ref_cblas_scal ref_cblas_scal;
+
+    // Look up the cblas_?scal symbol matching T in the reference library.
+    if (typeid(T) == typeid(float))
+    {
+        ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_sscal");
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_dscal");
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_cscal");
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        ref_cblas_scal = (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), "cblas_zscal");
+    }
+    else
+    {
+        throw std::runtime_error("Error in ref_axpby.cpp: Invalid typename is passed function template.");
+    }
+    if (!ref_cblas_scal) {
+        throw std::runtime_error("Error in ref_axpby.cpp: Function pointer == 0 -- symbol not found.");
+    }
+    // Scale y by beta first; the axpy call further down then adds alpha*x.
+    ref_cblas_scal( n, beta, y, incy );
+    typedef void (*Fptr_ref_cblas_axpby)( f77_int, scalar_t , const T *, f77_int , T *, f77_int );
+    Fptr_ref_cblas_axpby ref_cblas_axpby;
+    // Look up the cblas_?axpy symbol matching T in the reference library.
+    if (typeid(T) == typeid(float))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_saxpy");
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_daxpy");
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_caxpy");
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_zaxpy");
+    }
+    else
+    {
+        throw std::runtime_error("Error in ref_axpby.cpp: Invalid typename is passed function template.");
+    }
+    if (!ref_cblas_axpby) {
+        throw std::runtime_error("Error in ref_axpby.cpp: Function pointer == 0 -- symbol not found.");
+    }
+#ifdef TEST_BLIS_TYPED
+    if( chkconj( conj_x ) )   // CBLAS has no conjugation flag: use a conjugated copy of x
+    {
+        std::vector<T> X( testinghelpers::buff_dim(n, incx) );
+        memcpy( X.data(), x, testinghelpers::buff_dim(n, incx)*sizeof(T) );
+        testinghelpers::conj<T>( X.data(), n, incx );
+        ref_cblas_axpby( n, alpha, X.data(), incx, y, incy );
+    }
+    else
+#endif
+    {
+        ref_cblas_axpby( n, alpha, x, incx, y, incy );
+    }
+}
+#else
+// Reference for axpbyv when the reference library is OpenBLAS or MKL: the
+// cblas_?axpby routine looked up in that library is called directly.
+// Throws std::runtime_error for an unsupported T or an unresolved symbol.
+template<typename T>
+void ref_axpbyv( char conj_x, gtint_t n, T alpha, const T* x,
+                    gtint_t incx, T beta, T* y, gtint_t incy ) {
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_axpby)( f77_int, scalar_t , const T *, f77_int , scalar_t, T *, f77_int );
+    Fptr_ref_cblas_axpby ref_cblas_axpby;
+    // Look up the cblas_?axpby symbol matching T in the reference library.
+    if (typeid(T) == typeid(float))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_saxpby");
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_daxpby");
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_caxpby");
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        ref_cblas_axpby = (Fptr_ref_cblas_axpby)dlsym(refCBLASModule.get(), "cblas_zaxpby");
+    }
+    else
+    {
+        throw std::runtime_error("Error in ref_axpby.cpp: Invalid typename is passed function template.");
+    }
+    if (!ref_cblas_axpby) {
+        throw std::runtime_error("Error in ref_axpby.cpp: Function pointer == 0 -- symbol not found.");
+    }
+#ifdef TEST_BLIS_TYPED
+    if( chkconj( conj_x ) )   // CBLAS has no conjugation flag: use a conjugated copy of x
+    {
+        std::vector<T> X( testinghelpers::buff_dim(n, incx) );
+        memcpy( X.data(), x, testinghelpers::buff_dim(n, incx)*sizeof(T) );
+        testinghelpers::conj<T>( X.data(), n, incx );
+        ref_cblas_axpby( n, alpha, X.data(), incx, beta, y, incy );
+    }
+    else
+#endif
+    {
+        ref_cblas_axpby( n, alpha, x, incx, beta, y, incy );
+    }
+}
+#endif
+
+// Explicit template instantiations
+template void ref_axpbyv<float>(char, gtint_t, float, const float*, gtint_t, float, float*, gtint_t);
+template void ref_axpbyv<double>(char, gtint_t, double, const double*, gtint_t, double, double*, gtint_t);
+template void ref_axpbyv<scomplex>(char, gtint_t, scomplex, const scomplex*, gtint_t, scomplex, scomplex*, gtint_t);
+template void ref_axpbyv<dcomplex>(char, gtint_t, dcomplex, const dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp
new file mode 100644
index 0000000000..c5541ca86a
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_axpyv.cpp
@@ -0,0 +1,96 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_axpyv.h"
+
+namespace testinghelpers {
+
+// Reference for axpyv: forwards to the reference library's cblas_?axpy selected by T.
+// Throws std::runtime_error for an unsupported T or an unresolved symbol.
+template<typename T>
+void ref_axpyv( char conj_x, gtint_t n, T alpha,
+                        const T* x, gtint_t incx, T* y, gtint_t incy ) {
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_axpy)( f77_int, scalar_t , const T *, f77_int , T *, f77_int );
+    Fptr_ref_cblas_axpy ref_cblas_axpy;
+
+    // Look up the cblas_?axpy symbol matching T in the reference library.
+    if (typeid(T) == typeid(float))
+    {
+        ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_saxpy");
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_daxpy");
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_caxpy");
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        ref_cblas_axpy = (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), "cblas_zaxpy");
+    }
+    else
+    {
+        throw std::runtime_error("Error in ref_axpy.cpp: Invalid typename is passed function template.");
+    }
+    if (!ref_cblas_axpy) {
+        throw std::runtime_error("Error in ref_axpy.cpp: Function pointer == 0 -- symbol not found.");
+    }
+#ifdef TEST_BLIS_TYPED
+    if( chkconj( conj_x ) )   // CBLAS has no conjugation flag: use a conjugated copy of x
+    {
+        std::vector<T> X( testinghelpers::buff_dim(n, incx) );
+        memcpy( X.data(), x, testinghelpers::buff_dim(n, incx)*sizeof(T) );
+        testinghelpers::conj<T>( X.data(), n, incx );
+        ref_cblas_axpy( n, alpha, X.data(), incx, y, incy );
+    }
+    else
+#endif
+    {
+        ref_cblas_axpy( n, alpha, x, incx, y, incy );
+    }
+}
+
+
+// Explicit template instantiations
+template void ref_axpyv<float>(char, gtint_t, float, const float*, gtint_t, float*, gtint_t);
+template void ref_axpyv<double>(char, gtint_t, double, const double*, gtint_t, double*, gtint_t);
+template void ref_axpyv<scomplex>(char, gtint_t, scomplex, const scomplex*, gtint_t, scomplex*, gtint_t);
+template void ref_axpyv<dcomplex>(char, gtint_t, dcomplex, const dcomplex*, gtint_t, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp
new file mode 100644
index 0000000000..90b70a2bab
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_copyv.cpp
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_copyv.h"
+
+namespace testinghelpers {
+
+// Reference for copyv: forwards to the reference library's cblas_?copy
+// routine selected by T.
+//   conj_x  : when chkconj( conj_x ) holds, a conjugated copy of xp is the source
+//   n       : element count forwarded to the routine
+//   xp,incx : source vector and its increment
+//   yp,incy : destination vector and its increment
+// Throws std::runtime_error for an unsupported T or when the symbol cannot
+// be resolved.
+template<typename T>
+void ref_copyv( char conj_x, gtint_t n, const T* xp, gtint_t incx,
+                                              T* yp, gtint_t incy ) {
+    // Function-pointer type used to call the resolved routine.
+    typedef void (*Fptr_ref_cblas_copyv)(f77_int, const T*, f77_int, T*, f77_int);
+
+    // Pick the copy symbol matching T; any other type is rejected.
+    const char* symbol_name = nullptr;
+    if      (typeid(T) == typeid(float))    symbol_name = "cblas_scopy";
+    else if (typeid(T) == typeid(double))   symbol_name = "cblas_dcopy";
+    else if (typeid(T) == typeid(scomplex)) symbol_name = "cblas_ccopy";
+    else if (typeid(T) == typeid(dcomplex)) symbol_name = "cblas_zcopy";
+    else
+    {
+        throw std::runtime_error("Error in ref_copyv.cpp: Invalid typename is passed function template.");
+    }
+
+    // Resolve the chosen symbol through the reference library handle.
+    Fptr_ref_cblas_copyv ref_cblas_copyv =
+        (Fptr_ref_cblas_copyv)dlsym(refCBLASModule.get(), symbol_name);
+    if (!ref_cblas_copyv)
+    {
+        throw std::runtime_error("Error in ref_copyv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    if( !chkconj( conj_x ) )
+    {
+        ref_cblas_copyv( n, xp, incx, yp, incy );
+        return;
+    }
+
+    // CBLAS copy takes no conjugation flag: duplicate xp, conjugate the
+    // duplicate with testinghelpers::conj, and copy from it instead.
+    const auto buffer_len = testinghelpers::buff_dim(n, incx);
+    std::vector<T> X( buffer_len );
+    memcpy( X.data(), xp, buffer_len*sizeof(T) );
+    testinghelpers::conj<T>( X.data(), n, incx );
+    ref_cblas_copyv( n, X.data(), incx, yp, incy );
+}
+
+// Explicit template instantiations
+template void ref_copyv<float>(char, gtint_t, const float*, gtint_t, float*, gtint_t);
+template void ref_copyv<double>(char, gtint_t, const double*, gtint_t, double*, gtint_t);
+template void ref_copyv<scomplex>(char, gtint_t, const scomplex*, gtint_t, scomplex*, gtint_t);
+template void ref_copyv<dcomplex>(char, gtint_t, const dcomplex*, gtint_t, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp b/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp
new file mode 100644
index 0000000000..d7a098a1bf
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_dotv.cpp
@@ -0,0 +1,124 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_dotv.h"
+
+namespace testinghelpers {
+
+// Reference for real-valued dotv: calls the reference library's cblas_?dot
+// routine selected by T and stores its return value in *rho.
+//   rho : receives the routine's return value
+// Throws std::runtime_error for an unsupported T or an unresolved symbol.
+template<typename T>
+void ref_dotv(gtint_t len, const T* xp,
+              gtint_t incx, const T* yp, gtint_t incy, T* rho) {
+    typedef T (*Fptr_ref_cblas_dot)(f77_int, const T*, f77_int, const T*, f77_int );
+
+    // This overload only handles float and double; other types are rejected.
+    const char* symbol_name = nullptr;
+    if      (typeid(T) == typeid(float))  symbol_name = "cblas_sdot";
+    else if (typeid(T) == typeid(double)) symbol_name = "cblas_ddot";
+    else
+    {
+        throw std::runtime_error("Error in ref_dot.cpp: Invalid typename is passed function template.");
+    }
+
+    // Resolve the chosen symbol through the reference library handle.
+    Fptr_ref_cblas_dot ref_cblas_dot =
+        (Fptr_ref_cblas_dot)dlsym(refCBLASModule.get(), symbol_name);
+    if ( !ref_cblas_dot )
+    {
+        throw std::runtime_error("Error in ref_dot.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    *rho = ref_cblas_dot( len, xp, incx, yp, incy );
+}
+
+// Reference for complex dotv: conjugates private copies of x and/or y as
+// requested and passes them to the reference library's cblas_?dotu_sub
+// routine selected by T, which receives rho as its output argument.
+//   conj_x, conj_y : conjugate the copy of x / y when chkconj() holds
+//   len            : element count forwarded to the routine
+//   xp,incx        : first input vector and its increment
+//   yp,incy        : second input vector and its increment
+// Throws std::runtime_error for an unsupported T or an unresolved symbol.
+template<typename T>
+void ref_dotv( char conj_x, char conj_y, gtint_t len, const T* xp, gtint_t incx,
+                                             const T* yp, gtint_t incy, T* rho ) {
+    typedef void (*Fptr_ref_cblas_dot)(f77_int, const T*, f77_int, const T*, f77_int, T* );
+
+    const bool    conjugate_x = chkconj( conj_x );
+    const bool    conjugate_y = chkconj( conj_y );
+    const gtint_t x_len       = buff_dim(len, incx);
+    const gtint_t y_len       = buff_dim(len, incy);
+
+    // Work on private copies; the routine below only ever sees these copies.
+    std::vector<T> X( x_len );
+    memcpy(X.data(), xp, x_len*sizeof(T));
+    std::vector<T> Y( y_len );
+    memcpy(Y.data(), yp, y_len*sizeof(T));
+
+    // Apply the requested conjugations to the copies.
+    if( conjugate_x ) conj<T>( X.data(), len, incx );
+    if( conjugate_y ) conj<T>( Y.data(), len, incy );
+
+    // This overload only handles scomplex and dcomplex; other types are rejected.
+    const char* symbol_name = nullptr;
+    if      (typeid(T) == typeid(scomplex)) symbol_name = "cblas_cdotu_sub";
+    else if (typeid(T) == typeid(dcomplex)) symbol_name = "cblas_zdotu_sub";
+    else
+    {
+        throw std::runtime_error("Error in ref_dot.cpp: Invalid typename is passed function template.");
+    }
+
+    // Resolve the chosen symbol through the reference library handle.
+    Fptr_ref_cblas_dot ref_cblas_dot =
+        (Fptr_ref_cblas_dot)dlsym(refCBLASModule.get(), symbol_name);
+    if (!ref_cblas_dot) {
+        throw std::runtime_error("Error in ref_dot.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    ref_cblas_dot( len, X.data(), incx, Y.data(), incy, rho );
+
+}
+
+// Explicit template instantiations
+template void ref_dotv<float>( gtint_t, const float*, gtint_t, const float*, gtint_t, float* );
+template void ref_dotv<double>( gtint_t, const double*, gtint_t, const double*, gtint_t,double* );
+template void ref_dotv<scomplex>(char, char, gtint_t, const scomplex*, gtint_t, const scomplex*, gtint_t, scomplex*);
+template void ref_dotv<dcomplex>(char, char, gtint_t, const dcomplex*, gtint_t, const dcomplex*, gtint_t, dcomplex*);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp b/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp
new file mode 100644
index 0000000000..b3cdf476ab
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_dotxv.cpp
@@ -0,0 +1,112 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_dotxv.h"
+
+namespace testinghelpers {
+
+// dotxv is not provided by BLAS, so the reference is computed locally:
+//   *rhorig := (*rhorig) * Beta + sum_i ( Alpha * conjx(x_i) ) * conjy(y_i)
+// where conjx/conjy conjugate only when chkconj() holds for the matching flag.
+//   len     : number of elements visited in x and y
+//   xp,incx : first input vector and its increment (read only)
+//   yp,incy : second input vector and its increment (read only)
+//   rhorig  : read as the initial value and overwritten with the result
+// NOTE(review): for len == 0 *rhorig is left unscaled by Beta; confirm this
+// matches the routine being tested.
+// NOTE(review): indices start at 0 and advance by incx/incy, so negative
+// increments look unsupported here -- confirm against callers.
+template<typename T>
+void ref_dotxv( char conj_x, char conj_y, gtint_t len, const T Alpha,
+    const T* xp, gtint_t incx, const T* yp, gtint_t incy, const T Beta,
+    T* rhorig )
+{
+    T ONE, ZERO;
+    initone(ONE);
+    initzero(ZERO);
+    const bool    conjugate_x = chkconj( conj_x );
+    const bool    conjugate_y = chkconj( conj_y );
+    const gtint_t x_len       = buff_dim(len, incx);
+    const gtint_t y_len       = buff_dim(len, incy);
+    T acc = *rhorig;
+
+    if (len != 0)
+    {
+        acc = acc * Beta;
+
+        // Work on private copies so the scaling below never touches xp/yp.
+        std::vector<T> X( x_len );
+        memcpy(X.data(), xp, x_len*sizeof(T));
+        std::vector<T> Y( y_len );
+        memcpy(Y.data(), yp, y_len*sizeof(T));
+
+        if( conjugate_x )
+        {
+            conj<T>( X.data(), len, incx );
+        }
+
+        // Fold Alpha into X; an Alpha equal to ZERO stores ZERO instead of multiplying.
+        if (Alpha != ONE)
+        {
+            const bool zero_alpha = (Alpha == ZERO);
+            for (gtint_t i = 0, ix = 0; i < len; ++i, ix += incx)
+            {
+                X[ix] = zero_alpha ? ZERO : Alpha * X[ix];
+            }
+        }
+
+        if( conjugate_y )
+        {
+            conj<T>( Y.data(), len, incy );
+        }
+
+        // Accumulate the element-wise products on top of the scaled rho.
+        for (gtint_t i = 0, ix = 0, iy = 0; i < len; ++i, ix += incx, iy += incy)
+        {
+            acc = acc + X[ix] * Y[iy];
+        }
+    }
+
+    *rhorig = acc;
+}
+
+// Explicit template instantiations
+template void ref_dotxv<float>(char, char, gtint_t, const float, const float*, gtint_t, const float*, gtint_t, const float, float*);
+template void ref_dotxv<double>(char, char, gtint_t, const double, const double*, gtint_t, const double*, gtint_t, const double, double*);
+template void ref_dotxv<scomplex>(char, char, gtint_t, const scomplex, const scomplex*, gtint_t, const scomplex*, gtint_t, const scomplex, scomplex*);
+template void ref_dotxv<dcomplex>(char, char, gtint_t, const dcomplex, const dcomplex*, gtint_t, const dcomplex*, gtint_t, const dcomplex, dcomplex*);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp b/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp
new file mode 100644
index 0000000000..8d9e59a86d
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_scal2v.cpp
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_scalv.h"
+
+namespace testinghelpers {
+
+// Reference implementation of scal2v: y := alpha * conjx(x).
+// The scaling itself is delegated to the cblas_?scal routine that is looked up in
+// refCBLASModule. It is applied to a scratch copy of x, so the caller's x is left
+// unchanged. Throws std::runtime_error for an unsupported T or a missing symbol.
+template<typename T>
+void ref_scal2v(char conjx, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+    // The scalar is passed as T& for complex types and as T for real types.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_scal)( f77_int, scalar_t , T *, f77_int);
+
+    // Select the routine name that corresponds to T; other types are rejected.
+    const char* scal_symbol = nullptr;
+    if (typeid(T) == typeid(float))
+        scal_symbol = "cblas_sscal";
+    else if (typeid(T) == typeid(double))
+        scal_symbol = "cblas_dscal";
+    else if (typeid(T) == typeid(scomplex))
+        scal_symbol = "cblas_cscal";
+    else if (typeid(T) == typeid(dcomplex))
+        scal_symbol = "cblas_zscal";
+    else
+        throw std::runtime_error("Error in ref_scal2v.cpp: Invalid typename is passed function template.");
+
+    // Look the routine up in the loaded reference library.
+    Fptr_ref_cblas_scal ref_cblas_scal =
+        (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), scal_symbol);
+    if (ref_cblas_scal == nullptr)
+        throw std::runtime_error("Error in ref_scal2v.cpp: Function pointer == 0 -- symbol not found.");
+
+    // Scratch copy of x (conjugated on request) that receives the scaling.
+    const auto scratch_len = testinghelpers::buff_dim(n, incx);
+    std::vector<T> scratch( scratch_len );
+    memcpy( scratch.data(), x, scratch_len*sizeof(T) );
+    if( chkconj( conjx ) )
+        testinghelpers::conj<T>( scratch.data(), n, incx );
+    ref_cblas_scal( n, alpha, scratch.data(), incx );
+
+    // Copy element k of the scaled vector into y. For a non-positive increment,
+    // element k is located at offset -(n - 1 - k) * inc.
+    for (gtint_t k = 0; k < n; ++k)
+    {
+        const gtint_t src = (incx > 0) ? (k * incx) : ( - ( n - k - 1 ) * incx );
+        const gtint_t dst = (incy > 0) ? (k * incy) : ( - ( n - k - 1 ) * incy );
+        y[dst] = scratch[src];
+    }
+}
+
+// Explicit template instantiations
+template void ref_scal2v<float>(char, gtint_t, float, float*, gtint_t, float*, gtint_t);
+template void ref_scal2v<double>(char, gtint_t, double, double*, gtint_t, double*, gtint_t);
+template void ref_scal2v<scomplex>(char, gtint_t, scomplex, scomplex*, gtint_t, scomplex*, gtint_t);
+template void ref_scal2v<dcomplex>(char, gtint_t, dcomplex, dcomplex*, gtint_t, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp b/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp
new file mode 100644
index 0000000000..c4ad195c36
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_scalv.cpp
@@ -0,0 +1,94 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_scalv.h"
+
+namespace testinghelpers {
+
+// Reference implementation of scalv: scales x in place, x := alpha * x, by calling the
+// cblas_?scal routine that is looked up in refCBLASModule.
+//   conjalpha : when TEST_BLIS_TYPED is defined and this requests conjugation,
+//               conj(alpha) is used as the scalar; otherwise it is ignored
+//   n, incx   : element count and increment of x, forwarded to cblas_?scal
+//   alpha     : scaling factor
+//   x         : vector that is scaled in place
+// Throws std::runtime_error for an unsupported T or a missing symbol.
+template<typename T>
+void ref_scalv(char conjalpha, gtint_t n, T alpha, T* x, gtint_t incx)
+{
+    // The scalar is passed as T& for complex types and as T for real types.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_scal)( f77_int, scalar_t , T *, f77_int);
+
+    // Select the routine name that corresponds to T; other types are rejected.
+    const char* scal_symbol = nullptr;
+    if (typeid(T) == typeid(float))
+        scal_symbol = "cblas_sscal";
+    else if (typeid(T) == typeid(double))
+        scal_symbol = "cblas_dscal";
+    else if (typeid(T) == typeid(scomplex))
+        scal_symbol = "cblas_cscal";
+    else if (typeid(T) == typeid(dcomplex))
+        scal_symbol = "cblas_zscal";
+    else
+        throw std::runtime_error("Error in ref_scalv.cpp: Invalid typename is passed function template.");
+
+    // Look the routine up in the loaded reference library.
+    Fptr_ref_cblas_scal ref_cblas_scal =
+        (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), scal_symbol);
+    if (ref_cblas_scal == nullptr)
+        throw std::runtime_error("Error in ref_scalv.cpp: Function pointer == 0 -- symbol not found.");
+
+    // Effective scalar: alpha itself, or conj(alpha) when conjugation is both
+    // compiled in and requested.
+    T scale = alpha;
+#ifdef TEST_BLIS_TYPED
+    if( chkconj( conjalpha ) )
+    {
+        scale = testinghelpers::conj<T>( alpha );
+    }
+#endif
+
+    ref_cblas_scal( n, scale, x, incx );
+}
+
+// Explicit template instantiations
+template void ref_scalv<float>(char, gtint_t, float, float*, gtint_t);
+template void ref_scalv<double>(char, gtint_t, double, double*, gtint_t);
+template void ref_scalv<scomplex>(char, gtint_t, scomplex, scomplex*, gtint_t);
+template void ref_scalv<dcomplex>(char, gtint_t, dcomplex, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_subv.cpp b/gtestsuite/testinghelpers/src/level1/ref_subv.cpp
new file mode 100644
index 0000000000..40ddb3e02c
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_subv.cpp
@@ -0,0 +1,76 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "level1/ref_subv.h"
+
+namespace testinghelpers {
+
+// Since subv is not supported by BLAS/CBLAS, we have a local reference implementation.
+// Computes y := y - conjx(x) over n strided elements; x is only read. A negative
+// increment follows the BLAS convention: element i is at offset (n - 1 - i) * |inc|.
+template<typename T>
+void ref_subv( char conj_x, gtint_t n, const T* xp, gtint_t incx,
+                                             T* y, gtint_t incy ) {
+    // An empty (or negative) length leaves y untouched.
+    if (n <= 0) {
+        return;
+    }
+
+    // Private copy of x so that the conjugation never touches the caller's data.
+    gtint_t svx = buff_dim(n, incx);
+    std::vector<T> X( svx );
+    memcpy(X.data(), xp, svx*sizeof(T));
+    if( chkconj( conj_x ) ) {
+        conj<T>( X.data(), n, incx );
+    }
+
+    // Start at offset 0 for a non-negative increment and at (n - 1) * |inc| for a
+    // negative one, so that stepping by inc never yields a negative index.
+    gtint_t ix = (incx < 0) ? (1 - n) * incx : 0;
+    gtint_t iy = (incy < 0) ? (1 - n) * incy : 0;
+    for(gtint_t i = 0 ; i < n ; i++) {
+        y[iy] = y[iy] - X[ix];
+        ix    = ix + incx;
+        iy    = iy + incy;
+    }
+}
+
+// Explicit template instantiations
+template void ref_subv<float>(char, gtint_t, const float*, gtint_t, float*, gtint_t);
+template void ref_subv<double>(char, gtint_t, const double*, gtint_t, double*, gtint_t);
+template void ref_subv<scomplex>(char, gtint_t, const scomplex*, gtint_t, scomplex*, gtint_t);
+template void ref_subv<dcomplex>(char, gtint_t, const dcomplex*, gtint_t, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp b/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp
new file mode 100644
index 0000000000..27773a08cb
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level1/ref_xpbyv.cpp
@@ -0,0 +1,128 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level1/ref_xpbyv.h"
+
+namespace testinghelpers {
+
+// Reference implementation of xpbyv, y := beta * y + conjx(x), built from two routines
+// looked up in refCBLASModule: cblas_?scal (y := beta * y) followed by cblas_?axpy
+// (y := y + x, called with the scalar initialised by initone<T>).
+//   conj_x  : when TEST_BLIS_TYPED is defined and this requests conjugation, x is
+//             conjugated on a private copy first; otherwise it is ignored
+//   n       : element count forwarded to both routines
+//   x, incx : input vector and its increment (x is never modified)
+//   beta    : scalar applied to y
+//   y, incy : vector updated in place and its increment
+// Throws std::runtime_error for an unsupported T or a missing symbol.
+template<typename T>
+void ref_xpbyv( char conj_x, gtint_t n, const T* x,
+                    gtint_t incx, T beta, T* y, gtint_t incy )
+{
+    // The scalar is passed as T& for complex types and as T for real types.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_scal)( f77_int, scalar_t , T *, f77_int);
+    typedef void (*Fptr_ref_cblas_axpy)( f77_int, scalar_t , const T *, f77_int , T *, f77_int );
+
+    // Select both routine names for T up front; other types are rejected.
+    const char* scal_symbol = nullptr;
+    const char* axpy_symbol = nullptr;
+    if (typeid(T) == typeid(float))
+    {
+        scal_symbol = "cblas_sscal";
+        axpy_symbol = "cblas_saxpy";
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        scal_symbol = "cblas_dscal";
+        axpy_symbol = "cblas_daxpy";
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        scal_symbol = "cblas_cscal";
+        axpy_symbol = "cblas_caxpy";
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        scal_symbol = "cblas_zscal";
+        axpy_symbol = "cblas_zaxpy";
+    }
+    else
+    {
+        throw std::runtime_error("Error in ref_xpbyv.cpp: Invalid typename is passed function template.");
+    }
+
+    // Resolve both routines before y is touched, so that a missing symbol cannot
+    // leave y scaled but not yet updated with x.
+    Fptr_ref_cblas_scal ref_cblas_scal =
+        (Fptr_ref_cblas_scal)dlsym(refCBLASModule.get(), scal_symbol);
+    Fptr_ref_cblas_axpy ref_cblas_axpy =
+        (Fptr_ref_cblas_axpy)dlsym(refCBLASModule.get(), axpy_symbol);
+    if (!ref_cblas_scal || !ref_cblas_axpy) {
+        throw std::runtime_error("Error in ref_xpbyv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // Step 1: y := beta * y.
+    ref_cblas_scal( n, beta, y, incy );
+
+    // Step 2: y := y + conjx(x), expressed as an axpy call whose scalar is the
+    // value set up by initone<T>.
+    T one;
+    initone<T>(one);
+#ifdef TEST_BLIS_TYPED
+    if( chkconj( conj_x ) )
+    {
+        // Conjugate a private copy so that the caller's x stays untouched.
+        const auto x_len = testinghelpers::buff_dim(n, incx);
+        std::vector<T> X( x_len );
+        memcpy( X.data(), x, x_len*sizeof(T) );
+        testinghelpers::conj<T>( X.data(), n, incx );
+        ref_cblas_axpy( n, one, X.data(), incx, y, incy );
+    }
+    else
+#endif
+    {
+        ref_cblas_axpy( n, one, x, incx, y, incy );
+    }
+}
+
+// Explicit template instantiations
+template void ref_xpbyv<float>(char, gtint_t, const float*, gtint_t, float, float*, gtint_t);
+template void ref_xpbyv<double>(char, gtint_t, const double*, gtint_t, double, double*, gtint_t);
+template void ref_xpbyv<scomplex>(char, gtint_t, const scomplex*, gtint_t, scomplex, scomplex*, gtint_t);
+template void ref_xpbyv<dcomplex>(char, gtint_t, const dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t);
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp b/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp
new file mode 100644
index 0000000000..99168283f5
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_gemv.cpp
@@ -0,0 +1,113 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_gemv.h"
+
+/*
+ * ==========================================================================
+ * GEMV performs one of the matrix-vector operations
+ *    y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+ *    y := alpha*A**H*x + beta*y,
+ * ==========================================================================
+*/
+
+namespace testinghelpers {
+
+// Reference implementation of gemv, y := alpha * op(A) * conjx(x) + beta * y, using the
+// cblas_?gemv routine that is looked up in refCBLASModule. CBLAS gemv has no conjx
+// option, so xp is conjugated in place for the duration of the call and restored
+// afterwards; the caller's x is therefore unchanged on return.
+// Throws std::runtime_error for an unsupported T or a missing symbol.
+template <typename T>
+void ref_gemv( char storage, char trans, char conjx, gtint_t m, gtint_t n, T alpha,
+    T *ap, gtint_t lda, T *xp, gtint_t incx, T beta, T *yp, gtint_t incy )
+{
+    // x has n elements when A is not transposed and m elements otherwise.
+    gtint_t lenx = chknotrans( trans ) ? n : m ;
+    bool cfx = chkconj( conjx );
+
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_TRANSPOSE cblas_trans;
+
+    char_to_cblas_order( storage, &cblas_order );
+    char_to_cblas_trans( trans, &cblas_trans );
+
+    // The scalars are passed as T& for complex types and as T for real types.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_gemv)( const CBLAS_ORDER, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const T*, f77_int, const scalar_t, T*, f77_int);
+    Fptr_ref_cblas_gemv ref_cblas_gemv;
+
+    // Look up the routine that corresponds to T; other types are rejected.
+    if (typeid(T) == typeid(float))
+        ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_sgemv");
+    else if (typeid(T) == typeid(double))
+        ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_dgemv");
+    else if (typeid(T) == typeid(scomplex))
+        ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_cgemv");
+    else if (typeid(T) == typeid(dcomplex))
+        ref_cblas_gemv = (Fptr_ref_cblas_gemv)dlsym(refCBLASModule.get(), "cblas_zgemv");
+    else
+        throw std::runtime_error("Error in ref_gemv.cpp: Invalid typename is passed function template.");
+    if (!ref_cblas_gemv) {
+        throw std::runtime_error("Error in ref_gemv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // Conjugate x only after every check that can throw has passed, so that an
+    // exception never leaves the caller's x conjugated.
+    if( cfx ) {
+        conj<T>( xp, lenx, incx );
+    }
+    ref_cblas_gemv( cblas_order, cblas_trans, m, n, alpha, ap, lda,
+                                                    xp, incx, beta, yp, incy );
+    // Undo the conjugation (it is its own inverse) so the caller's x is unchanged.
+    if( cfx ) {
+        conj<T>( xp, lenx, incx );
+    }
+}
+
+// Explicit template instantiations
+template void ref_gemv<float>( char , char , char , gtint_t , gtint_t , float ,
+             float *, gtint_t , float *, gtint_t , float , float *, gtint_t );
+template void ref_gemv<double>( char , char , char , gtint_t , gtint_t , double ,
+             double *, gtint_t , double *, gtint_t , double , double *, gtint_t );
+template void ref_gemv<scomplex>( char , char , char , gtint_t , gtint_t , scomplex ,
+             scomplex *, gtint_t , scomplex *, gtint_t , scomplex , scomplex *, gtint_t );
+template void ref_gemv<dcomplex>( char , char , char , gtint_t , gtint_t , dcomplex ,
+             dcomplex *, gtint_t , dcomplex *, gtint_t , dcomplex , dcomplex *, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_ger.cpp b/gtestsuite/testinghelpers/src/level2/ref_ger.cpp
new file mode 100644
index 0000000000..ade3ee35e1
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_ger.cpp
@@ -0,0 +1,113 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_ger.h"
+
+/*
+ * ==========================================================================
+ * GER performs the rank 1 operation
+ *    A := alpha*x*y**T + A   (or alpha*x*y**H + A when conjy selects conjugation),
+ * where alpha is a scalar, x is an m element vector, y is an n element
+ * vector and A is an m by n matrix.
+ * ==========================================================================
+*/
+
+namespace testinghelpers {
+
+// Reference implementation of ger, A := alpha * conjx(x) * conjy(y)**T + A, using the
+// routine looked up in refCBLASModule. conjy selects cblas_?gerc over cblas_?geru for
+// complex T; conjx is applied to a private copy of x. Throws std::runtime_error on failure.
+template <typename T>
+void ref_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n,
+    T alpha, T *xp, gtint_t incx, T *yp, gtint_t incy, T *ap, gtint_t lda )
+{
+    bool cfy = chkconj( conjy );
+
+    enum CBLAS_ORDER cblas_order;
+    char_to_cblas_order( storage, &cblas_order );
+
+    // Private copy of x, conjugated on request; the caller's xp is never modified.
+    std::vector<T> X( buff_dim(m, incx) );
+    memcpy(X.data(), xp, (buff_dim(m, incx)*sizeof(T)));
+    if( chkconj( conjx ) ) {
+        conj<T>( X.data(), m, incx );
+    }
+
+    // The scalar is passed as T& for complex types and as T for real types.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_ger)( const CBLAS_ORDER, const f77_int, const f77_int,
+                     const scalar_t, const T*, f77_int,  const T*, f77_int, T*, f77_int );
+    Fptr_ref_cblas_ger ref_cblas_ger;
+
+    // Look up the routine that corresponds to T (and to conjy for complex types).
+    if (typeid(T) == typeid(float))
+    {
+        ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_sger");
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), "cblas_dger");
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), cfy ? "cblas_cgerc" : "cblas_cgeru");
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        ref_cblas_ger = (Fptr_ref_cblas_ger)dlsym(refCBLASModule.get(), cfy ? "cblas_zgerc" : "cblas_zgeru");
+    }
+    else
+    {
+      throw std::runtime_error("Error in ref_ger.cpp: Invalid typename is passed function template.");
+    }
+    if (!ref_cblas_ger) {
+        throw std::runtime_error("Error in ref_ger.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    ref_cblas_ger( cblas_order, m, n, alpha, X.data(), incx, yp, incy, ap, lda );
+}
+
+// Explicit template instantiations
+template void ref_ger<float>( char, char, char, gtint_t, gtint_t,
+              float, float *, gtint_t, float *, gtint_t, float *, gtint_t );
+template void ref_ger<double>( char, char, char, gtint_t, gtint_t,
+              double, double *, gtint_t, double *, gtint_t, double *, gtint_t );
+template void ref_ger<scomplex>( char, char, char, gtint_t, gtint_t,
+              scomplex, scomplex *, gtint_t, scomplex *, gtint_t, scomplex *, gtint_t );
+template void ref_ger<dcomplex>( char, char, char, gtint_t, gtint_t,
+              dcomplex, dcomplex *, gtint_t, dcomplex *, gtint_t, dcomplex *, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp b/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp
new file mode 100644
index 0000000000..93571be74f
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_hemv.cpp
@@ -0,0 +1,95 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_hemv.h"
+
+/*
+ * ==========================================================================
+ * HEMV performs the matrix-vector operation
+ *    y := alpha*A*x + beta*y
+ * where alpha and beta are scalars, x and y are n element vectors and
+ * A is an n by n hermitian matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference implementation of hemv (y := alpha*A*x + beta*y with A hermitian), which
+// forwards to the cblas_chemv / cblas_zhemv routine looked up in refCBLASModule.
+//   storage, uploa : translated to the CBLAS order and uplo enums
+//   conja, conjx   : accepted but not used by this implementation
+//   alpha, beta    : passed to the routine by pointer
+//   remaining args : forwarded unchanged
+// Throws std::runtime_error for an unsupported T or a missing symbol.
+template <typename T>
+void ref_hemv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T* alpha, T *ap, gtint_t lda, T *xp, gtint_t incx, T* beta,
+    T *yp, gtint_t incy )
+{
+    typedef void (*Fptr_ref_cblas_hemv)( const CBLAS_ORDER, const CBLAS_UPLO, const f77_int,
+                         const T*, const T*, f77_int, const T*, f77_int, const T*, T*, f77_int);
+
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER order_flag;
+    enum CBLAS_UPLO uplo_flag;
+    char_to_cblas_order( storage, &order_flag );
+    char_to_cblas_uplo( uploa, &uplo_flag );
+
+    // Only scomplex and dcomplex are supported; other types are rejected.
+    const char* hemv_symbol = nullptr;
+    if (typeid(T) == typeid(scomplex))
+        hemv_symbol = "cblas_chemv";
+    else if (typeid(T) == typeid(dcomplex))
+        hemv_symbol = "cblas_zhemv";
+    else
+      throw std::runtime_error("Error in ref_hemv.cpp: Invalid typename is passed function template.");
+
+    // Look the routine up in the loaded reference library.
+    Fptr_ref_cblas_hemv hemv_fn =
+        (Fptr_ref_cblas_hemv)dlsym(refCBLASModule.get(), hemv_symbol);
+    if (hemv_fn == nullptr)
+        throw std::runtime_error("Error in ref_hemv.cpp: Function pointer == 0 -- symbol not found.");
+
+    hemv_fn( order_flag, uplo_flag, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+}
+
+// Explicit template instantiations
+template void ref_hemv<scomplex>( char, char, char, char, gtint_t, scomplex *,
+              scomplex *, gtint_t, scomplex *, gtint_t, scomplex *, scomplex *, gtint_t );
+template void ref_hemv<dcomplex>( char, char, char, char, gtint_t, dcomplex *,
+              dcomplex *, gtint_t, dcomplex *, gtint_t, dcomplex *, dcomplex *, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_her.cpp b/gtestsuite/testinghelpers/src/level2/ref_her.cpp
new file mode 100644
index 0000000000..3be456e7bc
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_her.cpp
@@ -0,0 +1,92 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_her.h"
+
+/*
+ * ==========================================================================
+ * HER performs the hermitian rank 1 operation
+ *    A := alpha*x*x**H + A
+ *  where alpha is a real scalar, x is an n element vector and A is an
+ *  n by n hermitian matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference HER: forwards to cblas_cher / cblas_zher, looked up with dlsym()
+// through refCBLASModule. conjx is accepted but never read here.
+template <typename T, typename Tr>
+void ref_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha,
+                             T *xp, gtint_t incx, T *ap, gtint_t lda )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using her_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const f77_int,
+                               const Tr, const T*, f77_int, T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(scomplex))
+        routine_name = "cblas_cher";
+    else if (typeid(T) == typeid(dcomplex))
+        routine_name = "cblas_zher";
+    else
+        throw std::runtime_error("Error in ref_her.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    her_fn_t her_impl = (her_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (her_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_her.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // alpha is forwarded by value, matching the Tr slot in the alias above.
+    her_impl( cb_layout, cb_uplo, n, alpha, xp, incx, ap, lda );
+}
+
+// Explicit template instantiations
+template void ref_her<scomplex, float>( char, char, char, gtint_t, float,
+                                        scomplex*, gtint_t, scomplex*, gtint_t );
+template void ref_her<dcomplex, double>( char, char, char, gtint_t, double,
+                                         dcomplex*, gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_her2.cpp b/gtestsuite/testinghelpers/src/level2/ref_her2.cpp
new file mode 100644
index 0000000000..266909547e
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_her2.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_her2.h"
+
+/*
+ * ==========================================================================
+ * HER2  performs the hermitian rank 2 operation
+ *    A := alpha*x*y**H + conjg( alpha )*y*x**H + A,
+ * where alpha is a scalar, x and y are n element vectors and A is an n
+ * by n hermitian matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference HER2: forwards to cblas_cher2 / cblas_zher2, looked up with dlsym()
+// through refCBLASModule. conjx and conjy are accepted but never read here.
+template <typename T>
+void ref_her2( char storage, char uploa, char conjx, char conjy, gtint_t n,
+    T* alpha, T *xp, gtint_t incx, T *yp, gtint_t incy, T *ap, gtint_t lda )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using her2_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const f77_int,
+                                const T*, const T*, f77_int, const T*, f77_int,
+                                T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(scomplex))
+        routine_name = "cblas_cher2";
+    else if (typeid(T) == typeid(dcomplex))
+        routine_name = "cblas_zher2";
+    else
+        throw std::runtime_error("Error in ref_her2.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    her2_fn_t her2_impl = (her2_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (her2_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_her2.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // alpha is forwarded as a pointer, matching the alias above.
+    her2_impl( cb_layout, cb_uplo, n, alpha, xp, incx, yp, incy, ap, lda );
+}
+
+// Explicit template instantiations
+template void ref_her2<scomplex>( char, char, char, char, gtint_t, scomplex*,
+                                  scomplex*, gtint_t, scomplex*, gtint_t, scomplex*, gtint_t );
+template void ref_her2<dcomplex>( char, char, char, char, gtint_t, dcomplex*,
+                                  dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_symv.cpp b/gtestsuite/testinghelpers/src/level2/ref_symv.cpp
new file mode 100644
index 0000000000..5c4afb668f
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_symv.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_symv.h"
+
+/*
+ * ==========================================================================
+ * SYMV performs the matrix-vector  operation
+ *    y := alpha*A*x + beta*y
+ * where alpha and beta are scalars, x and y are n element vectors and
+ * A is an n by n symmetric matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference SYMV: forwards to cblas_ssymv / cblas_dsymv, looked up with dlsym()
+// through refCBLASModule. conja and conjx are accepted but never read here.
+template <typename T>
+void ref_symv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx, T *beta,
+    T *yp, gtint_t incy )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using symv_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const f77_int,
+        const T, const T*, f77_int, const T*, f77_int, const T, T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(float))
+        routine_name = "cblas_ssymv";
+    else if (typeid(T) == typeid(double))
+        routine_name = "cblas_dsymv";
+    else
+        throw std::runtime_error("Error in ref_symv.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    symv_fn_t symv_impl = (symv_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (symv_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_symv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // The scalars arrive by pointer but are handed over by value.
+    symv_impl( cb_layout, cb_uplo, n, *alpha, ap, lda, xp, incx, *beta, yp, incy );
+}
+
+// Explicit template instantiations
+template void ref_symv<float>( char, char, char, char, gtint_t, float*,
+                               float*, gtint_t, float*, gtint_t, float*, float*, gtint_t );
+template void ref_symv<double>( char, char, char, char, gtint_t, double*,
+                                double*, gtint_t, double*, gtint_t, double*, double*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_syr.cpp b/gtestsuite/testinghelpers/src/level2/ref_syr.cpp
new file mode 100644
index 0000000000..b9d0f69103
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_syr.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_syr.h"
+
+/*
+ * ==========================================================================
+ * SYR performs the symmetric rank 1 operation
+ *    A := alpha*x*x**T + A,
+ *  where alpha is a real scalar, x is an n element vector and A is an
+ *  n by n symmetric matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference SYR: forwards to cblas_ssyr / cblas_dsyr, looked up with dlsym()
+// through refCBLASModule. conjx is accepted but never read here.
+// T must be float or double; any other type makes the dispatch throw.
+template <typename T>
+void ref_syr( char storage, char uploa, char conjx, gtint_t n, T alpha,
+                             T *xp, gtint_t incx, T *ap, gtint_t lda )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using syr_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const f77_int,
+                               const T, const T*, f77_int, T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(float))
+        routine_name = "cblas_ssyr";
+    else if (typeid(T) == typeid(double))
+        routine_name = "cblas_dsyr";
+    else
+        throw std::runtime_error("Error in ref_syr.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    syr_fn_t syr_impl = (syr_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (syr_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_syr.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // alpha is forwarded by value, matching the alias above.
+    syr_impl( cb_layout, cb_uplo, n, alpha, xp, incx, ap, lda );
+}
+
+// Explicit template instantiations
+template void ref_syr<float>( char, char, char, gtint_t, float,
+                              float*, gtint_t, float*, gtint_t );
+template void ref_syr<double>( char, char, char, gtint_t, double,
+                               double*, gtint_t, double*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp b/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp
new file mode 100644
index 0000000000..2fdc09362c
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_syr2.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_syr2.h"
+
+/*
+ * ==========================================================================
+ * SYR2  performs the symmetric rank 2 operation
+ *    A := alpha*x*y**T + alpha*y*x**T + A,
+ * where alpha is a scalar, x and y are n element vectors and A is an n
+ * by n symmetric matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference SYR2: forwards to cblas_ssyr2 / cblas_dsyr2, looked up with dlsym()
+// through refCBLASModule. conjx and conjy are accepted but never read here.
+template <typename T>
+void ref_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n,
+   T alpha, T *xp, gtint_t incx, T *yp, gtint_t incy, T *ap, gtint_t lda )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using syr2_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const f77_int,
+                                const T, const T*, f77_int, const T*, f77_int,
+                                T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(float))
+        routine_name = "cblas_ssyr2";
+    else if (typeid(T) == typeid(double))
+        routine_name = "cblas_dsyr2";
+    else
+        throw std::runtime_error("Error in ref_syr2.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    syr2_fn_t syr2_impl = (syr2_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (syr2_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_syr2.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // alpha is forwarded by value, matching the alias above.
+    syr2_impl( cb_layout, cb_uplo, n, alpha, xp, incx, yp, incy, ap, lda );
+}
+
+// Explicit template instantiations
+template void ref_syr2<float>( char, char, char, char, gtint_t, float,
+                               float*, gtint_t, float*, gtint_t, float*, gtint_t );
+template void ref_syr2<double>( char, char, char, char, gtint_t, double,
+                                double*, gtint_t, double*, gtint_t, double*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp b/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp
new file mode 100644
index 0000000000..62beea0520
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_trmv.cpp
@@ -0,0 +1,112 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_trmv.h"
+
+/*
+ * ==========================================================================
+ * TRMV  performs one of the matrix-vector operations
+ *    x := alpha * transa(A) * x
+ * where x is an n element vector and  A is an n by n unit, or non-unit,
+ * upper or lower triangular matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference TRMV: forwards to cblas_{s,d,c,z}trmv, looked up with dlsym()
+// through refCBLASModule. x is passed through alphax before the CBLAS call,
+// and that step is not undone if the symbol lookup later throws.
+// T must be float, double, scomplex or dcomplex.
+template <typename T>
+void ref_trmv( char storage, char uploa, char transa, char diaga,
+    gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    enum CBLAS_TRANSPOSE cb_trans;
+    enum CBLAS_DIAG cb_diag;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+    char_to_cblas_trans( transa, &cb_trans );
+    char_to_cblas_diag( diaga, &cb_diag );
+
+    // The CBLAS signature below carries no alpha, so alpha is applied to x
+    // beforehand via alphax (presumably an in-place scaling of x -- confirm).
+    alphax<T>( n, *alpha, xp, incx );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using trmv_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO,
+                                const CBLAS_TRANSPOSE, CBLAS_DIAG,
+                                f77_int, const T*, f77_int, T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(float))
+        routine_name = "cblas_strmv";
+    else if (typeid(T) == typeid(double))
+        routine_name = "cblas_dtrmv";
+    else if (typeid(T) == typeid(scomplex))
+        routine_name = "cblas_ctrmv";
+    else if (typeid(T) == typeid(dcomplex))
+        routine_name = "cblas_ztrmv";
+    else
+        throw std::runtime_error("Error in ref_trmv.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    // Note that x has already been touched by alphax at this point.
+    trmv_fn_t trmv_impl = (trmv_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (trmv_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_trmv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // x is both input and output of the CBLAS call.
+    trmv_impl( cb_layout, cb_uplo, cb_trans, cb_diag, n, ap, lda, xp, incx );
+}
+
+// Explicit template instantiations
+template void ref_trmv<float>( char, char, char, char, gtint_t,
+                               float*, float*, gtint_t, float*, gtint_t );
+template void ref_trmv<double>( char, char, char, char, gtint_t,
+                                double*, double*, gtint_t, double*, gtint_t );
+template void ref_trmv<scomplex>( char, char, char, char, gtint_t,
+                                  scomplex*, scomplex*, gtint_t, scomplex*, gtint_t );
+template void ref_trmv<dcomplex>( char, char, char, char, gtint_t,
+                                  dcomplex*, dcomplex*, gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp b/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp
new file mode 100644
index 0000000000..455ed8455d
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level2/ref_trsv.cpp
@@ -0,0 +1,111 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level2/ref_trsv.h"
+
+/*
+ * ==========================================================================
+ * TRSV Solves a triangular system of equations with a single value for the
+ *        right side
+ *    b := alpha * inv(transa(A)) * x_orig
+ * where b and x are n element vectors and A is an n by n unit, or non-unit,
+ * upper or lower triangular matrix.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+
+// Reference TRSV: forwards to cblas_{s,d,c,z}trsv, looked up with dlsym()
+// through refCBLASModule. x is passed through alphax before the CBLAS call.
+// T must be float, double, scomplex or dcomplex.
+template <typename T>
+void ref_trsv( char storage, char uploa, char transa, char diaga,
+    gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER cb_layout;
+    enum CBLAS_UPLO cb_uplo;
+    enum CBLAS_TRANSPOSE cb_trans;
+    enum CBLAS_DIAG cb_diag;
+    char_to_cblas_order( storage, &cb_layout );
+    char_to_cblas_uplo( uploa, &cb_uplo );
+    char_to_cblas_trans( transa, &cb_trans );
+    char_to_cblas_diag( diaga, &cb_diag );
+
+    // The CBLAS signature below carries no alpha, so alpha is applied to x
+    // beforehand via alphax (presumably an in-place scaling of x -- confirm).
+    alphax<T>( n, *alpha, xp, incx );
+
+    // Signature shared by the CBLAS entry points looked up below.
+    using trsv_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO,
+                                const CBLAS_TRANSPOSE, CBLAS_DIAG,
+                                f77_int, const T*, f77_int, T*, f77_int );
+
+    // Choose the symbol that matches T; any other type is rejected.
+    const char* routine_name;
+    if (typeid(T) == typeid(float))
+        routine_name = "cblas_strsv";
+    else if (typeid(T) == typeid(double))
+        routine_name = "cblas_dtrsv";
+    else if (typeid(T) == typeid(scomplex))
+        routine_name = "cblas_ctrsv";
+    else if (typeid(T) == typeid(dcomplex))
+        routine_name = "cblas_ztrsv";
+    else
+        throw std::runtime_error("Error in ref_trsv.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through refCBLASModule; a missing symbol is fatal.
+    trsv_fn_t trsv_impl = (trsv_fn_t)dlsym(refCBLASModule.get(), routine_name);
+    if (trsv_impl == nullptr)
+    {
+        throw std::runtime_error("Error in ref_trsv.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // x is both input and output of the CBLAS call.
+    trsv_impl( cb_layout, cb_uplo, cb_trans, cb_diag, n, ap, lda, xp, incx );
+}
+
+// Explicit template instantiations
+template void ref_trsv<float>( char, char, char, char, gtint_t,
+                               float*, float*, gtint_t, float*, gtint_t );
+template void ref_trsv<double>( char, char, char, char, gtint_t,
+                                double*, double*, gtint_t, double*, gtint_t );
+template void ref_trsv<scomplex>( char, char, char, char, gtint_t,
+                                  scomplex*, scomplex*, gtint_t, scomplex*, gtint_t );
+template void ref_trsv<dcomplex>( char, char, char, char, gtint_t,
+                                  dcomplex*, dcomplex*, gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp
new file mode 100644
index 0000000000..6a5987a363
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_gemm.cpp
@@ -0,0 +1,122 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_gemm.h"
+
+/*
+ * ==========================================================================
+ * GEMM  performs one of the matrix-matrix operations
+ *    C := alpha*op( A )*op( B ) + beta*C,
+ * where  op( X ) is one of
+ *    op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+ * alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ * an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+ * ==========================================================================
+ */
+
+namespace testinghelpers {
+
+/*
+ * Reference GEMM, C := alpha*op( A )*op( B ) + beta*C, computed by the
+ * cblas_?gemm routine that is looked up in refCBLASModule with dlsym.
+ *
+ *   storage       character converted with char_to_cblas_order
+ *   trnsa, trnsb  characters converted with char_to_cblas_trans
+ *   m, n, k       op( A ) is m by k, op( B ) is k by n, C is m by n
+ *   alpha, beta   scalars of the operation
+ *   ap, bp, cp    matrices A, B and C; lda, ldb, ldc are forwarded unchanged
+ *
+ * Throws std::invalid_argument when trnsa or trnsb is 'h', and
+ * std::runtime_error when T is unsupported or the symbol is not found.
+ */
+template <typename T>
+void ref_gemm(char storage, char trnsa, char trnsb, gtint_t m, gtint_t n, gtint_t k,
+    T alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T beta,  T* cp, gtint_t ldc)
+{
+    // Convert the character flags to their CBLAS enumerators.
+    enum CBLAS_ORDER order;
+    enum CBLAS_TRANSPOSE transa;
+    enum CBLAS_TRANSPOSE transb;
+
+    char_to_cblas_order( storage, &order );
+    char_to_cblas_trans( trnsa, &transa );
+    char_to_cblas_trans( trnsb, &transb );
+
+    // 'h' is rejected for both operands; A is checked before B.
+    if( trnsa == 'h' )
+        throw std::invalid_argument("Error in file src/level3/ref_gemm.cpp:"
+                    "Invalid input. To enable for 'h' update the code and create a temporary matrix A.");
+    if( trnsb == 'h' )
+        throw std::invalid_argument("Error in file src/level3/ref_gemm.cpp:"
+                    "Invalid input. To enable for 'h' update the code and create a temporary matrix B.");
+
+    // alpha/beta are declared as T& for complex types and as T otherwise.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using gemm_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const T*, f77_int, const scalar_t, T*, f77_int);
+
+    // Choose the CBLAS symbol that matches T; nullptr marks an unsupported type.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(float))    symbol = "cblas_sgemm";
+    else if (typeid(T) == typeid(double))   symbol = "cblas_dgemm";
+    else if (typeid(T) == typeid(scomplex)) symbol = "cblas_cgemm";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zgemm";
+
+    if( symbol == nullptr )
+        throw std::runtime_error("Error in ref_gemm.cpp: Invalid typename is passed function template.");
+
+    // Resolve the routine through the refCBLASModule handle.
+    gemm_fn_t gemm_fn = (gemm_fn_t)dlsym(refCBLASModule.get(), symbol);
+    if( gemm_fn == nullptr )
+        throw std::runtime_error("Error in ref_gemm.cpp: Function pointer == 0 -- symbol not found.");
+
+    // Forward everything to the resolved routine.
+    gemm_fn( order, transa, transb,
+             m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+}
+
+// Explicit template instantiations
+template void ref_gemm<float>(char, char, char, gtint_t, gtint_t, gtint_t, float,
+                      float*, gtint_t, float*, gtint_t, float, float*, gtint_t );
+template void ref_gemm<double>(char, char, char, gtint_t, gtint_t, gtint_t, double,
+                      double*, gtint_t, double*, gtint_t, double, double*, gtint_t );
+template void ref_gemm<scomplex>(char, char, char, gtint_t, gtint_t, gtint_t, scomplex,
+                      scomplex*, gtint_t, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_gemm<dcomplex>(char, char, char, gtint_t, gtint_t, gtint_t, dcomplex,
+                      dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp b/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp
new file mode 100644
index 0000000000..4c232e643b
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_gemmt.cpp
@@ -0,0 +1,175 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_gemm.h"
+#include "level3/ref_gemmt.h"
+
+/*
+ * ==========================================================================
+ *  GEMMT performs one of the matrix-matrix operations
+ *     C := alpha*op( A )*op( B ) + beta*C,
+ *  where  op( X ) is one of
+ *     op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H,
+ *  alpha and beta are scalars, and A, B and C are matrices, with op( A )
+ *  an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.
+ *  Only accesses and updates the upper or the lower triangular part.
+ * ==========================================================================
+**/
+
+namespace testinghelpers {
+#if 1
+/*
+ * Reference GEMMT built on ref_gemm: the full n by n product is computed into
+ * a temporary copy of C, and only the triangle selected by uplo is copied
+ * back to cp, so the other triangle of cp is left untouched.
+ *
+ *   storage       'c'/'C' selects column-major indexing, anything else row-major
+ *   uplo          'u'/'U' updates the upper triangle, 'l'/'L' the lower one
+ *   trnsa, trnsb  forwarded to ref_gemm
+ *   n, k          C is n by n; k is forwarded to ref_gemm
+ *   alpha, beta, ap/lda, bp/ldb  forwarded to ref_gemm
+ *   cp, ldc       matrix C (input and output) and its leading dimension
+ *
+ * Throws std::runtime_error when uplo is invalid and n > 0; exceptions
+ * raised by ref_gemm propagate unchanged.
+ */
+template <typename T>
+void ref_gemmt (
+    char storage, char uplo, char trnsa, char trnsb,
+    gtint_t n, gtint_t k,
+    T alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    T beta,
+    T* cp, gtint_t ldc
+) {
+    // Work on a copy of C so ref_gemm can update the full matrix without touching cp.
+    const gtint_t smc = testinghelpers::matsize( storage, 'n', n, n, ldc );
+    std::vector<T> C( cp, cp + smc );
+    ref_gemm<T>(storage, trnsa, trnsb, n, n, k, alpha, ap, lda, bp, ldb, beta, C.data(), ldc);
+
+    const bool colmajor = (storage=='c')||(storage=='C');
+    const bool upper    = (uplo=='u')||(uplo=='U');
+    const bool lower    = (uplo=='l')||(uplo=='L');
+
+    // An invalid uplo is reported only when the loops would visit at least one
+    // element (n > 0), i.e. in the same situations as a per-element check.
+    if( (n > 0) && !upper && !lower )
+        throw std::runtime_error("Error in level3/ref_gemmt.cpp: uplo must be 'u' or 'l'.");
+
+    // outer walks the strided dimension, inner the contiguous one; this is
+    // (column, row) for column-major storage and (row, column) otherwise.
+    for( gtint_t outer = 0; outer < n; outer++ )
+    {
+        for( gtint_t inner = 0; inner < n; inner++ )
+        {
+            const gtint_t row = colmajor ? inner : outer;
+            const gtint_t col = colmajor ? outer : inner;
+            const bool selected = upper ? (row <= col) : (row >= col);
+            if( selected ) cp[inner + outer*ldc] = C[inner + outer*ldc];
+        }
+    }
+}
+#else
+template <typename T>
+void ref_gemmt (
+    char storage, char uplo, char trnsa, char trnsb,
+    gtint_t n, gtint_t k,
+    T alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    T beta,
+    T* cp, gtint_t ldc
+)
+{
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_TRANSPOSE cblas_transa;
+    enum CBLAS_TRANSPOSE cblas_transb;
+    enum CBLAS_UPLO cblas_uplo;
+
+    char_to_cblas_order( storage, &cblas_order );
+    char_to_cblas_trans( trnsa, &cblas_transa );
+    char_to_cblas_trans( trnsb, &cblas_transb );
+    char_to_cblas_uplo( uplo, &cblas_uplo );
+
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    typedef void (*Fptr_ref_cblas_gemmt)( const CBLAS_ORDER, const CBLAS_UPLO, const CBLAS_TRANSPOSE, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const T*, f77_int, const scalar_t, T*, f77_int);
+    Fptr_ref_cblas_gemmt ref_cblas_gemmt;
+
+    // Call C function
+    /* Check the typename T passed to this function template and call respective function.*/
+    if (typeid(T) == typeid(float))
+    {
+        ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get( ), "cblas_sgemmt");
+    }
+    else if (typeid(T) == typeid(double))
+    {
+        ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get(), "cblas_dgemmt");
+    }
+    else if (typeid(T) == typeid(scomplex))
+    {
+        ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get(), "cblas_cgemmt");
+    }
+    else if (typeid(T) == typeid(dcomplex))
+    {
+        ref_cblas_gemmt = (Fptr_ref_cblas_gemmt)dlsym(refCBLASModule.get(), "cblas_zgemmt");
+    }
+    else
+    {
+        throw std::runtime_error("Error in ref_gemmt.cpp: Invalid typename is passed function template.");
+    }
+    if( !ref_cblas_gemmt ) {
+        throw std::runtime_error("Error in ref_gemmt.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    ref_cblas_gemmt( cblas_order, cblas_uplo, cblas_transa, cblas_transb,
+                              n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+}
+#endif
+// Explicit template instantiations
+template void ref_gemmt<float>(char, char, char, char, gtint_t, gtint_t, float,
+                      float*, gtint_t, float*, gtint_t, float, float*, gtint_t );
+template void ref_gemmt<double>(char, char, char, char, gtint_t, gtint_t, double,
+                      double*, gtint_t, double*, gtint_t, double, double*, gtint_t );
+template void ref_gemmt<scomplex>(char, char, char, char, gtint_t, gtint_t, scomplex,
+                      scomplex*, gtint_t, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_gemmt<dcomplex>(char, char, char, char, gtint_t, gtint_t, dcomplex,
+                      dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp b/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp
new file mode 100644
index 0000000000..8bbafc0afe
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_hemm.cpp
@@ -0,0 +1,92 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_hemm.h"
+
+namespace testinghelpers {
+
+/*
+ * Reference HEMM: forwards to cblas_chemm / cblas_zhemm, looked up in
+ * refCBLASModule with dlsym. conja and transb are accepted but not used here.
+ * Throws std::runtime_error when T is unsupported or the symbol is not found.
+ */
+template<typename T>
+void ref_hemm (
+    char storage, char side, char uplo, char conja, char transb,
+    gtint_t m, gtint_t n,
+    T alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    T beta,
+    T* cp, gtint_t ldc
+) {
+    // Convert the character flags to their CBLAS enumerators.
+    enum CBLAS_ORDER order_id;
+    enum CBLAS_SIDE side_id;
+    enum CBLAS_UPLO uplo_id;
+    char_to_cblas_order( storage, &order_id );
+    char_to_cblas_side( side, &side_id );
+    char_to_cblas_uplo( uplo, &uplo_id );
+
+    // alpha/beta are declared as T& for complex types and as T otherwise.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using hemm_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_SIDE, const CBLAS_UPLO,
+                    const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const T*, f77_int, const scalar_t, T*, f77_int);
+
+    // Only scomplex and dcomplex map to a symbol; nullptr marks anything else.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(scomplex)) symbol = "cblas_chemm";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zhemm";
+
+    if( symbol == nullptr )
+        throw std::runtime_error("Error in ref_hemm.cpp: Invalid typename is passed function template.");
+
+    hemm_fn_t hemm_fn = (hemm_fn_t)dlsym(refCBLASModule.get(), symbol);
+    if( hemm_fn == nullptr )
+        throw std::runtime_error("Error in ref_hemm.cpp: Function pointer == 0 -- symbol not found.");
+
+    hemm_fn( order_id, side_id, uplo_id,
+             m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+}
+
+// Explicit template instantiations
+template void ref_hemm<scomplex>(char, char, char, char, char, gtint_t, gtint_t, scomplex,
+                      scomplex*, gtint_t, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_hemm<dcomplex>(char, char, char, char, char, gtint_t, gtint_t, dcomplex,
+                      dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp b/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp
new file mode 100644
index 0000000000..f7303ed998
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_her2k.cpp
@@ -0,0 +1,91 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_her2k.h"
+
+namespace testinghelpers {
+
+/*
+ * Reference HER2K: forwards to cblas_cher2k / cblas_zher2k, looked up in
+ * refCBLASModule with dlsym. alpha is passed by pointer (T*), beta (type RT)
+ * by value; transb is accepted but not used here.
+ * Throws std::runtime_error when T is unsupported or the symbol is not found.
+ */
+template <typename T, typename RT>
+void ref_her2k(
+    char storage, char uplo, char transa, char transb,
+    gtint_t m, gtint_t k,
+    T* alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    RT beta,
+    T* cp, gtint_t ldc
+) {
+    // Convert the character flags to their CBLAS enumerators.
+    enum CBLAS_ORDER order_id;
+    enum CBLAS_UPLO uplo_id;
+    enum CBLAS_TRANSPOSE trans_id;
+    char_to_cblas_order( storage, &order_id );
+    char_to_cblas_uplo( uplo, &uplo_id );
+    char_to_cblas_trans( transa, &trans_id );
+
+    using her2k_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const T*, const T*, f77_int,
+                    const T*, f77_int, const RT, T*, f77_int);
+
+    // Only scomplex and dcomplex map to a symbol; nullptr marks anything else.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(scomplex)) symbol = "cblas_cher2k";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zher2k";
+
+    if( symbol == nullptr )
+        throw std::runtime_error("Error in ref_her2k.cpp: Invalid typename is passed function template.");
+
+    her2k_fn_t her2k_fn = (her2k_fn_t)dlsym(refCBLASModule.get(), symbol);
+    if( her2k_fn == nullptr )
+        throw std::runtime_error("Error in ref_her2k.cpp: Function pointer == 0 -- symbol not found.");
+
+    her2k_fn( order_id, uplo_id, trans_id,
+              m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+}
+
+// Explicit template instantiations
+template void ref_her2k<scomplex>(char, char, char, char, gtint_t, gtint_t, scomplex*,
+                      scomplex*, gtint_t, scomplex*, gtint_t, float, scomplex*, gtint_t );
+template void ref_her2k<dcomplex>(char, char, char, char, gtint_t, gtint_t, dcomplex*,
+                      dcomplex*, gtint_t, dcomplex*, gtint_t, double, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_herk.cpp b/gtestsuite/testinghelpers/src/level3/ref_herk.cpp
new file mode 100644
index 0000000000..1f6c48bdce
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_herk.cpp
@@ -0,0 +1,91 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_herk.h"
+
+namespace testinghelpers {
+
+/*
+ * Reference HERK: forwards to cblas_cherk / cblas_zherk, looked up in
+ * refCBLASModule with dlsym. alpha and beta have type RT and are passed by value.
+ * Throws std::runtime_error when T is unsupported or the symbol is not found.
+ */
+template <typename T, typename RT>
+void ref_herk(
+    char storage, char uplo, char trnsa,
+    gtint_t m, gtint_t k,
+    RT alpha,
+    T* ap, gtint_t lda,
+    RT beta,
+    T* cp, gtint_t ldc
+) {
+    // Convert the character flags to their CBLAS enumerators.
+    enum CBLAS_ORDER order_id;
+    enum CBLAS_UPLO uplo_id;
+    enum CBLAS_TRANSPOSE trans_id;
+
+    char_to_cblas_order( storage, &order_id );
+    char_to_cblas_uplo( uplo, &uplo_id );
+    char_to_cblas_trans( trnsa, &trans_id );
+
+    using herk_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const RT, const T*, f77_int,
+                    const RT, T*, f77_int);
+
+    // Only scomplex and dcomplex map to a symbol; nullptr marks anything else.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(scomplex)) symbol = "cblas_cherk";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zherk";
+
+    if( symbol == nullptr )
+        throw std::runtime_error("Error in ref_herk.cpp: Invalid typename is passed function template.");
+
+    herk_fn_t herk_fn = (herk_fn_t)dlsym(refCBLASModule.get(), symbol);
+    if( herk_fn == nullptr )
+        throw std::runtime_error("Error in ref_herk.cpp: Function pointer == 0 -- symbol not found.");
+
+    // Forward everything to the resolved routine.
+    herk_fn( order_id, uplo_id, trans_id,
+             m, k, alpha, ap, lda, beta, cp, ldc );
+}
+
+// Explicit template instantiations
+template void ref_herk<scomplex>(char, char, char, gtint_t, gtint_t, float,
+                      scomplex*, gtint_t, float, scomplex*, gtint_t );
+template void ref_herk<dcomplex>(char, char, char, gtint_t, gtint_t, double,
+                      dcomplex*, gtint_t, double, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_symm.cpp b/gtestsuite/testinghelpers/src/level3/ref_symm.cpp
new file mode 100644
index 0000000000..a784b804c1
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_symm.cpp
@@ -0,0 +1,104 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_symm.h"
+
+namespace testinghelpers {
+
+/*
+ * Reference SYMM: forwards to the cblas_?symm routine that is looked up in
+ * refCBLASModule with dlsym.
+ *
+ *   storage, side, uplo  converted with the char_to_cblas_* helpers
+ *   conja, transb        accepted but not used here
+ *   m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc  forwarded unchanged
+ *
+ * Throws std::runtime_error when T is unsupported or the symbol is not found.
+ */
+template<typename T>
+void ref_symm (
+    char storage, char side, char uplo, char conja, char transb,
+    gtint_t m, gtint_t n,
+    T alpha,
+    T* ap, gtint_t lda,
+    T* bp, gtint_t ldb,
+    T beta,
+    T* cp, gtint_t ldc
+) {
+    // Convert the character flags to their CBLAS enumerators.
+    enum CBLAS_ORDER order_id;
+    enum CBLAS_SIDE side_id;
+    enum CBLAS_UPLO uplo_id;
+    char_to_cblas_order( storage, &order_id );
+    char_to_cblas_side( side, &side_id );
+    char_to_cblas_uplo( uplo, &uplo_id );
+
+    // alpha/beta are declared as T& for complex types and as T otherwise.
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using symm_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_SIDE, const CBLAS_UPLO,
+                    const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const T*, f77_int, const scalar_t, T*, f77_int);
+
+    // Choose the CBLAS symbol that matches T; nullptr marks an unsupported type.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(float))    symbol = "cblas_ssymm";
+    else if (typeid(T) == typeid(double))   symbol = "cblas_dsymm";
+    else if (typeid(T) == typeid(scomplex)) symbol = "cblas_csymm";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zsymm";
+
+    if( symbol == nullptr )
+        throw std::runtime_error("Error in ref_symm.cpp: Invalid typename is passed function template.");
+
+    symm_fn_t symm_fn = (symm_fn_t)dlsym(refCBLASModule.get(), symbol);
+    if( symm_fn == nullptr )
+        throw std::runtime_error("Error in ref_symm.cpp: Function pointer == 0 -- symbol not found.");
+
+    // Forward everything to the resolved routine.
+    symm_fn( order_id, side_id, uplo_id,
+             m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+}
+
+// Explicit template instantiations
+template void ref_symm<float>(char, char, char, char, char, gtint_t, gtint_t, float,
+                      float*, gtint_t, float*, gtint_t, float, float*, gtint_t );
+template void ref_symm<double>(char, char, char, char, char, gtint_t, gtint_t, double,
+                      double*, gtint_t, double*, gtint_t, double, double*, gtint_t );
+template void ref_symm<scomplex>(char, char, char, char, char, gtint_t, gtint_t, scomplex,
+                      scomplex*, gtint_t, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_symm<dcomplex>(char, char, char, char, char, gtint_t, gtint_t, dcomplex,
+                      dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp b/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp
new file mode 100644
index 0000000000..49cb1cf5af
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_syr2k.cpp
@@ -0,0 +1,104 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_syr2k.h"
+
+namespace testinghelpers {
+
+/*
+ * ref_syr2k: reference SYR2K for the test suite. The character flags are
+ * converted to CBLAS enums and the call is forwarded to the cblas_?syr2k
+ * routine looked up at run time in refCBLASModule.
+ *   storage, uplo, transa : flags translated by the char_to_cblas_* helpers
+ *   transb                : accepted but not used by this function
+ *   m, k                  : the two dimension arguments handed to CBLAS
+ *   alpha, beta           : scalars; ap/lda, bp/ldb, cp/ldc : matrices with
+ *                           their leading dimensions
+ * Throws std::runtime_error if T is unsupported or the symbol is not found.
+ */
+template <typename T>
+void ref_syr2k( char storage, char uplo, char transa, char transb,
+                gtint_t m, gtint_t k, T alpha, T* ap, gtint_t lda,
+                T* bp, gtint_t ldb, T beta, T* cp, gtint_t ldc )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER     order_e;
+    enum CBLAS_UPLO      uplo_e;
+    enum CBLAS_TRANSPOSE trans_e;
+    char_to_cblas_order( storage, &order_e );
+    char_to_cblas_uplo( uplo, &uplo_e );
+    char_to_cblas_trans( transa, &trans_e );
+
+    // Scalars are typed T& for complex T and plain T otherwise
+    // (presumably so complex scalars reach CBLAS by address -- confirm).
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using syr2k_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const T*, f77_int, const scalar_t, T*, f77_int );
+
+    // Choose the symbol that corresponds to T; anything else is rejected.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(float))    symbol = "cblas_ssyr2k";
+    else if (typeid(T) == typeid(double))   symbol = "cblas_dsyr2k";
+    else if (typeid(T) == typeid(scomplex)) symbol = "cblas_csyr2k";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zsyr2k";
+    else
+    {
+        throw std::runtime_error("Error in ref_syr2k.cpp: Invalid typename is passed function template.");
+    }
+
+    // Look the routine up in the reference library; a null result is an error.
+    syr2k_fn_t cblas_fn = (syr2k_fn_t)dlsym( refCBLASModule.get(), symbol );
+    if( !cblas_fn )
+    {
+        throw std::runtime_error("Error in ref_syr2k.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // Forward everything to the resolved CBLAS routine.
+    cblas_fn( order_e, uplo_e, trans_e,
+              m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+}
+
+// Instantiate ref_syr2k for every element type handled by its typeid dispatch.
+template void ref_syr2k<float>( char, char, char, char, gtint_t, gtint_t,
+    float, float*, gtint_t, float*, gtint_t, float, float*, gtint_t );
+template void ref_syr2k<double>( char, char, char, char, gtint_t, gtint_t,
+    double, double*, gtint_t, double*, gtint_t, double, double*, gtint_t );
+template void ref_syr2k<scomplex>( char, char, char, char, gtint_t, gtint_t,
+    scomplex, scomplex*, gtint_t, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_syr2k<dcomplex>( char, char, char, char, gtint_t, gtint_t,
+    dcomplex, dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp b/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp
new file mode 100644
index 0000000000..a834b3b0d7
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_syrk.cpp
@@ -0,0 +1,104 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_syrk.h"
+
+namespace testinghelpers {
+
+/*
+ * ref_syrk: reference SYRK for the test suite. The character flags are
+ * converted to CBLAS enums and the call is forwarded to the cblas_?syrk
+ * routine looked up at run time in refCBLASModule.
+ *   storage, uplo, trnsa : flags translated by the char_to_cblas_* helpers
+ *   m, k                 : the two dimension arguments handed to CBLAS
+ *   alpha, beta          : scalars
+ *   ap/lda, cp/ldc       : matrices with their leading dimensions
+ * Throws std::runtime_error if T is unsupported or the symbol is not found.
+ */
+template <typename T>
+void ref_syrk( char storage, char uplo, char trnsa,
+               gtint_t m, gtint_t k, T alpha, T* ap, gtint_t lda,
+               T beta, T* cp, gtint_t ldc )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER     order_e;
+    enum CBLAS_UPLO      uplo_e;
+    enum CBLAS_TRANSPOSE trans_e;
+    char_to_cblas_order( storage, &order_e );
+    char_to_cblas_uplo( uplo, &uplo_e );
+    char_to_cblas_trans( trnsa, &trans_e );
+
+    // Scalars are typed T& for complex T and plain T otherwise
+    // (presumably so complex scalars reach CBLAS by address -- confirm).
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using syrk_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_UPLO, const CBLAS_TRANSPOSE,
+                    const f77_int, const f77_int, const scalar_t, const T*, f77_int,
+                    const scalar_t, T*, f77_int );
+
+    // Choose the symbol that corresponds to T; anything else is rejected.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(float))    symbol = "cblas_ssyrk";
+    else if (typeid(T) == typeid(double))   symbol = "cblas_dsyrk";
+    else if (typeid(T) == typeid(scomplex)) symbol = "cblas_csyrk";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_zsyrk";
+    else
+    {
+        throw std::runtime_error("Error in ref_syrk.cpp: Invalid typename is passed function template.");
+    }
+
+    // Look the routine up in the reference library; a null result is an error.
+    syrk_fn_t cblas_fn = (syrk_fn_t)dlsym( refCBLASModule.get(), symbol );
+    if( !cblas_fn )
+    {
+        throw std::runtime_error("Error in ref_syrk.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // Forward everything to the resolved CBLAS routine.
+    cblas_fn( order_e, uplo_e, trans_e,
+              m, k, alpha, ap, lda, beta, cp, ldc );
+}
+
+// Instantiate ref_syrk for every element type handled by its typeid dispatch.
+template void ref_syrk<float>( char, char, char, gtint_t, gtint_t,
+    float, float*, gtint_t, float, float*, gtint_t );
+template void ref_syrk<double>( char, char, char, gtint_t, gtint_t,
+    double, double*, gtint_t, double, double*, gtint_t );
+template void ref_syrk<scomplex>( char, char, char, gtint_t, gtint_t,
+    scomplex, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_syrk<dcomplex>( char, char, char, gtint_t, gtint_t,
+    dcomplex, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp b/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp
new file mode 100644
index 0000000000..ebf08be5ca
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_trmm.cpp
@@ -0,0 +1,114 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_trmm.h"
+
+/*
+ * ==========================================================================
+ * TRMM  performs one of the matrix-matrix operations
+ *    B := alpha*op( A )*B,   or   B := alpha*B*op( A )
+ * where  alpha  is a scalar,  B  is an m by n matrix,  A  is a unit, or
+ * non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+ *    op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H.
+ * ==========================================================================
+ */
+
+namespace testinghelpers {
+
+/*
+ * ref_trmm: reference TRMM for the test suite. The character flags are
+ * converted to CBLAS enums and the call is forwarded to the cblas_?trmm
+ * routine looked up at run time in refCBLASModule.
+ * See the TRMM description above the namespace for the operation itself.
+ * Throws std::runtime_error if T is unsupported or the symbol is not found.
+ */
+template <typename T>
+void ref_trmm( char storage, char side, char uploa, char transa, char diaga,
+               gtint_t m, gtint_t n, T alpha, T *ap, gtint_t lda, T *bp, gtint_t ldb )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER     order_e;
+    enum CBLAS_SIDE      side_e;
+    enum CBLAS_UPLO      uplo_e;
+    enum CBLAS_TRANSPOSE trans_e;
+    enum CBLAS_DIAG      diag_e;
+    char_to_cblas_order( storage, &order_e );
+    char_to_cblas_side( side, &side_e );
+    char_to_cblas_uplo( uploa, &uplo_e );
+    char_to_cblas_trans( transa, &trans_e );
+    char_to_cblas_diag( diaga, &diag_e );
+
+    // Scalars are typed T& for complex T and plain T otherwise
+    // (presumably so complex scalars reach CBLAS by address -- confirm).
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using trmm_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_SIDE, const CBLAS_UPLO,
+                 const CBLAS_TRANSPOSE, const CBLAS_DIAG, const f77_int, const f77_int,
+                 const scalar_t, const T*, f77_int, const T*, f77_int );
+
+    // Choose the symbol that corresponds to T; anything else is rejected.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(float))    symbol = "cblas_strmm";
+    else if (typeid(T) == typeid(double))   symbol = "cblas_dtrmm";
+    else if (typeid(T) == typeid(scomplex)) symbol = "cblas_ctrmm";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_ztrmm";
+    else
+    {
+        throw std::runtime_error("Error in ref_trmm.cpp: Invalid typename is passed function template.");
+    }
+
+    // Look the routine up in the reference library; a null result is an error.
+    trmm_fn_t cblas_fn = (trmm_fn_t)dlsym( refCBLASModule.get(), symbol );
+    if( !cblas_fn )
+    {
+        throw std::runtime_error("Error in ref_trmm.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // Forward everything to the resolved CBLAS routine.
+    cblas_fn( order_e, side_e, uplo_e, trans_e,
+              diag_e, m, n, alpha, ap, lda, bp, ldb );
+}
+
+// Instantiate ref_trmm for every element type handled by its typeid dispatch.
+template void ref_trmm<float>( char, char, char, char, char, gtint_t, gtint_t,
+    float, float*, gtint_t, float*, gtint_t );
+template void ref_trmm<double>( char, char, char, char, char, gtint_t, gtint_t,
+    double, double*, gtint_t, double*, gtint_t );
+template void ref_trmm<scomplex>( char, char, char, char, char, gtint_t, gtint_t,
+    scomplex, scomplex*, gtint_t, scomplex*, gtint_t );
+template void ref_trmm<dcomplex>( char, char, char, char, char, gtint_t, gtint_t,
+    dcomplex, dcomplex*, gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp b/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp
new file mode 100644
index 0000000000..8f409773f7
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_trmm3.cpp
@@ -0,0 +1,268 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_trmm3.h"
+
+/*
+ * ==========================================================================
+ * TRMM3  performs one of the matrix-matrix operations
+ *    C := beta * C_orig + alpha * transa(A) * transb(B)
+ * or
+ *    C := beta * C_orig + alpha * transb(B) * transa(A)
+ * where alpha and beta are scalars, A is a triangular matrix
+ * and  B and C are m by n matrices.
+ * ==========================================================================
+ */
+
+namespace testinghelpers {
+
+/*
+ * ref_trmm3: loop-based reference for TRMM3 (see the description above).
+ * Computes C := beta*C + alpha*op(A)*op(B) when side selects the left side,
+ * otherwise C := beta*C + alpha*op(B)*op(A), for an M-by-N matrix C.
+ * NOTE: A and B are conjugated in place when trnsa/trnsb request it, and the
+ * diagonal of A is overwritten with ones when diaga requests a unit diagonal.
+ * Returns at once, touching nothing, if M or N is 0 or alpha==0 && beta==1.
+ */
+template <typename T>
+void ref_trmm3( char storage, char side, char uploa, char trnsa, char diaga,
+    char trnsb, gtint_t M, gtint_t N, T alpha, T *A, gtint_t lda,
+    T *B, gtint_t ldb, T beta, T *C, gtint_t ldc )
+{
+    T one ;
+    T zero;
+    T tmp;
+    dim_t i, j, k;
+    initone<T>(one);
+    initzero<T>(zero);
+
+    // Decode the character flags once.
+    bool lside  = ( testinghelpers::chksideleft( side ) );
+    bool upper  = ( testinghelpers::chkupper( uploa ) );
+    bool unitdg = ( testinghelpers::chkunitdiag( diaga ) );
+    bool transa = ( testinghelpers::chktrans( trnsa ) );
+    bool transb = ( testinghelpers::chktrans( trnsb ) );
+    bool conja  = ( testinghelpers::chktransconj( trnsa ) );
+    bool conjb  = ( testinghelpers::chktransconj( trnsb ) );
+    // Strides: element (r,c) of op(X) is X[r*rsx + c*csx]; a transpose swaps them.
+    dim_t rsa,csa;
+    dim_t rsb,csb;
+    dim_t rsc,csc;
+
+    if( (storage == 'c') || (storage == 'C') ) {
+        rsa = transa ? lda : 1 ;
+        csa = transa ? 1 : lda ;
+        rsb = transb ? ldb : 1 ;
+        csb = transb ? 1 : ldb ;
+        rsc = 1 ;
+        csc = ldc ;
+    }
+    else {
+        rsa = transa ? 1 : lda ;
+        csa = transa ? lda : 1 ;
+        rsb = transb ? 1 : ldb ;
+        csb = transb ? ldb : 1 ;
+        rsc = ldc ;
+        csc = 1 ;
+    }
+    // Quick return: there is nothing to compute.
+    if( (M == 0 || N == 0) || ( alpha == zero && beta == one ) )
+      return;
+    // The strides already transpose A, so the triangle that is read flips too.
+    if( transa ) {
+      upper   = !upper;
+    }
+
+    const gtint_t mn = lside ? M : N;   // order of the square matrix A
+
+    if( conja )
+    {
+        testinghelpers::conj<T>( storage, A, mn, mn, lda );
+    }
+    // B is conjugated using its own leading dimension, ldb (not A's lda).
+    if( conjb )
+    {
+        testinghelpers::conj<T>( storage, B, M, N, ldb );
+    }
+
+    // alpha == zero: the product term vanishes, so C is only cleared or scaled.
+    if( alpha == zero )
+    {
+        if( beta == zero )
+        {
+            for( j = 0 ; j < N ; j++ )
+            {
+                for( i = 0 ; i < M ; i++ )
+                {
+                    C[i*rsc + j*csc] = zero;
+                }
+            }
+        }
+        else
+        {
+            for( j = 0 ; j < N ; j++ )
+            {
+                for( i = 0 ; i < M ; i++ )
+                {
+                    C[i*rsc + j*csc] = beta*C[i*rsc + j*csc];
+                }
+            }
+        }
+        return;
+    }
+    // Unit diagonal: write ones on diag(A) so the loops below need no special case.
+    if( unitdg )
+    {
+        for( i = 0 ; i < mn ; i++ )
+        {
+            A[i*rsa + i*csa] = one ;
+        }
+    }
+
+    // Main computation: one triangular dot product per element of C.
+    if( lside )
+    {
+        // Left side: C := beta * C_orig + alpha * transa(A) * transb(B)
+        if( upper )
+        {
+            for( j = 0 ; j < N ; j++ )
+            {
+                for( i = 0 ; i < M ; i++ )
+                {
+                    tmp = zero;
+                    for( k = i ; k < M ; k++ )   // upper: columns i..M-1 of row i
+                    {
+                        auto val = A[i*rsa + k*csa] * B[k*rsb + j*csb];
+                        tmp = tmp + val ;
+                    }
+                    if( beta == zero )   // C is overwritten; its old value is not read
+                    {
+                        C[i*rsc + j*csc] = alpha*tmp;
+                    }
+                    else
+                    {
+                        C[i*rsc + j*csc] = beta*C[i*rsc + j*csc] + alpha*tmp;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for( j = 0 ; j < N ; j++ )
+            {
+                for( i = 0 ; i < M ; i++ )
+                {
+                    tmp = zero;
+                    for( k = 0 ; k <= i ; k++ )   // lower: columns 0..i of row i
+                    {
+                        auto val = A[i*rsa + k*csa] * B[k*rsb + j*csb];
+                        tmp = tmp + val ;
+                    }
+                    if( beta == zero )   // C is overwritten; its old value is not read
+                    {
+                        C[i*rsc + j*csc] = alpha*tmp;
+                    }
+                    else
+                    {
+                        C[i*rsc + j*csc] = beta*C[i*rsc + j*csc] + alpha*tmp;
+                    }
+                }
+            }
+        }
+    }
+    else
+    {
+        // Right side: C := beta * C_orig + alpha * transb(B) * transa(A)
+        if( upper )
+        {
+            for( i = 0 ; i < M ; i++ )
+            {
+                for( j = 0 ; j < N ; j++ )
+                {
+                    tmp = zero ;
+                    for( k = 0 ; k <= j ; k++ )   // upper: rows 0..j of column j
+                    {
+                        auto val = B[i*rsb + k*csb]* A[k*rsa + j*csa];
+                        tmp = tmp + val ;
+                    }
+                    if( beta == zero )   // C is overwritten; its old value is not read
+                    {
+                        C[i*rsc + j*csc] = alpha*tmp;
+                    }
+                    else
+                    {
+                        C[i*rsc + j*csc] = beta*C[i*rsc + j*csc] + alpha*tmp;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for( i = 0 ; i < M ; i++ )
+            {
+                for( j = 0 ; j < N ; j++ )
+                {
+                    tmp = zero ;
+                    for( k = j ; k < N ; k++ )   // lower: rows j..N-1 of column j
+                    {
+                        auto val = B[i*rsb + k*csb]* A[k*rsa + j*csa];
+                        tmp = tmp + val ;
+                    }
+                    if( beta == zero )   // C is overwritten; its old value is not read
+                    {
+                        C[i*rsc + j*csc] = alpha*tmp;
+                    }
+                    else
+                    {
+                        C[i*rsc + j*csc] = beta*C[i*rsc + j*csc] + alpha*tmp;
+                    }
+                }
+            }
+        }
+    }
+}
+
+// Instantiate ref_trmm3 for the four element types used by the test suite.
+template void ref_trmm3<float>( char, char, char, char, char, char, gtint_t, gtint_t,
+    float, float*, gtint_t, float*, gtint_t, float, float*, gtint_t );
+template void ref_trmm3<double>( char, char, char, char, char, char, gtint_t, gtint_t,
+    double, double*, gtint_t, double*, gtint_t, double, double*, gtint_t );
+template void ref_trmm3<scomplex>( char, char, char, char, char, char, gtint_t, gtint_t,
+    scomplex, scomplex*, gtint_t, scomplex*, gtint_t, scomplex, scomplex*, gtint_t );
+template void ref_trmm3<dcomplex>( char, char, char, char, char, char, gtint_t, gtint_t,
+    dcomplex, dcomplex*, gtint_t, dcomplex*, gtint_t, dcomplex, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp b/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp
new file mode 100644
index 0000000000..c5d326a5eb
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/level3/ref_trsm.cpp
@@ -0,0 +1,115 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "level3/ref_trsm.h"
+
+/*
+ * ==========================================================================
+ *  TRSM  solves one of the matrix equations
+ *     op( A )*X = alpha*B,   or   X*op( A ) = alpha*B,
+ *  where alpha is a scalar, X and B are m by n matrices, A is a unit, or
+ *  non-unit,  upper or lower triangular matrix  and  op( A )  is one  of
+ *     op( A ) = A   or   op( A ) = A**T.
+ *  The matrix X is overwritten on B.
+ * ==========================================================================
+ */
+
+namespace testinghelpers {
+
+/*
+ * ref_trsm: reference TRSM for the test suite. The character flags are
+ * converted to CBLAS enums and the call is forwarded to the cblas_?trsm
+ * routine looked up at run time in refCBLASModule.
+ * See the TRSM description above the namespace for the operation itself.
+ * Throws std::runtime_error if T is unsupported or the symbol is not found.
+ */
+template <typename T>
+void ref_trsm( char storage, char side, char uploa, char transa, char diaga,
+               gtint_t m, gtint_t n, T alpha, T *ap, gtint_t lda, T *bp, gtint_t ldb )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER     order_e;
+    enum CBLAS_SIDE      side_e;
+    enum CBLAS_UPLO      uplo_e;
+    enum CBLAS_TRANSPOSE trans_e;
+    enum CBLAS_DIAG      diag_e;
+    char_to_cblas_order( storage, &order_e );
+    char_to_cblas_side( side, &side_e );
+    char_to_cblas_uplo( uploa, &uplo_e );
+    char_to_cblas_trans( transa, &trans_e );
+    char_to_cblas_diag( diaga, &diag_e );
+
+    // Scalars are typed T& for complex T and plain T otherwise
+    // (presumably so complex scalars reach CBLAS by address -- confirm).
+    using scalar_t = std::conditional_t<testinghelpers::type_info<T>::is_complex, T&, T>;
+    using trsm_fn_t = void (*)( const CBLAS_ORDER, const CBLAS_SIDE, const CBLAS_UPLO,
+                 const CBLAS_TRANSPOSE, const CBLAS_DIAG, const f77_int, const f77_int,
+                 const scalar_t, const T*, f77_int, const T*, f77_int );
+
+    // Choose the symbol that corresponds to T; anything else is rejected.
+    const char* symbol = nullptr;
+    if      (typeid(T) == typeid(float))    symbol = "cblas_strsm";
+    else if (typeid(T) == typeid(double))   symbol = "cblas_dtrsm";
+    else if (typeid(T) == typeid(scomplex)) symbol = "cblas_ctrsm";
+    else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_ztrsm";
+    else
+    {
+        throw std::runtime_error("Error in ref_trsm.cpp: Invalid typename is passed function template.");
+    }
+
+    // Look the routine up in the reference library; a null result is an error.
+    trsm_fn_t cblas_fn = (trsm_fn_t)dlsym( refCBLASModule.get(), symbol );
+    if( !cblas_fn )
+    {
+        throw std::runtime_error("Error in ref_trsm.cpp: Function pointer == 0 -- symbol not found.");
+    }
+
+    // Forward everything to the resolved CBLAS routine.
+    cblas_fn( order_e, side_e, uplo_e, trans_e,
+              diag_e, m, n, alpha, ap, lda, bp, ldb );
+}
+
+// Instantiate ref_trsm for every element type handled by its typeid dispatch.
+template void ref_trsm<float>( char, char, char, char, char, gtint_t, gtint_t,
+    float, float*, gtint_t, float*, gtint_t );
+template void ref_trsm<double>( char, char, char, char, char, gtint_t, gtint_t,
+    double, double*, gtint_t, double*, gtint_t );
+template void ref_trsm<scomplex>( char, char, char, char, char, gtint_t, gtint_t,
+    scomplex, scomplex*, gtint_t, scomplex*, gtint_t );
+template void ref_trsm<dcomplex>( char, char, char, char, char, gtint_t, gtint_t,
+    dcomplex, dcomplex*, gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp b/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp
new file mode 100644
index 0000000000..d61b5735dd
--- /dev/null
+++ b/gtestsuite/testinghelpers/src/util/ref_nrm2.cpp
@@ -0,0 +1,90 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <dlfcn.h>
+#include "util/ref_nrm2.h"
+
+/*
+ * ==========================================================================
+ * NORMFV performs a vector operation:
+ *    it computes the Euclidean norm
+ *    of the elements in a vector x of length n. The resulting norm is returned.
+ * ========================================================================
+**/
+
+namespace testinghelpers {
+
+/*
+ * ref_nrm2: reference NRM2 for the test suite. Forwards (n, x, incx) to the
+ * cblas_?nrm2 routine looked up at run time in refCBLASModule and returns
+ * that routine's result as Treal.
+ *   n, x, incx : passed through unchanged to the CBLAS routine
+ *   T / Treal  : element type of x / type of the returned norm
+ * Throws std::runtime_error if T is unsupported or the symbol is not found.
+ */
+template <typename T, typename Treal>
+Treal ref_nrm2(gtint_t n, T* x, gtint_t incx)
+{
+  using nrm2_fn_t = Treal (*)( f77_int, const T *, f77_int );
+
+  // Choose the symbol that corresponds to T; anything else is rejected.
+  const char* symbol = nullptr;
+  if      (typeid(T) == typeid(float))    symbol = "cblas_snrm2";
+  else if (typeid(T) == typeid(double))   symbol = "cblas_dnrm2";
+  else if (typeid(T) == typeid(scomplex)) symbol = "cblas_scnrm2";
+  else if (typeid(T) == typeid(dcomplex)) symbol = "cblas_dznrm2";
+  else
+  {
+    throw std::runtime_error("Error in ref_nrm2.cpp: Invalid typename is passed function template.");
+  }
+
+  // Look the routine up in the reference library; a null result is an error.
+  nrm2_fn_t cblas_fn = (nrm2_fn_t)dlsym( refCBLASModule.get(), symbol );
+  if( !cblas_fn )
+  {
+    throw std::runtime_error("Error in ref_nrm2.cpp: Function pointer == 0 -- symbol not found.");
+  }
+
+  // Forward the call and hand back the value produced by CBLAS.
+  return cblas_fn( n, x, incx );
+}
+
+// Instantiate ref_nrm2 for every (element type, norm type) pair it dispatches on.
+template float  ref_nrm2<float, float>( gtint_t, float*, gtint_t );
+template double ref_nrm2<double, double>( gtint_t, double*, gtint_t );
+template float  ref_nrm2<scomplex, float>( gtint_t, scomplex*, gtint_t );
+template double ref_nrm2<dcomplex, double>( gtint_t, dcomplex*, gtint_t );
+
+} //end of namespace testinghelpers
diff --git a/gtestsuite/testsuite/CMakeLists.txt b/gtestsuite/testsuite/CMakeLists.txt
new file mode 100644
index 0000000000..135403e0f8
--- /dev/null
+++ b/gtestsuite/testsuite/CMakeLists.txt
@@ -0,0 +1,102 @@
+#[=[
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+]=]
+
+# Fetch GoogleTest (tag release-1.12.1) from GitHub and add it to this build at configure time.
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  GIT_REPOSITORY https://github.com/google/googletest.git
+  GIT_TAG        release-1.12.1
+)
+#set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)  # currently disabled; kept for reference
+set(BUILD_GMOCK OFF CACHE BOOL "" FORCE)  # forced OFF in the cache before googletest is configured
+set(BUILD_GTEST ON CACHE BOOL "" FORCE)   # forced ON in the cache before googletest is configured
+FetchContent_MakeAvailable(googletest)    # populates the googletest content declared above
+include(GoogleTest)                       # CMake's GoogleTest module
+
+# Set <result>, in the caller's scope, to the names of the directories found directly under <curdir>.
+function(SUBDIRLIST result curdir)
+  file(GLOB children RELATIVE "${curdir}" "${curdir}/*")  # quoted: paths may contain spaces
+  set(dirlist "")
+  foreach(child IN LISTS children)
+    if(IS_DIRECTORY "${curdir}/${child}")
+      list(APPEND dirlist "${child}")
+    endif()
+  endforeach()
+  set(${result} "${dirlist}" PARENT_SCOPE)  # only the result escapes; children/dirlist stay local
+endfunction()
+
+SUBDIRLIST(DIRS ${CMAKE_CURRENT_SOURCE_DIR})
+set(target_name "testsuite")
+foreach(dir ${DIRS})
+    add_custom_target(${target_name}.${dir})  # umbrella target for the executables of <dir>
+    SUBDIRLIST(SUBDIRS ${CMAKE_CURRENT_SOURCE_DIR}/${dir})
+    foreach(subdir ${SUBDIRS})
+      # One executable, testsuite.<dir>.<subdir>, per subdirectory that contains .cpp files.
+      set(test_exe ${target_name}.${dir}.${subdir})
+      file(GLOB files ${CMAKE_CURRENT_SOURCE_DIR}/${dir}/${subdir}/*.cpp)
+      if(files)
+        add_executable(${test_exe} ${files})
+        set_target_properties(${test_exe} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
+                                                     OUTPUT_NAME ${test_exe})
+        target_include_directories(${test_exe} PUBLIC ${BLIS_INCLUDE} ${CMAKE_SOURCE_DIR}/testinghelpers/inc ${CMAKE_SOURCE_DIR}/testsuite/)
+        target_link_libraries(${test_exe} gtest gtest_main testinghelpers ${Blis_LIBRARY} ${COMMON_LIBS})
+        # if we test serial BLIS, but MKL is used as a reference we still need to set up OpenMP.
+        if(LINUX AND ((ENABLE_THREADING STREQUAL "openmp") OR (REF_CBLAS STREQUAL "MKL")))
+            if(OpenMP_LIBRARY STREQUAL "GNU")
+                target_link_libraries(${test_exe} -fopenmp)
+            else()
+                target_link_libraries(${test_exe} iomp5)
+            endif()
+        endif()
+        if(ENABLE_ASAN)
+          target_link_libraries(${test_exe} -fsanitize=address)
+        endif()
+        if(ENABLE_COVERAGE)
+          target_link_libraries(${test_exe} "--coverage")
+        endif()
+        # Compile definition naming the interface under test; BLIS_TYPED is the fallback.
+        if(TEST_INTERFACE STREQUAL "BLAS")
+          set(interface_macro TEST_BLAS)
+        elseif(TEST_INTERFACE STREQUAL "CBLAS")
+          set(interface_macro TEST_CBLAS)
+        else()
+          set(interface_macro TEST_BLIS_TYPED)
+        endif()
+        target_compile_definitions(${test_exe} PUBLIC ${interface_macro} ELEMENT_TYPE='${ELEMENT_TYPE}')
+        add_test(NAME ${test_exe} COMMAND ${test_exe})
+        add_dependencies(${target_name}.${dir} ${test_exe})
+      endif()
+    endforeach()
+endforeach()
+
+
diff --git a/gtestsuite/testsuite/inc/check_error.h b/gtestsuite/testsuite/inc/check_error.h
new file mode 100644
index 0000000000..25e3c16204
--- /dev/null
+++ b/gtestsuite/testsuite/inc/check_error.h
@@ -0,0 +1,495 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+#include "blis.h"
+#include <gtest/gtest.h>
+#include "common/testing_helpers.h"
+
+/**
+ * This file includes the functionality used to determine correctness of the results.
+ * We compare the results component-wise, meaning that for two scalars, we compare the scalars,
+ * and for two vectors or matrices we compare each element of the vector or matrix respectively.
+ *
+ * We have two separate cases:
+ * 1) it's meaningful to have NaNs and/or Infs at the results,
+ *    because we are testing the correct propagation of those values.
+ *    In this case the results could be either extreme values, or f.p. values,
+ *    and we need to be able to compare all of those following the rules below:
+ *    - if there are NaNs/Infs, check that both the reference and the BLIS solution have
+ *      NaN/Inf accordingly. Remember that for Infs we need to check the sign as well.
+ *    - if there are no NaNs/Infs, either do the bitwise comparison (if that's desired),
+ *      or call into getError() function. getError() will check if reference is less
+ *      than one, in which case will compute the absolute error, otherwise compute the
+ *      relative error.
+ *    - for complex numbers, we need to check for all possible combinations of NaN/Inf/FP
+ *      for real and imaginary parts. So that will be a combination of the two steps above.
+ * 2) it's not meaningful to check for NaNs and Infs and we expect only FP values in the
+ *    results. In this case, we either do a bitwise comparison or call into getError() directly.
+ *
+ * Note that all operations with a NaN/Inf will lead to either a comparison with a NaN, or
+ * to inf < thresh, both of which always return false; so NumericalComparisonFPOnly() will return failure.
+ * So, for the case where we do not expect NaN/Infs, we want to fail if NaN and Infs are
+ * present so that we do not have tests passing when it doesn't make sense to do so.
+ * For an example of such case, think of a triangular solver with zeros on the diagonal.
+ *
+*/
+
+// Kind of object being compared. NumericalComparison() uses it to decide which
+// indices (none, i, or i and j) are appended to the failure message.
+enum ObjType
+{
+    SCALAR, VECTOR, MATRIX
+};
+// Selects the component of a complex number that drives a NaN/Inf comparison:
+// the real part (REAL) or the imaginary part (IMAGINARY).
+enum ComplexPart
+{
+    REAL, IMAGINARY
+};
+
+// Settings handed to the comparator functions: the tolerance, the kind of object
+// being compared, the position of the current element and the comparison flags.
+struct ComparisonHelper{
+    double threshold;       // tolerance used when binary_comparison is false
+    ObjType object_type;    // SCALAR, VECTOR or MATRIX
+    gtint_t i;              // vector/matrix index printed in failure messages
+    gtint_t j;              // second matrix index printed in failure messages
+    bool binary_comparison; // true: compare with operator==; false (default): use the error
+    bool nan_inf_check;     // true: compare NaNs/Infs explicitly; false by default
+
+    // Constructor without a threshold (the placeholder -13.0 is stored). Meant for
+    // binary comparison, but callers must still set binary_comparison = true.
+    ComparisonHelper(ObjType object_type) : ComparisonHelper(object_type, -13.0) {}
+
+    // Constructor for the generic case where a threshold is used.
+    ComparisonHelper(ObjType object_type, double threshold)
+        : threshold(threshold),
+          object_type(object_type),
+          i(-11),
+          j(-11),
+          binary_comparison(false),
+          nan_inf_check(false)
+    {}
+};
+
+// Comparison for values that are expected to be finite (no NaN/Inf handling): either
+// exact equality (binary_comparison) or testinghelpers::getError() against the threshold.
+template<typename T>
+testing::AssertionResult NumericalComparisonFPOnly(const char* blis_sol_char,
+                                             const char* ref_sol_char,
+                                             const char* comp_helper_char,
+                                             const T blis_sol,
+                                             const T ref_sol,
+                                             const ComparisonHelper comp_helper,
+                                             const std::string error_message)
+{
+    if (comp_helper.binary_comparison)
+    {
+        // Bitwise mode: anything other than exact equality is a failure.
+        if (!(blis_sol == ref_sol)) return testing::AssertionFailure() << error_message;
+        return testing::AssertionSuccess();
+    }
+    // Tolerance mode: the error must be strictly below the threshold.
+    const double err = testinghelpers::getError(blis_sol,ref_sol);
+    if (err < comp_helper.threshold) return testing::AssertionSuccess();
+    return testing::AssertionFailure() << error_message << ",    thesh = " << comp_helper.threshold
+                                       << ",    error = " << err;
+}
+
+// NaN/Inf-aware comparison of two real numbers: NaN matches only NaN, Inf matches
+// only an Inf of the same sign, and other values go to NumericalComparisonFPOnly().
+template<typename T>
+testing::AssertionResult NumericalComparisonRealNaNInf(const char* blis_sol_char,
+                                             const char* ref_sol_char,
+                                             const char* comp_helper_char,
+                                             const T blis_sol,
+                                             const T ref_sol,
+                                             const ComparisonHelper comp_helper,
+                                             const std::string error_message)
+{
+    const bool ref_nan  = std::isnan(ref_sol);
+    const bool blis_nan = std::isnan(blis_sol);
+    // NaNs: success only when both values are NaN.
+    if (ref_nan || blis_nan)
+    {
+        if (ref_nan && blis_nan) return testing::AssertionSuccess();
+        return testing::AssertionFailure() << error_message;
+    }
+    const bool ref_inf  = std::isinf(ref_sol);
+    const bool blis_inf = std::isinf(blis_sol);
+    // Infs: success only when both values are Inf and carry the same sign.
+    if (ref_inf || blis_inf)
+    {
+        if (ref_inf && blis_inf && (ref_sol == blis_sol)) return testing::AssertionSuccess();
+        return testing::AssertionFailure() << error_message;
+    }
+    // Neither value is NaN/Inf: simple comparison, based on relative or absolute error.
+    return NumericalComparisonFPOnly<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, error_message);
+}
+
+// NaN comparison for complex numbers. complex_part names the component that has
+// to be NaN in both values; the other component is then compared with
+// NumericalComparisonRealNaNInf(). Every other situation is reported as a failure.
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+testing::AssertionResult NumericalComparisonNaN(const char* blis_sol_char,
+                                             const char* ref_sol_char,
+                                             const char* comp_helper_char,
+                                             const T blis_sol,
+                                             const T ref_sol,
+                                             const ComparisonHelper comp_helper,
+                                             const ComplexPart complex_part,
+                                             const std::string error_message)
+{
+    // "checked" is the component selected by complex_part, "other" the remaining one.
+    const bool use_imag = (complex_part == IMAGINARY);
+    const RT ref_checked  = use_imag ? ref_sol.imag  : ref_sol.real;
+    const RT blis_checked = use_imag ? blis_sol.imag : blis_sol.real;
+    const RT ref_other    = use_imag ? ref_sol.real  : ref_sol.imag;
+    const RT blis_other   = use_imag ? blis_sol.real : blis_sol.imag;
+
+    // Unless the selected component is NaN on both sides, the comparison fails.
+    const bool both_nan = std::isnan(ref_checked) && std::isnan(blis_checked);
+    if (!both_nan)
+        return testing::AssertionFailure() << error_message;
+
+    // The selected components agree, so the remaining ones decide the outcome
+    // (real NaN/Inf comparison).
+    return NumericalComparisonRealNaNInf<RT>(blis_sol_char, ref_sol_char, comp_helper_char, blis_other, ref_other, comp_helper, error_message);
+}
+
+// Inf comparison for complex numbers. complex_part names the component that has
+// to be Inf, with the same sign, in both values; the other component is then
+// compared with NumericalComparisonRealNaNInf(). Every other situation is
+// reported as a failure.
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+testing::AssertionResult NumericalComparisonInf(const char* blis_sol_char,
+                                             const char* ref_sol_char,
+                                             const char* comp_helper_char,
+                                             const T blis_sol,
+                                             const T ref_sol,
+                                             const ComparisonHelper comp_helper,
+                                             const ComplexPart complex_part,
+                                             const std::string error_message)
+{
+    // "checked" is the component selected by complex_part, "other" the remaining one.
+    const bool use_imag = (complex_part == IMAGINARY);
+    const RT ref_checked  = use_imag ? ref_sol.imag  : ref_sol.real;
+    const RT blis_checked = use_imag ? blis_sol.imag : blis_sol.real;
+    const RT ref_other    = use_imag ? ref_sol.real  : ref_sol.imag;
+    const RT blis_other   = use_imag ? blis_sol.real : blis_sol.imag;
+
+    // Failure if only one of the selected components is Inf (or none of them).
+    const bool both_inf = std::isinf(ref_checked) && std::isinf(blis_checked);
+    if (!both_inf)
+        return testing::AssertionFailure() << error_message;
+
+    // Two Infs compare equal only when their signs agree.
+    const bool same_sign = (ref_checked == blis_checked);
+    if (!same_sign)
+        return testing::AssertionFailure() << error_message;
+
+    // The selected components agree, so the remaining ones decide the outcome
+    // (real NaN/Inf comparison).
+    return NumericalComparisonRealNaNInf<RT>(blis_sol_char, ref_sol_char, comp_helper_char, blis_other, ref_other, comp_helper, error_message);
+}
+
+// Predicate-formatter used with ASSERT_PRED_FORMAT3. Builds the failure message
+// and dispatches to the comparator that matches the type T and the flags stored in
+// comp_helper: NaN/Inf-aware comparison when nan_inf_check is set, plain
+// floating-point comparison otherwise.
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+testing::AssertionResult NumericalComparison(const char* blis_sol_char,
+                                             const char* ref_sol_char,
+                                             const char* comp_helper_char,
+                                             const T blis_sol,
+                                             const T ref_sol,
+                                             const ComparisonHelper comp_helper)
+{
+    // Base message: "<blis expression> = <value>,   <ref expression> = <value>".
+    std::string error_message = std::string(blis_sol_char) + " = " + testinghelpers::to_string(blis_sol) + ",   "
+                              + ref_sol_char + " = " + testinghelpers::to_string(ref_sol);
+
+    // Append the position of the element for vectors (i) and matrices (i and j).
+    switch (comp_helper.object_type)
+    {
+        case VECTOR:
+            error_message += ", i = " + std::to_string(comp_helper.i);
+            break;
+        case MATRIX:
+            error_message += ", i = " + std::to_string(comp_helper.i) + ", j = " + std::to_string(comp_helper.j);
+            break;
+        default:
+            // SCALAR: there is no position to report.
+            break;
+    }
+
+    // If NaN/Inf checks are not requested, do the simple comparison, based on
+    // relative or absolute error (or on equality, if binary_comparison is set).
+    if (!comp_helper.nan_inf_check)
+        return NumericalComparisonFPOnly<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, error_message);
+
+    // Real numbers have a dedicated NaN/Inf comparator.
+    if constexpr (testinghelpers::type_info<T>::is_real)
+        return NumericalComparisonRealNaNInf<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, error_message);
+    // If it's complex we need to check real and imaginary parts.
+    else
+    {
+        const bool real_has_nan = (std::isnan(ref_sol.real)) || (std::isnan(blis_sol.real));
+        const bool imag_has_nan = (std::isnan(ref_sol.imag)) || (std::isnan(blis_sol.imag));
+        const bool real_has_inf = (std::isinf(ref_sol.real)) || (std::isinf(blis_sol.real));
+        const bool imag_has_inf = (std::isinf(ref_sol.imag)) || (std::isinf(blis_sol.imag));
+
+        // NaNs are handled before Infs, and the real part before the imaginary part.
+        // Any of the real parts is NaN: call into the NaN comparator.
+        if (real_has_nan)
+            return NumericalComparisonNaN<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, REAL, error_message);
+        // Any of the imaginary parts is NaN: call into the NaN comparator.
+        if (imag_has_nan)
+            return NumericalComparisonNaN<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, IMAGINARY, error_message);
+        // Any of the real parts is Inf: call into the Inf comparator.
+        if (real_has_inf)
+            return NumericalComparisonInf<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, REAL, error_message);
+        // Any of the imaginary parts is Inf: call into the Inf comparator.
+        if (imag_has_inf)
+            return NumericalComparisonInf<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, IMAGINARY, error_message);
+
+        // Neither solution contains a NaN or an Inf: simple comparison.
+        return NumericalComparisonFPOnly<T>(blis_sol_char, ref_sol_char, comp_helper_char, blis_sol, ref_sol, comp_helper, error_message);
+    }
+}
+
+/**
+ * Bitwise comparison of two scalars (no threshold is involved); nan_inf_check
+ * switches on the explicit NaN/Inf handling of NumericalComparison().
+ */
+template <typename T>
+void computediff( T blis_sol, T ref_sol, bool nan_inf_check = false )
+{
+    ComparisonHelper comp_helper(SCALAR);
+    comp_helper.nan_inf_check     = nan_inf_check;
+    comp_helper.binary_comparison = true;
+    ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol, ref_sol, comp_helper);
+}
+
+/** Relative comparison of two scalars, using a threshold; nan_inf_check
+ *  switches on the NaN/Inf-aware comparison, as in the other overloads. */
+template <typename T>
+void computediff( T blis_sol, T ref_sol, double thresh, bool nan_inf_check = false )
+{
+    ComparisonHelper comp_helper(SCALAR, thresh);
+    comp_helper.nan_inf_check = nan_inf_check; // forward the flag; it used to be silently ignored
+    ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol, ref_sol, comp_helper);
+}
+
+/**
+ * Bitwise comparison of two vectors with n elements and increment inc. The
+ * elements lying between consecutive vector entries are compared as well.
+ */
+template <typename T>
+void computediff( gtint_t n, T *blis_sol, T *ref_sol, gtint_t inc, bool nan_inf_check = false )
+{
+    ComparisonHelper comp_helper(VECTOR);
+    comp_helper.binary_comparison = true;
+    comp_helper.nan_inf_check = nan_inf_check;
+
+    // A negative inc only means that a BLIS API walks the vector from the end to the
+    // beginning, so the same locations are involved; traverse memory front to back.
+    const gtint_t abs_inc = std::abs(inc);
+    for (gtint_t i = 0; i < n; i++)
+    {
+        comp_helper.i = i;
+        ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*abs_inc], ref_sol[i*abs_inc], comp_helper) << "inc = " << inc;
+
+        // Locations between this element and the next one should not have been modified
+        // by the call to a BLIS API; there are none after the last element.
+        const gtint_t gap_end = (i < n-1) ? abs_inc : 1;
+        for (gtint_t j = 1; j < gap_end; j++)
+        {
+            ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*abs_inc + j], ref_sol[i*abs_inc + j], comp_helper) << "inc = " << inc << " This element is expected to not be modified.";
+        }
+    }
+}
+
+/**
+ * Relative comparison of two vectors with n elements and increment inc, using a
+ * threshold. Elements lying between consecutive vector entries are compared bitwise.
+ */
+template <typename T>
+void computediff( gtint_t n, T *blis_sol, T *ref_sol, gtint_t inc, double thresh, bool nan_inf_check = false )
+{
+    ComparisonHelper comp_helper(VECTOR, thresh);
+    comp_helper.nan_inf_check = nan_inf_check;
+
+    // A negative inc only means that a BLIS API walks the vector from the end to the
+    // beginning, so the same locations are involved; traverse memory front to back.
+    const gtint_t abs_inc = std::abs(inc);
+    for (gtint_t i = 0; i < n; i++)
+    {
+        comp_helper.i = i;
+        // The vector element itself is compared against the threshold.
+        comp_helper.binary_comparison = false;
+        ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*abs_inc], ref_sol[i*abs_inc], comp_helper) << "inc = " << inc;
+        // Locations between this element and the next one should not have been modified
+        // by the call to a BLIS API, so they are compared bitwise; none follow the last element.
+        comp_helper.binary_comparison = true;
+        const gtint_t gap_end = (i < n-1) ? abs_inc : 1;
+        for (gtint_t j = 1; j < gap_end; j++)
+        {
+            ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*abs_inc + j], ref_sol[i*abs_inc + j], comp_helper) << "inc = " << inc << " This element is expected to not be modified.";
+        }
+    }
+}
+
+/**
+ * Binary comparison of two matrices with dimensions m-by-n and leading dimension ld.
+ * storage 'c' or 'C' selects column-major order, anything else row-major order.
+ * The padding between the matrix and ld is compared too, as it must stay untouched.
+ */
+template <typename T>
+void computediff(char storage, gtint_t m, gtint_t n, T *blis_sol, T *ref_sol, gtint_t ld, bool nan_inf_check = false )
+{
+    gtint_t i,j;
+    ComparisonHelper comp_helper(MATRIX);
+    comp_helper.nan_inf_check = nan_inf_check;
+    comp_helper.binary_comparison = true;
+    // Loop for column-major order
+    if( (storage == 'c') || (storage == 'C') )
+    {
+        for( j = 0 ; j < n ; j++ )
+        {
+            comp_helper.j = j;
+            // First iterate through the elements of the arrays that are part of the matrix
+            // and are expected to be modified by a call to BLIS APIs.
+            for( i = 0 ; i < m ; i++ )
+            {
+                comp_helper.i = i;
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i + j*ld], ref_sol[i + j*ld], comp_helper);
+            }
+            // Now iterate through the padding of this column (rows m to ld-1). The reference
+            // is initialized as a copy of the same data, so these elements must be identical.
+            for (i = m; i < ld; i++)
+            {
+                comp_helper.i = i; // so that a failure reports the padding element itself
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i + j*ld], ref_sol[i + j*ld], comp_helper) << "This element is expected to not be modified.";
+            }
+        }
+    }
+    // Loop for row-major order
+    else
+    {
+        for( i = 0 ; i < m ; i++ )
+        {
+            comp_helper.i = i;
+            // First iterate through the elements of the arrays that are part of the matrix
+            // and are expected to be modified by a call to BLIS APIs.
+            for( j = 0 ; j < n ; j++ )
+            {
+                comp_helper.j = j;
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*ld + j], ref_sol[i*ld + j], comp_helper);
+            }
+            // Now iterate through the padding of this row (columns n to ld-1). The reference
+            // is initialized as a copy of the same data, so these elements must be identical.
+            for (j = n; j < ld; j++)
+            {
+                comp_helper.j = j; // so that a failure reports the padding element itself
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*ld + j], ref_sol[i*ld + j], comp_helper) << "This element is expected to not be modified.";
+            }
+        }
+    }
+}
+
+/**
+ * Relative comparison of two matrices with dimensions m-by-n and leading dimension ld.
+ * storage 'c' or 'C' selects column-major order, anything else row-major order.
+ * The padding between the matrix and ld is compared bitwise, as it must stay untouched.
+ */
+template <typename T>
+void computediff(char storage, gtint_t m, gtint_t n, T *blis_sol, T *ref_sol, gtint_t ld, double thresh, bool nan_inf_check = false )
+{
+    gtint_t i,j;
+    ComparisonHelper comp_helper(MATRIX, thresh);
+    comp_helper.nan_inf_check = nan_inf_check;
+
+    // Loop for column-major order
+    if( (storage == 'c') || (storage == 'C') )
+    {
+        for( j = 0 ; j < n ; j++ )
+        {
+            comp_helper.j = j;
+            // First iterate through the elements of the arrays that are part of the matrix
+            // and are expected to be modified by a call to BLIS APIs.
+            for( i = 0 ; i < m ; i++ )
+            {
+                comp_helper.i = i;
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i + j*ld], ref_sol[i + j*ld], comp_helper);
+            }
+            // Now iterate through the padding of this column (rows m to ld-1). The reference
+            // is initialized as a copy of the same data, so binary comparison is used here.
+            comp_helper.binary_comparison = true;
+            for (i = m; i < ld; i++)
+            {
+                comp_helper.i = i; // so that a failure reports the padding element itself
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i + j*ld], ref_sol[i + j*ld], comp_helper) << "This element is expected to not be modified.";
+            }
+            // Disable binary comparison before we go through the next column.
+            comp_helper.binary_comparison = false;
+        }
+    }
+    // Loop for row-major order
+    else
+    {
+        for( i = 0 ; i < m ; i++ )
+        {
+            comp_helper.i = i;
+            // First iterate through the elements of the arrays that are part of the matrix
+            // and are expected to be modified by a call to BLIS APIs.
+            for( j = 0 ; j < n ; j++ )
+            {
+                comp_helper.j = j;
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*ld + j], ref_sol[i*ld + j], comp_helper);
+            }
+            // Now iterate through the padding of this row (columns n to ld-1). The reference
+            // is initialized as a copy of the same data, so binary comparison is used here.
+            comp_helper.binary_comparison = true;
+            for (j = n; j < ld; j++)
+            {
+                comp_helper.j = j; // so that a failure reports the padding element itself
+                ASSERT_PRED_FORMAT3(NumericalComparison<T>, blis_sol[i*ld + j], ref_sol[i*ld + j], comp_helper) << "This element is expected to not be modified.";
+            }
+            // Disable binary comparison before we go through the next row.
+            comp_helper.binary_comparison = false;
+        }
+    }
+}
diff --git a/gtestsuite/testsuite/inc/utils.h b/gtestsuite/testsuite/inc/utils.h
new file mode 100644
index 0000000000..ded4e98f92
--- /dev/null
+++ b/gtestsuite/testsuite/inc/utils.h
@@ -0,0 +1,211 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include <cmath>
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/*
+ * ==========================================================================
+ * MKHERM
+ * Make an n x n matrix A explicitly Hermitian by copying the conjugate
+ * of the triangle specified by uplo to the opposite triangle. Imaginary
+ * components of diagonal elements are explicitly set to zero.
+ * It is assumed that the diagonal offset of A is zero.
+ * storage selects column-major ('c'/'C') or, for any other value,
+ * row-major layout; lda is the leading dimension of ap.
+ * Dispatches to bli_?mkherm for T in {float, double, scomplex, dcomplex}.
+ * ==========================================================================
+ */
+template<typename T>
+static void mkherm( char storage, char uplo, gtint_t n, T* ap, gtint_t lda )
+{
+    // Translate the uplo character into the matching BLIS constant.
+    uplo_t  blis_uplo;
+    testinghelpers::char_to_blis_uplo ( uplo, &blis_uplo );
+
+    // Derive the row/column strides of the n x n matrix from its layout.
+    const bool  is_colmajor = ( (storage == 'c') || (storage == 'C') );
+    const dim_t row_stride  = is_colmajor ? 1 : lda;
+    const dim_t col_stride  = is_colmajor ? lda : 1;
+
+    // Dispatch to the typed BLIS routine that corresponds to T.
+    if constexpr (std::is_same<T, float>::value)
+        bli_smkherm( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dmkherm( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cmkherm( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zmkherm( blis_uplo, n, ap, row_stride, col_stride );
+    else
+        // T is none of float, double, scomplex or dcomplex.
+        throw std::runtime_error("Error in utils.h: Invalid typename in mkherm().");
+}
+
+/*
+ * ==========================================================================
+ * MKSYMM
+ * Make an n x n matrix A explicitly symmetric by copying the triangle
+ * specified by uplo to the opposite triangle.
+ * It is assumed that the diagonal offset of A is zero.
+ * storage selects column-major ('c'/'C') or, for any other value,
+ * row-major layout; lda is the leading dimension of ap.
+ * Dispatches to bli_?mksymm for T in {float, double, scomplex, dcomplex}.
+ * ==========================================================================
+ */
+
+template<typename T>
+static void mksymm( char storage, char uplo, gtint_t n, T* ap, gtint_t lda )
+{
+    // Translate the uplo character into the matching BLIS constant.
+    uplo_t  blis_uplo;
+    testinghelpers::char_to_blis_uplo ( uplo, &blis_uplo );
+
+    // Derive the row/column strides of the n x n matrix from its layout.
+    const bool  is_colmajor = ( (storage == 'c') || (storage == 'C') );
+    const dim_t row_stride  = is_colmajor ? 1 : lda;
+    const dim_t col_stride  = is_colmajor ? lda : 1;
+
+    // Dispatch to the typed BLIS routine that corresponds to T.
+    if constexpr (std::is_same<T, float>::value)
+        bli_smksymm( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dmksymm( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cmksymm( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zmksymm( blis_uplo, n, ap, row_stride, col_stride );
+    else
+        // T is none of float, double, scomplex or dcomplex.
+        throw std::runtime_error("Error in utils.h: Invalid typename in mksymm().");
+}
+
+/*
+ * ==========================================================================
+ * MKTRIM
+ * Make an n x n matrix A explicitly triangular by preserving the triangle
+ * specified by uplo and zeroing the elements in the opposite triangle.
+ * It is assumed that the diagonal offset of A is zero.
+ * storage selects column-major ('c'/'C') or, for any other value,
+ * row-major layout; lda is the leading dimension of ap.
+ * Dispatches to bli_?mktrim for T in {float, double, scomplex, dcomplex}.
+ * ==========================================================================
+ */
+template<typename T>
+static void mktrim( char storage, char uplo, gtint_t n, T* ap, gtint_t lda )
+{
+    // Translate the uplo character into the matching BLIS constant.
+    uplo_t  blis_uplo;
+    testinghelpers::char_to_blis_uplo ( uplo, &blis_uplo );
+
+    // Derive the row/column strides of the n x n matrix from its layout.
+    const bool  is_colmajor = ( (storage == 'c') || (storage == 'C') );
+    const dim_t row_stride  = is_colmajor ? 1 : lda;
+    const dim_t col_stride  = is_colmajor ? lda : 1;
+
+    // Dispatch to the typed BLIS routine that corresponds to T.
+    if constexpr (std::is_same<T, float>::value)
+        bli_smktrim( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dmktrim( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cmktrim( blis_uplo, n, ap, row_stride, col_stride );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zmktrim( blis_uplo, n, ap, row_stride, col_stride );
+    else
+        // T is none of float, double, scomplex or dcomplex.
+        throw std::runtime_error("Error in utils.h: Invalid typename in mktrim().");
+}
+
+template<typename T>
+static void print( T x, const char *spec ) {
+    if constexpr (testinghelpers::type_info<T>::is_real)
+        printf(spec, x);
+    else {   // complex values are printed as "<real> +<|imag|> " or "<real> -<|imag|> "
+        printf( spec, x.real );
+        printf( (x.imag < 0) ? " -" : " +" );
+        // std::abs: an unqualified abs() may bind to C's int abs(int) and truncate the value.
+        printf( spec, std::abs(x.imag) );
+        printf( " " );
+    }
+}
+
+template<typename T>
+void printmat( const char *mat, char storage, gtint_t m, gtint_t n, T *a, gtint_t ld, const char *spec )
+{
+    // Column-major storage ('c'/'C') puts consecutive columns ld apart; any
+    // other storage value is treated as row-major, with rows ld apart.
+    const bool  is_colmajor = ( (storage == 'c') || (storage == 'C') );
+    const dim_t rs = is_colmajor ? 1 : ld;
+    const dim_t cs = is_colmajor ? ld : 1;
+
+    std::cout <<"matrix : " <<  mat <<  std::endl;
+
+    // Emit the m x n matrix one row per output line.
+    for ( dim_t row = 0; row < m; ++row )
+    {
+        for ( dim_t col = 0; col < n; ++col )
+        {
+            // Each element is formatted with spec and followed by a blank.
+            const T val = a[row*rs + col*cs];
+            print<T>( val, spec );
+            printf( " " );
+        }
+        printf( "\n" );
+    }
+    // Blank line separating this matrix from any subsequent output.
+    printf( "\n" );
+}
+
+template<typename T>
+void printvec( const char *vec, gtint_t n, T *x, gtint_t incx, const char *spec )
+{
+    std::cout <<"vector : " <<  vec <<  std::endl;
+    for ( dim_t k = 0; k < n; ++k )
+    {
+        // With a positive increment element k is read from offset k * incx; otherwise it
+        // is read from offset (n - 1 - k) * (-incx), i.e. counting back from the far end.
+        dim_t offset = k * incx;
+        if ( !(incx > 0) )
+            offset = - ( n - k - 1 ) * incx;
+        const T val = x[offset];
+        print<T>( val, spec );
+        printf( " " );
+    }
+    printf( "\n\n" );
+}
+
diff --git a/gtestsuite/testsuite/level1/addv/addv.h b/gtestsuite/testsuite/level1/addv/addv.h
new file mode 100644
index 0000000000..ed392dedc5
--- /dev/null
+++ b/gtestsuite/testsuite/level1/addv/addv.h
@@ -0,0 +1,82 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the vector addition
+ *             y := y + x or y := y + conj(x)
+ *        through the BLIS typed API (this operation is not part of BLAS/CBLAS).
+ * @param[in] conj_x selects whether x or conj(x) is added to y
+ * @param[in] n length of the vectors x and y
+ * @param[in] x pointer to the first element of x
+ * @param[in] incx increment of x
+ * @param[in, out] y pointer to the first element of y
+ * @param[in] incy increment of y
+ */
+
+template<typename T>
+static void typed_addv(char conj_x, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+    // Translate the conjugation character into the BLIS constant, then dispatch on T.
+    conj_t blis_conjx;
+    testinghelpers::char_to_blis_conj( conj_x, &blis_conjx );
+    if constexpr (std::is_same<T, float>::value)
+        bli_saddv( blis_conjx, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_daddv( blis_conjx, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_caddv( blis_conjx, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zaddv( blis_conjx, n, x, incx, y, incy );
+    else
+        throw std::runtime_error("Error in testsuite/level1/addv.h: Invalid typename in typed_addv().");
+}
+
+template<typename T>
+static void addv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) // forwards to the interface selected by the TEST_* macros
+{
+#ifdef TEST_BLAS
+    throw std::runtime_error("Error in testsuite/level1/addv.h: BLAS interface is not available."); // no BLAS path exists for addv
+#elif TEST_CBLAS
+    throw std::runtime_error("Error in testsuite/level1/addv.h: CBLAS interface is not available."); // no CBLAS path exists for addv
+#elif TEST_BLIS_TYPED
+    typed_addv(conjx, n, x, incx, y, incy); // the only interface that actually performs the operation
+#else
+    throw std::runtime_error("Error in testsuite/level1/addv.h: No interfaces are set to be tested."); // none of the TEST_* macros is enabled
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/addv/caddv_generic.cpp b/gtestsuite/testsuite/level1/addv/caddv_generic.cpp
new file mode 100644
index 0000000000..94f3621c5b
--- /dev/null
+++ b/gtestsuite/testsuite/level1/addv/caddv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_addv.h"
+
+// Fixture parameterized by the tuple (conj_x, n, incx, incy, datatype).
+class caddvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {};
+
+// The suite is only instantiated when TEST_BLIS_TYPED is defined.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(caddvGenericTest);
+
+TEST_P( caddvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Unpack the values passed through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P):
+    //   conj_x   : whether x or conj(x) will be added to y
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : datatype for the random generators
+    //----------------------------------------------------------
+    const auto& [conj_x, n, incx, incy, datatype] = GetParam();
+
+    //----------------------------------------------------------
+    // Threshold for the errors.
+    //----------------------------------------------------------
+    const double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the generic test body with those parameters.
+    //----------------------------------------------------------
+    test_addv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the name of a test case from its parameter combination.
+class caddvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Tuple layout: conj, n, incx, incy, datatype.
+        const auto& [conj, n, incx, incy, datatype] = str.param;
+        // Positive strides are printed as is; any other stride as "m" followed by its absolute value.
+        auto stride_tag = [](gtint_t inc) -> std::string {
+            if ( inc > 0 ) return std::to_string(inc);
+            return "m" + std::to_string(std::abs(inc));
+        };
+        std::string test_name = "bli_caddv";
+        test_name += "_" + std::to_string(n);
+        test_name += "_" + std::string(1, conj);
+        test_name += "_" + stride_tag(incx);
+        test_name += "_" + stride_tag(incy);
+        test_name += std::string("_") + datatype;
+        return test_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        caddvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used by the random generators (i : integer, f : float)
+        ),
+        ::caddvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/addv/daddv_generic.cpp b/gtestsuite/testsuite/level1/addv/daddv_generic.cpp
new file mode 100644
index 0000000000..e9d5835cba
--- /dev/null
+++ b/gtestsuite/testsuite/level1/addv/daddv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_addv.h"
+
+// Fixture parameterized by the tuple (conj_x, n, incx, incy, datatype).
+class daddvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {};
+
+// The suite is only instantiated when TEST_BLIS_TYPED is defined.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(daddvGenericTest);
+
+TEST_P( daddvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Unpack the values passed through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P):
+    //   conj_x   : whether x or conj(x) will be added to y
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : datatype for the random generators
+    //----------------------------------------------------------
+    const auto& [conj_x, n, incx, incy, datatype] = GetParam();
+
+    //----------------------------------------------------------
+    // Threshold for the errors.
+    //----------------------------------------------------------
+    const double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the generic test body with those parameters.
+    //----------------------------------------------------------
+    test_addv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the name of a test case from its parameter combination.
+class daddvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Tuple layout: conj, n, incx, incy, datatype.
+        const auto& [conj, n, incx, incy, datatype] = str.param;
+        // Positive strides are printed as is; any other stride as "m" followed by its absolute value.
+        auto stride_tag = [](gtint_t inc) -> std::string {
+            if ( inc > 0 ) return std::to_string(inc);
+            return "m" + std::to_string(std::abs(inc));
+        };
+        std::string test_name = "bli_daddv";
+        test_name += "_" + std::to_string(n);
+        test_name += "_" + std::string(1, conj);
+        test_name += "_" + stride_tag(incx);
+        test_name += "_" + stride_tag(incy);
+        test_name += std::string("_") + datatype;
+        return test_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        daddvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x as is (no conjugation)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used by the random generators (i : integer, f : float)
+        ),
+        ::daddvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/addv/saddv_generic.cpp b/gtestsuite/testsuite/level1/addv/saddv_generic.cpp
new file mode 100644
index 0000000000..0d1da47652
--- /dev/null
+++ b/gtestsuite/testsuite/level1/addv/saddv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_addv.h"
+
+// Fixture parameterized by the tuple (conj_x, n, incx, incy, datatype).
+class saddvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {};
+
+// The suite is only instantiated when TEST_BLIS_TYPED is defined.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(saddvGenericTest);
+
+TEST_P( saddvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Unpack the values passed through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P):
+    //   conj_x   : whether x or conj(x) will be added to y
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : datatype for the random generators
+    //----------------------------------------------------------
+    const auto& [conj_x, n, incx, incy, datatype] = GetParam();
+
+    //----------------------------------------------------------
+    // Threshold for the errors.
+    //----------------------------------------------------------
+    const double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the generic test body with those parameters.
+    //----------------------------------------------------------
+    test_addv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the name of a test case from its parameter combination.
+class saddvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Tuple layout: conj, n, incx, incy, datatype.
+        const auto& [conj, n, incx, incy, datatype] = str.param;
+        // Positive strides are printed as is; any other stride as "m" followed by its absolute value.
+        auto stride_tag = [](gtint_t inc) -> std::string {
+            if ( inc > 0 ) return std::to_string(inc);
+            return "m" + std::to_string(std::abs(inc));
+        };
+        std::string test_name = "bli_saddv";
+        test_name += "_" + std::to_string(n);
+        test_name += "_" + std::string(1, conj);
+        test_name += "_" + stride_tag(incx);
+        test_name += "_" + stride_tag(incy);
+        test_name += std::string("_") + datatype;
+        return test_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        saddvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x as is (no conjugation)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used by the random generators (i : integer, f : float)
+        ),
+        ::saddvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/addv/test_addv.h b/gtestsuite/testsuite/level1/addv/test_addv.h
new file mode 100644
index 0000000000..0b66675b65
--- /dev/null
+++ b/gtestsuite/testsuite/level1/addv/test_addv.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "addv.h"
+#include "level1/ref_addv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for addv operation: y := y + x or y := y + conj(x).
+ */
+
+template<typename T>
+void test_addv( char conjx, gtint_t n, gtint_t incx, gtint_t incy,
+               double thresh, char datatype ) {
+
+    //----------------------------------------------------------
+    //   Generate the random input vectors (x first, then y).
+    //----------------------------------------------------------
+    std::vector<T> x_vec = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y_vec = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //   Reference results, computed on a copy of y so that the
+    //   original y remains available for the BLIS call below.
+    //----------------------------------------------------------
+    std::vector<T> y_expected = y_vec;
+    testinghelpers::ref_addv<T>(conjx, n, x_vec.data(), incx, y_expected.data(), incy);
+
+    //----------------------------------------------------------
+    //   Run the implementation under test on y.
+    //----------------------------------------------------------
+    addv<T>(conjx, n, x_vec.data(), incx, y_vec.data(), incy);
+
+    //----------------------------------------------------------
+    //   Compare y against the reference component-wise.
+    //----------------------------------------------------------
+    computediff<T>( n, y_vec.data(), y_expected.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp b/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp
new file mode 100644
index 0000000000..ed7796d36b
--- /dev/null
+++ b/gtestsuite/testsuite/level1/addv/zaddv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_addv.h"
+
+class ZAddvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {}; // (conjx, n, incx, incy, datatype)
+// The only instantiation in this file is guarded by TEST_BLIS_TYPED, so the suite may be left uninstantiated.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ZAddvGenericTest);
+
+TEST_P( ZAddvGenericTest, RandomData )
+{
+    using T = dcomplex; // element type exercised by this suite
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // datatype selector forwarded to test_addv for random data generation:
+    char datatype = std::get<4>(GetParam());
+
+    // Error threshold forwarded to test_addv:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_addv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Name generator: builds "bli_zaddv_<n>_<conjx>_<incx>_<incy>_<datatype>" for each parameter combination.
+class ZAddvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        char datatype  = std::get<4>(str.param);
+        std::string str_name = "bli_zaddv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1); // one-character string holding conj
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); // non-positive stride is printed as "m" + |incx|
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); // same convention for incy
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing (only compiled when TEST_BLIS_TYPED is defined).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ZAddvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // conjx - n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::ZAddvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/amaxv/amaxv.h b/gtestsuite/testsuite/level1/amaxv/amaxv.h
new file mode 100644
index 0000000000..4479263e2b
--- /dev/null
+++ b/gtestsuite/testsuite/level1/amaxv/amaxv.h
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Finds the index of the first element that has the maximum absolute value.
+ * @param[in] n vector length of x
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ *
+ * If n < 1 or incx <= 0, return 0.
+ */
+
+template<typename T>
+static gtint_t amaxv_(gtint_t n, T* x, gtint_t incx) {
+    // Calls the i?amax_ routine matching T and returns its result shifted by -1.
+    gtint_t idx;
+    if constexpr (std::is_same<T, float>::value)
+        idx = isamax_( &n, x, &incx );
+    else if constexpr (std::is_same<T, double>::value)
+        idx = idamax_( &n, x, &incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        idx = icamax_( &n, x, &incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        idx = izamax_( &n, x, &incx );
+    else
+      throw std::runtime_error("Error in testsuite/level1/amaxv.h: Invalid typename in amaxv_().");
+
+    // The BLAS result is 1-based while the CBLAS result it is compared with is
+    // 0-based, so subtract 1 to put both on the same convention.
+    return (idx-1);
+}
+
+template<typename T>
+static gtint_t cblas_amaxv(gtint_t n, T* x, gtint_t incx) {
+    // Calls the cblas_i?amax routine matching T and returns its result unchanged.
+    gtint_t idx;
+    if constexpr (std::is_same<T, float>::value)
+      idx = cblas_isamax( n, x, incx );
+    else if constexpr (std::is_same<T, double>::value)
+      idx = cblas_idamax( n, x, incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+      idx = cblas_icamax( n, x, incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+      idx = cblas_izamax( n, x, incx );
+    else
+      throw std::runtime_error("Error in testsuite/level1/amaxv.h: Invalid typename in cblas_amaxv().");
+
+    return idx;
+}
+
+template<typename T>
+static gtint_t typed_amaxv(gtint_t n, T* x, gtint_t incx)
+{   // Calls the BLIS typed routine bli_?amaxv matching T and returns the index it reports.
+    gtint_t idx = 0; // written by bli_?amaxv through the &idx output argument
+    if constexpr (std::is_same<T, float>::value)
+        bli_samaxv( n, x, incx, &idx );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_damaxv( n, x, incx, &idx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_camaxv( n, x, incx, &idx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zamaxv( n, x, incx, &idx );
+    else // unsupported T: report the same header name as the sibling wrappers do
+        throw std::runtime_error("Error in testsuite/level1/amaxv.h: Invalid typename in typed_amaxv().");
+
+    return idx;
+}
+
+template<typename T>
+static gtint_t amaxv(gtint_t n, T* x, gtint_t incx)
+{   // Forwards to the wrapper of whichever interface macro is set at compile time.
+#ifdef TEST_BLAS
+    return amaxv_<T>(n, x, incx);          // BLAS path (result already shifted by -1 in amaxv_)
+#elif TEST_CBLAS
+    return cblas_amaxv<T>(n, x, incx);     // CBLAS path
+#elif TEST_BLIS_TYPED
+    return typed_amaxv(n, x, incx);        // BLIS typed path; T is deduced from x
+#else
+    throw std::runtime_error("Error in testsuite/level1/amaxv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp
new file mode 100644
index 0000000000..d6dcd7f282
--- /dev/null
+++ b/gtestsuite/testsuite/level1/amaxv/camaxv_generic.cpp
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_amaxv.h"
+
+class camaxvGenericTest :
+        public ::testing::TestWithParam<std::tuple<gtint_t,     // n: vector length
+                                                   gtint_t,     // incx: stride size for x
+                                                   char>> {};   // datatype selector for the random generator
+
+// Runs the generic amaxv test body for scomplex with the instantiated parameters.
+TEST_P( camaxvGenericTest, RandomData )
+{
+    using T = scomplex; // element type exercised by this suite
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<1>(GetParam());
+    // datatype selector forwarded to test_amaxv for random data generation:
+    char datatype = std::get<2>(GetParam());
+
+    // Threshold forwarded to test_amaxv:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_amaxv<T>(n, incx, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name of the form
+//   <prefix>_<n>_<incx>_<datatype>
+// where <prefix> is "icamax_", "cblas_icamax" or "bli_camaxv" depending on the
+// interface macro, and a non-positive incx is printed as "m" + |incx|.
+class camaxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,gtint_t,char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        gtint_t incx  = std::get<1>(str.param);
+        char datatype = std::get<2>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "icamax_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_icamax";
+#else  // fallback prefix, used for TEST_BLIS_TYPED
+        std::string str_name = "bli_camaxv";
+#endif
+        str_name += "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of camaxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        camaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::camaxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        camaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::camaxvGenericTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp
new file mode 100644
index 0000000000..f95871c8d2
--- /dev/null
+++ b/gtestsuite/testsuite/level1/amaxv/damaxv_generic.cpp
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_amaxv.h"
+
+class damaxvGenericTest :
+        public ::testing::TestWithParam<std::tuple<gtint_t,     // n: vector length
+                                                   gtint_t,     // incx: stride size for x
+                                                   char>> {};   // datatype selector for the random generator
+
+// Runs the generic amaxv test body for double with the instantiated parameters.
+TEST_P( damaxvGenericTest, RandomData )
+{
+    using T = double; // element type exercised by this suite
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<1>(GetParam());
+    // datatype selector forwarded to test_amaxv for random data generation:
+    char datatype = std::get<2>(GetParam());
+
+    // Threshold forwarded to test_amaxv:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_amaxv<T>(n, incx, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name of the form
+//   <prefix>_<n>_<incx>_<datatype>
+// where <prefix> is "idamax_", "cblas_idamax" or "bli_damaxv" depending on the
+// interface macro, and a non-positive incx is printed as "m" + |incx|.
+class damaxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,gtint_t,char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        gtint_t incx  = std::get<1>(str.param);
+        char datatype = std::get<2>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "idamax_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_idamax";
+#else  // fallback prefix, used for TEST_BLIS_TYPED
+        std::string str_name = "bli_damaxv";
+#endif
+        str_name += "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of damaxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        damaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::damaxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        damaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::damaxvGenericTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp
new file mode 100644
index 0000000000..11aa87c216
--- /dev/null
+++ b/gtestsuite/testsuite/level1/amaxv/samaxv_generic.cpp
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_amaxv.h"
+
+class samaxvGenericTest :
+        public ::testing::TestWithParam<std::tuple<gtint_t,     // n: vector length
+                                                   gtint_t,     // incx: stride size for x
+                                                   char>> {};   // datatype selector for the random generator
+
+// Runs the generic amaxv test body for float with the instantiated parameters.
+TEST_P( samaxvGenericTest, RandomData )
+{
+    using T = float; // element type exercised by this suite
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<1>(GetParam());
+    // datatype selector forwarded to test_amaxv for random data generation:
+    char datatype = std::get<2>(GetParam());
+
+    // Threshold forwarded to test_amaxv:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_amaxv<T>(n, incx, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name of the form
+//   <prefix>_<n>_<incx>_<datatype>
+// where <prefix> is "isamax_", "cblas_isamax" or "bli_samaxv" depending on the
+// interface macro, and a non-positive incx is printed as "m" + |incx|.
+class samaxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,gtint_t,char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        gtint_t incx  = std::get<1>(str.param);
+        char datatype = std::get<2>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "isamax_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_isamax";
+#else  // fallback prefix, used for TEST_BLIS_TYPED
+        std::string str_name = "bli_samaxv";
+#endif
+        str_name += "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of samaxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        samaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::samaxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        samaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::samaxvGenericTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level1/amaxv/test_amaxv.h b/gtestsuite/testsuite/level1/amaxv/test_amaxv.h
new file mode 100644
index 0000000000..e723cc33da
--- /dev/null
+++ b/gtestsuite/testsuite/level1/amaxv/test_amaxv.h
@@ -0,0 +1,67 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "amaxv.h"
+#include "level1/ref_amaxv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for amaxv operation.
+ */
+
+template<typename T>
+void test_amaxv( gtint_t n, gtint_t incx, double thresh, char datatype ) {
+    // Compares the index returned by amaxv with the one from ref_amaxv; thresh is not used in this body.
+    //----------------------------------------------------------
+    //        Initialize vector x with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    gtint_t idx_ref = testinghelpers::ref_amaxv<T>(n, x.data(), incx);
+
+    //----------------------------------------------------------
+    //          Call the amaxv() interface under test.
+    //----------------------------------------------------------
+    gtint_t idx = amaxv(n, x.data(), incx);
+
+    //----------------------------------------------------------
+    //     Compare the returned index with the reference index.
+    //----------------------------------------------------------
+    computediff<gtint_t>( idx, idx_ref );
+}
diff --git a/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp b/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp
new file mode 100644
index 0000000000..d8534c3da6
--- /dev/null
+++ b/gtestsuite/testsuite/level1/amaxv/zamaxv_generic.cpp
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_amaxv.h"
+
+class zamaxvGenericTest :
+        public ::testing::TestWithParam<std::tuple<gtint_t,     // n: vector length
+                                                   gtint_t,     // incx: stride size for x
+                                                   char>> {};   // datatype selector for the random generator
+
+// Runs the generic amaxv test body for dcomplex with the instantiated parameters.
+TEST_P( zamaxvGenericTest, RandomData )
+{
+    using T = dcomplex; // element type exercised by this suite
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<1>(GetParam());
+    // datatype selector forwarded to test_amaxv for random data generation:
+    char datatype = std::get<2>(GetParam());
+
+    // Threshold forwarded to test_amaxv:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_amaxv<T>(n, incx, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name of the form
+//   <prefix>_<n>_<incx>_<datatype>
+// where <prefix> is "izamax_", "cblas_izamax" or "bli_zamaxv" depending on the
+// interface macro, and a non-positive incx is printed as "m" + |incx|.
+class zamaxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,gtint_t,char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        gtint_t incx  = std::get<1>(str.param);
+        char datatype = std::get<2>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "izamax_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_izamax";
+#else  // fallback prefix, used for TEST_BLIS_TYPED
+        std::string str_name = "bli_zamaxv";
+#endif
+        str_name += "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of zamaxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zamaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::zamaxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        zamaxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype selector (i : integer, f : float), taken from ELEMENT_TYPE
+        ),
+        ::zamaxvGenericTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level1/axpbyv/axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/axpbyv.h
new file mode 100644
index 0000000000..0c415e1b0c
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpbyv/axpbyv.h
@@ -0,0 +1,114 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             y := beta * y + alpha * x
+ *          or y := beta * y + alpha * conj(x) (BLIS_TYPED only)
+ * @param[in] conj_x denotes if x or conj(x) will be used for this operation (BLIS typed API only)
+ * @param[in] n vector length of x and y
+ * @param[in] alpha scalar that multiplies x
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[in] beta scalar that multiplies y
+ * @param[in, out] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ */
+// BLAS wrapper: dispatches on T to ?axpby_, passing n, the scalars and the increments by address.
+template<typename T>
+static void axpbyv_(gtint_t n, T alpha, T* x, gtint_t incx, T beta, T* y, gtint_t incy)
+{
+    if constexpr (std::is_same<T, float>::value)
+        saxpby_( &n, &alpha, x, &incx, &beta, y, &incy );
+    else if constexpr (std::is_same<T, double>::value)
+        daxpby_( &n, &alpha, x, &incx, &beta, y, &incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        caxpby_( &n, &alpha, x, &incx, &beta, y, &incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zaxpby_( &n, &alpha, x, &incx, &beta, y, &incy );
+    else // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/axpbyv.h: Invalid typename in axpbyv_().");
+}
+
+template<typename T>
+static void cblas_axpbyv(gtint_t n, T alpha, T* x, gtint_t incx, T beta, T* y, gtint_t incy)
+{   // CBLAS wrapper: dispatches on T to the matching cblas_?axpby routine.
+    if constexpr (std::is_same<T, float>::value)
+        cblas_saxpby( n, alpha, x, incx, beta, y, incy );       // real scalars are passed by value
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_daxpby( n, alpha, x, incx, beta, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        cblas_caxpby( n, &alpha, x, incx, &beta, y, incy );     // complex scalars are passed by address
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zaxpby( n, &alpha, x, incx, &beta, y, incy );
+    else // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/axpbyv.h: Invalid typename in cblas_axpbyv().");
+}
+
+template<typename T>
+static void typed_axpbyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T beta, T* y, gtint_t incy)
+{   // BLIS typed-API wrapper: dispatches on T to the matching bli_?axpbyv routine.
+    conj_t conjx;
+    // Map the conj_x parameter character to the corresponding BLIS conj_t constant.
+    testinghelpers::char_to_blis_conj( conj_x, &conjx );
+    if constexpr (std::is_same<T, float>::value)
+        bli_saxpbyv( conjx, n, &alpha, x, incx, &beta, y, incy );   // scalars are passed by address for every type
+    else if constexpr (std::is_same<T, double>::value)
+        bli_daxpbyv( conjx, n, &alpha, x, incx, &beta, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_caxpbyv( conjx, n, &alpha, x, incx, &beta, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zaxpbyv( conjx, n, &alpha, x, incx, &beta, y, incy );
+    else // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/axpbyv.h: Invalid typename in typed_axpbyv().");
+}
+
+template<typename T>
+static void axpbyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T beta, T* y, gtint_t incy)
+{   // Entry point used by the tests: forwards to the wrapper of the interface selected at compile time.
+#ifdef TEST_BLAS
+    axpbyv_<T>( n, alpha, x, incx, beta, y, incy );                  // conj_x is not forwarded on this path
+#elif TEST_CBLAS
+    cblas_axpbyv<T>( n, alpha, x, incx, beta, y, incy );             // conj_x is not forwarded on this path
+#elif TEST_BLIS_TYPED
+    typed_axpbyv<T>( conj_x, n, alpha, x, incx, beta, y, incy );
+#else
+    throw std::runtime_error("Error in testsuite/level1/axpbyv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp
new file mode 100644
index 0000000000..e4a4c80c03
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpbyv/caxpbyv_generic.cpp
@@ -0,0 +1,177 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpbyv.h"
+
+class caxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x
+                                                   gtint_t,    // n
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   scomplex,   // alpha
+                                                   scomplex,   // beta
+                                                   char>> {};  // datatype
+// Tests using random data as vector elements (see the datatype parameter).
+TEST_P( caxpbyvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha (scalar that multiplies x):
+    T alpha = std::get<4>(GetParam());
+    // beta (scalar that multiplies y):
+    T beta = std::get<5>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<6>(GetParam());
+
+    // Set the threshold for the errors (twice the epsilon reported for T):
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpbyv<T>(conj_x, n, incx, incy, alpha, beta, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names,
+// so we are only printing int(2.3). If this poses an issue, please reach out.
+// Negative values print as "m" plus the absolute value; zero is non-negative and prints as "0".
+class caxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,scomplex,scomplex,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        scomplex alpha = std::get<4>(str.param);
+        scomplex beta  = std::get<5>(str.param);
+        char datatype  = std::get<6>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "caxpby_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_caxpby";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_caxpbyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy >= 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha.real >= 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); // real part
+                    alpha_str = alpha_str + "pi" + (( alpha.imag >= 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); // "pi" separates real and imaginary parts
+        std::string beta_str = ( beta.real >= 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); // real part
+                    beta_str = beta_str + "pi" + (( beta.imag >= 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag))))); // same encoding as alpha
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for the generic, unit-stride use of caxpby.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        caxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'                                            // n: use x, c: use conj(x)
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length: 10 to 100 in steps of 10 (the end value 101 is exclusive).
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(scomplex{1.0, 2.0}),                           // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::caxpbyvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        caxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // vector length: 10, 20 and 30 (the end value 31 is exclusive).
+            ::testing::Values(gtint_t(2)),                                   // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(scomplex{1.0, -2.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::caxpbyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (compiled only when the BLIS typed interface is not the one under test).
+// Only test very few cases as sanity check.
+// The strides are combined as a Cartesian product, so the all-positive pair (5, 7) is run as well.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        caxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // vector length: 10, 20 and 30 (the end value 31 is exclusive).
+            ::testing::Values(gtint_t(-11), gtint_t(5)),                    // stride size for x
+            ::testing::Values(gtint_t(-3), gtint_t(7)),                      // stride size for y
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(scomplex{1.0, -2.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::caxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp
new file mode 100644
index 0000000000..efc2770ab2
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpbyv/daxpbyv_generic.cpp
@@ -0,0 +1,191 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpbyv.h"
+
+class daxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x
+                                                   gtint_t,    // n
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   double,     // alpha
+                                                   double,     // beta
+                                                   char>> {};  // datatype
+// Tests using random data as vector elements (see the datatype parameter).
+TEST_P( daxpbyvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha (scalar that multiplies x):
+    T alpha = std::get<4>(GetParam());
+    // beta (scalar that multiplies y):
+    T beta = std::get<5>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<6>(GetParam());
+
+    // Set the threshold for the errors (the epsilon reported for T):
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpbyv<T>(conj_x, n, incx, incy, alpha, beta, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names,
+// so we are only printing int(2.3). If this poses an issue, please reach out.
+// Negative values print as "m" plus the absolute value; zero is non-negative and prints as "0".
+class daxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,double,double,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        double alpha  = std::get<4>(str.param);
+        double beta   = std::get<5>(str.param);
+        char datatype = std::get<6>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "daxpby_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_daxpby";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_daxpbyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy >= 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha >= 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); // truncated to an integer
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta >= 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); // truncated to an integer
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for the generic, unit-stride use of daxpby.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        daxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                   // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),          // vector length: 10 to 100 in steps of 10 (the end value 101 is exclusive).
+            ::testing::Values(gtint_t(1)),                            // stride size for x
+            ::testing::Values(gtint_t(1)),                            // stride size for y
+            ::testing::Values(double(2.0), double(-2.0)),             // alpha
+            ::testing::Values(double(-1.0)),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                    // datatype of the test data (i : integer, f : float)
+        ),
+        ::daxpbyvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        daxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(2.0)),                                  // alpha
+            ::testing::Values(double(1.0)),                                  // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::daxpbyvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        daxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // vector length: 10, 20 and 30 (the end value 31 is exclusive).
+            ::testing::Values(gtint_t(7)),                                   // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(4.0),                                          // alpha
+            ::testing::Values(-2.0),                                         // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::daxpbyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (compiled only when the BLIS typed interface is not the one under test).
+// Only test very few cases as sanity check.
+// The strides are combined as a Cartesian product, so the all-positive pair (11, 4) is run as well.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        daxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x (the only value exercised here)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // vector length: 10, 20 and 30 (the end value 31 is exclusive).
+            ::testing::Values(gtint_t(11), gtint_t(-11)),                    // stride size for x
+            ::testing::Values(gtint_t(-3), gtint_t(4)),                      // stride size for y
+            ::testing::Values(4.0),                                          // alpha
+            ::testing::Values(-2.0),                                         // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::daxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp
new file mode 100644
index 0000000000..6f0cf3b8be
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpbyv/saxpbyv_generic.cpp
@@ -0,0 +1,187 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpbyv.h"
+
+class saxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x
+                                                   gtint_t,    // n
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   float,      // alpha
+                                                   float,      // beta
+                                                   char>> {};  // datatype
+// Tests using random data as vector elements (see the datatype parameter).
+TEST_P( saxpbyvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha (scalar that multiplies x):
+    T alpha = std::get<4>(GetParam());
+    // beta (scalar that multiplies y):
+    T beta = std::get<5>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<6>(GetParam());
+
+    // Set the threshold for the errors (the epsilon reported for T, held as a float here):
+    float thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpbyv<T>(conj_x, n, incx, incy, alpha, beta, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names,
+// so we are only printing int(2.3). If this poses an issue, please reach out.
+// Negative values print as "m" plus the absolute value; zero is non-negative and prints as "0".
+class saxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,float,float,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        float alpha   = std::get<4>(str.param);
+        float beta    = std::get<5>(str.param);
+        char datatype = std::get<6>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "saxpby_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_saxpby";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_saxpbyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy >= 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha >= 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); // truncated to an integer
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta >= 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))); // truncated to an integer
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for the generic, unit-stride use of saxpby.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        saxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length: 10 to 100 in steps of 10 (the end value 101 is exclusive).
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(2.0), float(-2.0)),                      // alpha
+            ::testing::Values(float(-1.0)),                                  // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::saxpbyvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        saxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(2.0)),                                   // alpha
+            ::testing::Values(float(1.0)),                                   // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::saxpbyvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        saxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(11)),                                  // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(float(4.0)),                                   // alpha
+            ::testing::Values(float(2.0)),                                   // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::saxpbyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (compiled only when the BLIS typed interface is not the one under test).
+// Only test very few cases as sanity check.
+// The strides are combined as a Cartesian product, so the all-positive pair (11, 4) is run as well.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        saxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x (the only value exercised here)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // vector length: 10, 20 and 30 (the end value 31 is exclusive).
+            ::testing::Values(gtint_t(11), gtint_t(-11)),                    // stride size for x
+            ::testing::Values(gtint_t(-3), gtint_t(4)),                      // stride size for y
+            ::testing::Values(4.0),                                          // alpha (double literal; the fixture's tuple holds it as float)
+            ::testing::Values(-2.0),                                         // beta (double literal; the fixture's tuple holds it as float)
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::saxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h
new file mode 100644
index 0000000000..cf6156f141
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpbyv/test_axpbyv.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "axpbyv.h"
+#include "level1/ref_axpbyv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for axpby operation: runs the reference and the tested axpbyv on the same random x and y, then compares y against the reference within thresh.
+ */
+
+template<typename T>
+static void test_axpbyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy,
+    T alpha, T beta, double thresh, char datatype ) {
+
+    //----------------------------------------------------------
+    //  Initialize x and y with random numbers (bounds -10, 10).
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    // Copy y so that the reference works on y_ref while the tested routine works on y.
+    std::vector<T> y_ref(y);
+    testinghelpers::ref_axpbyv<T>(conjx, n, alpha, x.data(), incx, beta, y_ref.data(), incy);
+
+    //----------------------------------------------------------
+    //       Call BLIS function (result is read back from y).
+    //----------------------------------------------------------
+    axpbyv<T>(conjx, n, alpha, x.data(), incx, beta, y.data(), incy);
+
+    //----------------------------------------------------------
+    //      Compute component-wise error of y against y_ref.
+    //----------------------------------------------------------
+    computediff<T>( n, y.data(), y_ref.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp
new file mode 100644
index 0000000000..690b7d4784
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpbyv/zaxpbyv_generic.cpp
@@ -0,0 +1,177 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpbyv.h"
+
+class zaxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x: use x or conj(x)
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   dcomplex,   // alpha
+                                                   dcomplex,   // beta
+                                                   char>> {};  // datatype for the random generators
+// Tests using random vector elements; their kind is selected by the datatype parameter.
+TEST_P( zaxpbyvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha
+    T alpha = std::get<4>(GetParam());
+    // beta
+    T beta = std::get<5>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype = std::get<6>(GetParam());
+
+    // Set the threshold for the errors (twice testinghelpers::getEpsilon<T>()):
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpbyv<T>(conj_x, n, incx, incy, alpha, beta, thresh, datatype);
+}
+
+// Used to generate a test case name of the form <api>_<n>_<conj>_<incx>_<incy>_a<alpha>_b<beta>_<datatype>.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3);
+// values that are not > 0 are printed as "m" plus their magnitude (-3 -> m3, 0 -> m0), complex scalars as <real>pi<imag>.
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class zaxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,dcomplex,dcomplex,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        dcomplex alpha = std::get<4>(str.param);
+        dcomplex beta  = std::get<5>(str.param);
+        char datatype  = std::get<6>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zaxpby_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zaxpby";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_zaxpbyv";
+#endif
+        str_name += "_" + std::to_string(n);       // vector length
+        str_name += "_" + std::string(&conj, 1);   // conj character as a one-character string
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of zaxpby (unit strides, two alpha values).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zaxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'                                            // n: use x, c: use conj(x)
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(dcomplex{-3.0, 1.0}, dcomplex{1.0, 2.0}),      // alpha
+            ::testing::Values(dcomplex{1.0, 2.0}),                           // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zaxpbyvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        zaxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // size of vector takes values 10, 20, 30 (Range end is exclusive).
+            ::testing::Values(gtint_t(2)),                                   // stride size for x
+            ::testing::Values(gtint_t(4)),                                   // stride size for y
+            ::testing::Values(dcomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(dcomplex{1.0, 2.0}),                           // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zaxpbyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (compiled only when TEST_BLIS_TYPED is not defined).
+// Only test very few cases as sanity check; Combine also yields the all-positive pair incx=11, incy=4.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zaxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x (conj(x) is not exercised here)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // size of vector takes values 10, 20, 30 (Range end is exclusive).
+            ::testing::Values(gtint_t(11), gtint_t(-11)),                    // stride size for x
+            ::testing::Values(gtint_t(-3), gtint_t(4)),                      // stride size for y
+            ::testing::Values(dcomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(dcomplex{1.0, -2.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zaxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpyv/axpyv.h b/gtestsuite/testsuite/level1/axpyv/axpyv.h
new file mode 100644
index 0000000000..10e56cae15
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpyv/axpyv.h
@@ -0,0 +1,113 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             y := y + alpha * x
+ *          or y := y + alpha * conj(x) BLIS_TYPED only
+ * @param[in] conjx denotes if x or conj(x) will be used for this operation (BLIS API specific)
+ * @param[in] n vector length of x and y
+ * @param[in] alpha scalar
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[in, out] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ */
+
+template<typename T>
+static void axpyv_(gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{   // BLAS interface wrapper: dispatches on T at compile time; n, alpha, incx and incy are passed by address.
+    if constexpr (std::is_same<T, float>::value)
+        saxpy_( &n, &alpha, x, &incx, y, &incy );
+    else if constexpr (std::is_same<T, double>::value)
+        daxpy_( &n, &alpha, x, &incx, y, &incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        caxpy_( &n, &alpha, x, &incx, y, &incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zaxpy_( &n, &alpha, x, &incx, y, &incy );
+    else   // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/axpyv.h: Invalid typename in axpyv_().");
+}
+
+template<typename T>
+static void cblas_axpyv(gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{   // CBLAS interface wrapper: real alpha is passed by value, complex alpha by address.
+    if constexpr (std::is_same<T, float>::value)
+        cblas_saxpy( n, alpha, x, incx, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_daxpy( n, alpha, x, incx, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        cblas_caxpy( n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zaxpy( n, &alpha, x, incx, y, incy );
+    else   // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/axpyv.h: Invalid typename in cblas_axpyv().");
+}
+
+template<typename T>
+static void typed_axpyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{   // BLIS typed API wrapper: alpha is passed by address for every T.
+    conj_t conjx;   // BLIS conjugation constant filled in from conj_x below
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conj_x, &conjx );
+    if constexpr (std::is_same<T, float>::value)
+        bli_saxpyv( conjx, n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_daxpyv( conjx, n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_caxpyv( conjx, n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zaxpyv( conjx, n, &alpha, x, incx, y, incy );
+    else   // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/axpyv.h: Invalid typename in typed_axpyv().");
+}
+
+template<typename T>
+static void axpyv(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{   // Forwards to the interface selected by the preprocessor; conj_x is only forwarded to the BLIS typed wrapper.
+#ifdef TEST_BLAS
+    axpyv_<T>( n, alpha, x, incx, y, incy );
+#elif TEST_CBLAS
+    cblas_axpyv<T>( n, alpha, x, incx, y, incy );
+#elif TEST_BLIS_TYPED
+    typed_axpyv<T>( conj_x, n, alpha, x, incx, y, incy );
+#else  // no interface macro is enabled: fail at run time
+    throw std::runtime_error("Error in testsuite/level1/axpyv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp
new file mode 100644
index 0000000000..77cd26c285
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpyv/caxpyv_generic.cpp
@@ -0,0 +1,167 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpyv.h"
+
+class caxpyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x: use x or conj(x)
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   scomplex,   // alpha
+                                                   char>> {};  // datatype for the random generators
+// Tests using random vector elements; their kind is selected by the datatype parameter.
+TEST_P( caxpyvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha
+    T alpha = std::get<4>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (twice testinghelpers::getEpsilon<T>()):
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpyv<T>(conj_x, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Used to generate a test case name of the form <api>_<n>_<conj>_<incx>_<incy>_a<alpha>_<datatype>.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3);
+// values that are not > 0 are printed as "m" plus their magnitude (-3 -> m3, 0 -> m0), complex alpha as <real>pi<imag>.
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class caxpyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,scomplex,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        scomplex alpha = std::get<4>(str.param);
+        char datatype  = std::get<5>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "caxpy_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_caxpy";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_caxpyv";
+#endif
+        str_name += "_" + std::to_string(n);       // vector length
+        str_name += "_" + std::string(&conj, 1);   // conj character as a one-character string
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                  alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of caxpy (unit strides, two alpha values).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        caxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::caxpyvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        caxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(2)),                                   // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::caxpyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (compiled only when TEST_BLIS_TYPED is not defined).
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        caxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x (conj(x) is not exercised here)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-4)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::caxpyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp
new file mode 100644
index 0000000000..792d582782
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpyv/daxpyv_generic.cpp
@@ -0,0 +1,177 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpyv.h"
+
+class daxpyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x: use x or conj(x)
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   double,     // alpha
+                                                   char>> {};  // datatype for the random generators
+// Tests using random vector elements; their kind is selected by the datatype parameter.
+TEST_P( daxpyvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha
+    T alpha = std::get<4>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (one testinghelpers::getEpsilon<T>()):
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpyv<T>(conj_x, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Used to generate a test case name of the form <api>_<n>_<conj>_<incx>_<incy>_a<alpha>_<datatype>.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3);
+// values that are not > 0 are printed as "m" plus their magnitude (-3 -> m3, 0 -> m0).
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class daxpyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,double,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        double alpha  = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "daxpy_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_daxpy";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_daxpyv";
+#endif
+        str_name += "_" + std::to_string(n);       // vector length
+        str_name += "_" + std::string(&conj, 1);   // conj character as a one-character string
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of daxpy (unit strides, two alpha values).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        daxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(2.0), double(-2.0)),                    // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::daxpyvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        daxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // size of vector
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(2.0)),                                  // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::daxpyvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        daxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // size of vector
+            ::testing::Values(gtint_t(2)),                                   // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(double(4.0)),                                  // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::daxpyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (compiled only when TEST_BLIS_TYPED is not defined).
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        daxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-4)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(4.0),                                          // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::daxpyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp
new file mode 100644
index 0000000000..67699e8337
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpyv/saxpyv_generic.cpp
@@ -0,0 +1,177 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpyv.h"
+
+class saxpyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x: whether x or conj(x) is added to y
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   float,      // alpha
+                                                   char>> {};  // datatype for the random generators
+// Tests saxpyv on randomly generated vectors; the kind of elements is selected by the datatype parameter.
+TEST_P( saxpyvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha
+    T alpha = std::get<4>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (the value returned by getEpsilon for T):
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpyv<T>(conj_x, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name: <api>_<n>_<conj>_<incx>_<incy>_a<alpha>_<datatype>.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// Values that are not > 0 (negative and zero) are printed as "m" followed by their absolute value.
+// If this poses an issue, please reach out.
+class saxpyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,float,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        float alpha   = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+#ifdef TEST_BLAS
+      std::string str_name = "saxpy_";
+#elif TEST_CBLAS
+      std::string str_name = "cblas_saxpy";
+#else  //#elif TEST_BLIS_TYPED
+      std::string str_name = "bli_saxpyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of saxpy: unit strides, positive and negative alpha.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        saxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(2.0), float(-2.0)),                      // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::saxpyvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        saxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(2.0)),                                   // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::saxpyvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit positive increments. Only test very few cases as sanity check.
+// Negative increments are deliberately left to the NegativeIncrements suite, which is
+// not built for TEST_BLIS_TYPED; this suite is built for every interface.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        saxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(2)),                                   // stride size for x (positive only)
+            ::testing::Values(gtint_t(3)),                                   // stride size for y (positive only)
+            ::testing::Values(float(4.0)),                                   // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::saxpyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (skipped when TEST_BLIS_TYPED is defined).
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        saxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-4)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(float(4.0)),                                   // alpha (float, matching the suite's parameter type)
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::saxpyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/axpyv/test_axpyv.h b/gtestsuite/testsuite/level1/axpyv/test_axpyv.h
new file mode 100644
index 0000000000..a2d6af583f
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpyv/test_axpyv.h
@@ -0,0 +1,70 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "axpyv.h"
+#include "level1/ref_axpyv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for axpyv operation: builds x and y, runs the reference and the tested axpyv on them, then compares the two y results against thresh.
+ */
+
+template<typename T>
+static void test_axpyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy,
+    T alpha, double thresh, char datatype ) {
+    //----------------------------------------------------------
+    //        Initialize vectors with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    // Create a copy of y so that the reference and the tested routine start from the same data.
+    std::vector<T> y_ref(y);
+    testinghelpers::ref_axpyv<T>(conjx, n, alpha, x.data(), incx, y_ref.data(), incy);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    axpyv<T>(conjx, n, alpha, x.data(), incx, y.data(), incy);
+
+    //----------------------------------------------------------
+    //              Compute component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( n, y.data(), y_ref.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp b/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp
new file mode 100644
index 0000000000..a8cf1a6983
--- /dev/null
+++ b/gtestsuite/testsuite/level1/axpyv/zaxpyv_generic.cpp
@@ -0,0 +1,166 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_axpyv.h"
+
+class zaxpyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_x: whether x or conj(x) is added to y
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   dcomplex,   // alpha
+                                                   char>> {};  // datatype for the random generators
+// Tests zaxpyv on randomly generated vectors; the kind of elements is selected by the datatype parameter.
+TEST_P( zaxpyvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // alpha
+    T alpha = std::get<4>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (twice the value returned by getEpsilon for T):
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_axpyv<T>(conj_x, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name: <api>_<n>_<conj>_<incx>_<incy>_a<re>pi<im>_<datatype>.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// Values that are not > 0 (negative and zero) are printed as "m" followed by their absolute value.
+// If this poses an issue, please reach out.
+class zaxpyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,dcomplex,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        dcomplex alpha = std::get<4>(str.param);
+        char datatype  = std::get<5>(str.param);
+#ifdef TEST_BLAS
+      std::string str_name = "zaxpy_";
+#elif TEST_CBLAS
+      std::string str_name = "cblas_zaxpy";
+#else  //#elif TEST_BLIS_TYPED
+      std::string str_name = "bli_zaxpyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                  alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of zaxpy: unit strides, two complex alpha values.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zaxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(dcomplex{-3.0, 1.0}, dcomplex{1.0, 2.0}),      // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::zaxpyvGenericTestPrint()
+    );
+
+// Test for non-unit positive increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        zaxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(2)),                                   // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(dcomplex{-1.0, 2.0}),                          // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::zaxpyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (skipped when TEST_BLIS_TYPED is defined).
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zaxpyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(-4)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(dcomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::zaxpyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp
new file mode 100644
index 0000000000..5186cdecb5
--- /dev/null
+++ b/gtestsuite/testsuite/level1/copyv/ccopyv_generic.cpp
@@ -0,0 +1,158 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_copyv.h"
+
+class ccopyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: whether x or conj(x) is copied
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   char>> {};  // datatype for the random generators
+
+// Tests ccopyv on randomly generated vectors; the kind of elements is selected by the datatype parameter.
+TEST_P( ccopyvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether vec x is n,c
+    char conjx = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<4>(GetParam());
+
+    // Set the threshold for the errors (the value returned by getEpsilon for T):
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_copyv<T>(conjx, n, incx, incy, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name: <api>_<n>_<conjx>_<incx>_<incy>_<datatype>.
+// All parameters here are chars or integers, so no fp value has to be truncated for the name.
+// Increments that are not > 0 (negative and zero) are printed as "m" followed by their absolute value.
+// If this poses an issue, please reach out.
+class ccopyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        char datatype = std::get<4>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ccopy_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ccopy";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_ccopyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of ccopy: unit strides only.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ccopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::ccopyvGenericTestPrint()
+    );
+
+// Test for non-unit (positive) increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        ccopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::ccopyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (skipped when TEST_BLIS_TYPED is defined).
+// Only test very few cases as sanity check; the combinations mix negative and positive strides.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        ccopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // vector length
+            ::testing::Values(gtint_t(-5), gtint_t(7)),                      // stride size for x
+            ::testing::Values(gtint_t(13), gtint_t(-9)),                      // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements: i = integer, f = float
+        ),
+        ::ccopyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/copyv/copyv.h b/gtestsuite/testsuite/level1/copyv/copyv.h
new file mode 100644
index 0000000000..cc8bf85af0
--- /dev/null
+++ b/gtestsuite/testsuite/level1/copyv/copyv.h
@@ -0,0 +1,112 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             y := x
+ *          or y := conj(x) (BLIS typed interface only)
+ * @param[in] conjx denotes if x or conj(x) will be used for this operation (BLIS API specific)
+ * @param[in] n vector length of x and y
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[out] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ */
+
+template<typename T>
+static void copyv_(gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) {
+    // BLAS wrapper: selects ?copy_ from T at compile time; arguments are passed by address.
+    if constexpr (std::is_same<T, float>::value)
+        scopy_( &n, x, &incx, y, &incy );
+    else if constexpr (std::is_same<T, double>::value)
+        dcopy_( &n, x, &incx, y, &incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        ccopy_( &n, x, &incx, y, &incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zcopy_( &n, x, &incx, y, &incy );
+    else
+        throw std::runtime_error("Error in testsuite/level1/copyv.h: Invalid typename in copyv_().");
+}
+
+template<typename T>
+static void cblas_copyv(gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) {
+    // CBLAS wrapper: selects cblas_?copy from T at compile time; sizes and strides are passed by value.
+    if constexpr (std::is_same<T, float>::value)
+      cblas_scopy( n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+      cblas_dcopy( n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+      cblas_ccopy( n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+      cblas_zcopy( n, x, incx, y, incy );
+    else
+      throw std::runtime_error("Error in testsuite/level1/copyv.h: Invalid typename in cblas_copyv().");
+}
+
+template<typename T>
+static void typed_copyv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy) {
+    // BLIS typed wrapper: selects bli_?copyv from T at compile time and forwards the conj option.
+    conj_t conj_x;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conjx, &conj_x );
+    if constexpr (std::is_same<T, float>::value)
+        bli_scopyv( conj_x, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dcopyv( conj_x, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_ccopyv( conj_x, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zcopyv( conj_x, n, x, incx, y, incy );
+    else
+      throw std::runtime_error("Error in testsuite/level1/copyv.h: Invalid typename in typed_copyv().");
+}
+
+template<typename T>
+static void copyv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+#ifdef TEST_BLAS
+    copyv_<T>(n, x, incx, y, incy);                 // BLAS interface: conjx is not forwarded
+#elif TEST_CBLAS
+    cblas_copyv<T>(n, x, incx, y, incy);            // CBLAS interface: conjx is not forwarded
+#elif TEST_BLIS_TYPED
+    typed_copyv<T>(conjx, n, x, incx, y, incy);     // BLIS typed interface: the only one that uses conjx
+#else
+    throw std::runtime_error("Error in testsuite/level1/copyv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp
new file mode 100644
index 0000000000..b97b992ba3
--- /dev/null
+++ b/gtestsuite/testsuite/level1/copyv/dcopyv_generic.cpp
@@ -0,0 +1,168 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_copyv.h"
+
+class dcopyvGenericTest :  // parameterized fixture for the double copyv tests; tuple fields in order:
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: whether vec x is n,c
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   char>> {};  // datatype for the random generators
+
+// Parameterized test: runs the generic copyv test body on a randomly generated vector.
+TEST_P( dcopyvGenericTest, RandomData )
+{
+    // Element type exercised by this suite.
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   conjx    : denotes whether vec x is n,c
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : specifies the datatype for randomgenerators
+    //----------------------------------------------------------
+    const auto [conjx, n, incx, incy, datatype] = GetParam();
+
+    // Threshold for the errors, as returned by getEpsilon<T>();
+    // it is forwarded unchanged to the generic test body.
+    const double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //   Hand everything over to the generic test body
+    //   (test_copyv, from test_copyv.h).
+    //----------------------------------------------------------
+    test_copyv<T>(conjx, n, incx, incy, thresh, datatype);
+}
+
+// Name generator handed to INSTANTIATE_TEST_SUITE_P. It builds
+//   <api name>_<n>_<conjx>_<incx>_<incy>_<datatype>
+// Increments that are not positive are written as "m" followed by their
+// absolute value, so no '-' sign ever appears in a generated test name.
+class dcopyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        char datatype = std::get<4>(str.param);
+#if defined(TEST_BLAS)       // the prefix is the name of the API under test
+        std::string str_name = "dcopy_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_dcopy";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_dcopyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of dcopyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // incx: stride size for x
+            ::testing::Values(gtint_t(1)),                                   // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::dcopyvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED // BLIS-api specific
+// Test when conjugate of x is used as an argument.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        dcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // conjx = c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(1)),                                   // incx: stride size for x
+            ::testing::Values(gtint_t(1)),                                   // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::dcopyvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        dcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // incx: stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::dcopyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED // compiled only when the BLIS typed API is not the one under test
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        dcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x (c would use conj(x))
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-5), gtint_t(7)),                      // incx: stride size for x (one negative, one positive value)
+            ::testing::Values(gtint_t(13), gtint_t(-9)),                      // incy: stride size for y (one positive, one negative value)
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::dcopyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp
new file mode 100644
index 0000000000..2035f92d60
--- /dev/null
+++ b/gtestsuite/testsuite/level1/copyv/scopyv_generic.cpp
@@ -0,0 +1,168 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_copyv.h"
+
+class scopyvGenericTest :  // parameterized fixture for the float copyv tests; tuple fields in order:
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: whether vec x is n,c
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   char>> {};  // datatype for the random generators
+
+// Parameterized test: runs the generic copyv test body on a randomly generated vector.
+TEST_P( scopyvGenericTest, RandomData )
+{
+    // Element type exercised by this suite.
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   conjx    : denotes whether vec x is n,c
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : specifies the datatype for randomgenerators
+    //----------------------------------------------------------
+    const auto [conjx, n, incx, incy, datatype] = GetParam();
+
+    // Threshold for the errors, as returned by getEpsilon<T>();
+    // it is forwarded unchanged to the generic test body.
+    const double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //   Hand everything over to the generic test body
+    //   (test_copyv, from test_copyv.h).
+    //----------------------------------------------------------
+    test_copyv<T>(conjx, n, incx, incy, thresh, datatype);
+}
+
+// Name generator handed to INSTANTIATE_TEST_SUITE_P. It builds
+//   <api name>_<n>_<conjx>_<incx>_<incy>_<datatype>
+// Increments that are not positive are written as "m" followed by their
+// absolute value, so no '-' sign ever appears in a generated test name.
+class scopyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        char datatype = std::get<4>(str.param);
+#if defined(TEST_BLAS)       // the prefix is the name of the API under test
+        std::string str_name = "scopy_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_scopy";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_scopyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of scopyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        scopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // incx: stride size for x
+            ::testing::Values(gtint_t(1)),                                   // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::scopyvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED // BLIS-api specific
+// Test when conjugate of x is used as an argument.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        scopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // conjx = c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(1)),                                   // incx: stride size for x
+            ::testing::Values(gtint_t(1)),                                   // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::scopyvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        scopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // incx: stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::scopyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED // compiled only when the BLIS typed API is not the one under test
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        scopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x (c would use conj(x))
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-5), gtint_t(7)),                      // incx: stride size for x (one negative, one positive value)
+            ::testing::Values(gtint_t(13), gtint_t(-9)),                      // incy: stride size for y (one positive, one negative value)
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::scopyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/copyv/test_copyv.h b/gtestsuite/testsuite/level1/copyv/test_copyv.h
new file mode 100644
index 0000000000..95f27925e2
--- /dev/null
+++ b/gtestsuite/testsuite/level1/copyv/test_copyv.h
@@ -0,0 +1,72 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "copyv.h"
+#include "level1/ref_copyv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for copyv operation: the reference and the tested
+ *        copyv run on the same x, and the resulting y vectors are compared.
+ */
+template<typename T>
+static void test_copyv( char conjx, gtint_t n, gtint_t incx, gtint_t incy,
+                                             double thresh, char datatype ) {
+    // NOTE: thresh is never read below; computediff is called without it.
+    //----------------------------------------------------------
+    //  Source vector from the random generator; destination
+    //  vector pre-filled with -1.
+    //----------------------------------------------------------
+    std::vector<T> src = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> dst( testinghelpers::buff_dim(n, incy), T{-1} );
+
+    //----------------------------------------------------------
+    //  Reference result, computed into a private copy of the
+    //  still untouched destination vector.
+    //----------------------------------------------------------
+    std::vector<T> dst_ref = dst;
+    testinghelpers::ref_copyv<T>(conjx, n, src.data(), incx, dst_ref.data(), incy);
+
+    //----------------------------------------------------------
+    //  Result of the interface under test.
+    //----------------------------------------------------------
+    copyv<T>(conjx, n, src.data(), incx, dst.data(), incy);
+
+    //----------------------------------------------------------
+    //  Compare the tested result against the reference.
+    //----------------------------------------------------------
+    computediff<T>( n, dst.data(), dst_ref.data(), incy );
+}
diff --git a/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp b/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp
new file mode 100644
index 0000000000..b76b11386e
--- /dev/null
+++ b/gtestsuite/testsuite/level1/copyv/zcopyv_generic.cpp
@@ -0,0 +1,158 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_copyv.h"
+
+class zcopyvGenericTest :  // parameterized fixture for the dcomplex copyv tests; tuple fields in order:
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: whether vec x is n,c
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   char>> {};  // datatype for the random generators
+
+// Parameterized test: runs the generic copyv test body on a randomly generated vector.
+TEST_P( zcopyvGenericTest, RandomData )
+{
+    // Element type exercised by this suite.
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   conjx    : denotes whether vec x is n,c
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : specifies the datatype for randomgenerators
+    //----------------------------------------------------------
+    const auto [conjx, n, incx, incy, datatype] = GetParam();
+
+    // Threshold for the errors, as returned by getEpsilon<T>();
+    // it is forwarded unchanged to the generic test body.
+    const double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //   Hand everything over to the generic test body
+    //   (test_copyv, from test_copyv.h).
+    //----------------------------------------------------------
+    test_copyv<T>(conjx, n, incx, incy, thresh, datatype);
+}
+
+// Name generator handed to INSTANTIATE_TEST_SUITE_P. It builds
+//   <api name>_<n>_<conjx>_<incx>_<incy>_<datatype>
+// Increments that are not positive are written as "m" followed by their
+// absolute value, so no '-' sign ever appears in a generated test name.
+class zcopyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        char datatype = std::get<4>(str.param);
+#if defined(TEST_BLAS)       // the prefix is the name of the API under test
+        std::string str_name = "zcopy_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_zcopy";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_zcopyv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of zcopyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'                                            // conjx: n: use x, c: use conj(x)
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // incx: stride size for x
+            ::testing::Values(gtint_t(1)),                                   // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::zcopyvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        zcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'                                            // conjx: n: use x, c: use conj(x)
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // incx: stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // incy: stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::zcopyvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED // compiled only when the BLIS typed API is not the one under test
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zcopyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // conjx = n: use x (c would use conj(x))
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-5), gtint_t(7)),                      // incx: stride size for x (one negative, one positive value)
+            ::testing::Values(gtint_t(13), gtint_t(-9)),                      // incy: stride size for y (one positive, one negative value)
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::zcopyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp
new file mode 100644
index 0000000000..3584be5f08
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotv/cdotv_generic.cpp
@@ -0,0 +1,174 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotv.h"
+
+class cdotvGenericTest :  // parameterized fixture for the scomplex dotv tests; tuple fields in order:
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: whether vec x is n,c
+                                                   char,       // conjy: whether vec y is n,c
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   char>> {};  // datatype for the random generators
+
+// Parameterized test: runs the generic dotv test body on randomly generated vectors.
+TEST_P( cdotvGenericTest, RandomData )
+{
+    // Element type exercised by this suite.
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   conjx    : denotes whether vec x is n,c
+    //   conjy    : denotes whether vec y is n,c
+    //   n        : vector length
+    //   incx     : stride size for x
+    //   incy     : stride size for y
+    //   datatype : specifies the datatype for randomgenerators
+    //----------------------------------------------------------
+    const auto [conjx, conjy, n, incx, incy, datatype] = GetParam();
+
+    // Threshold for the errors: 2*n times the value returned by
+    // getEpsilon<T>(), so it grows linearly with the vector length.
+    // It is forwarded unchanged to the generic test body.
+    const double thresh = 2*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //   Hand everything over to the generic test body test_dotv.
+    //----------------------------------------------------------
+    test_dotv<T>(conjx, conjy, n, incx, incy, thresh, datatype);
+}
+
+// Name generator handed to INSTANTIATE_TEST_SUITE_P. It builds
+//   <api name>_<n>_<conjx>_<conjy>_<incx>_<incy>_<datatype>
+// Increments that are not positive are written as "m" followed by their
+// absolute value, so no '-' sign ever appears in a generated test name.
+class cdotvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        char conjy    = std::get<1>(str.param);
+        gtint_t n     = std::get<2>(str.param);
+        gtint_t incx  = std::get<3>(str.param);
+        gtint_t incy  = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+#if defined(TEST_BLAS)       // the prefix is the name of the API under test
+        std::string str_name = "cdotu_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_cdotu_sub";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_cdotv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);
+        str_name += "_" + std::string(&conjy, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of cdotv (unit strides, n = 10, 20, ..., 100).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use y, c: use conj(y)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::cdotvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        cdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(11)),                                  // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::cdotvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED // this instantiation is compiled out for the BLIS typed API
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        cdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Values('n'),                                          // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-2)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::cdotvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp
new file mode 100644
index 0000000000..250144e3f0
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotv/ddotv_generic.cpp
@@ -0,0 +1,177 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotv.h"
+
+class ddotvGenericTest : // parameterized fixture; the tuple slots are read in order by the TEST_P body
+        public ::testing::TestWithParam<std::tuple<char,       // conjx
+                                                   char,       // conjy
+                                                   gtint_t,    // n (vector length)
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   char>> {};  // datatype for the random generators
+
+// Tests ddotv on randomly generated vectors; the kind of random data is selected by the datatype parameter.
+TEST_P( ddotvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether vec x is n,c
+    char conjx = std::get<0>(GetParam());
+    // denotes whether vec y is n,c
+    char conjy = std::get<1>(GetParam());
+    // vector length:
+    gtint_t n = std::get<2>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<3>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<4>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (scales linearly with the vector length n):
+    double thresh = n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_dotv<T>(conjx, conjy, n, incx, incy, thresh, datatype);
+}
+
+// Name generator for the parameterized ddotv tests (used by INSTANTIATE_TEST_SUITE_P below).
+// Builds "<api>_<n>_<conjx>_<conjy>_<incx>_<incy>_<datatype>" from the parameter tuple.
+// Non-positive increments are printed as "m" + abs(inc) (e.g. -2 becomes "m2"), so a
+// '-' character never appears in the generated test name.
+class ddotvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        char conjy    = std::get<1>(str.param);
+        gtint_t n     = std::get<2>(str.param);
+        gtint_t incx  = std::get<3>(str.param);
+        gtint_t incy  = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ddot_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_ddot";
+#else  // TEST_BLIS_TYPED (also the fallback when no interface macro is defined)
+        std::string str_name = "bli_ddotv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);    // one-character string holding conjx
+        str_name += "_" + std::string(&conjy, 1);    // one-character string holding conjy
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of ddotv (unit strides, n = 10, 20, ..., 100).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ddotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::ddotvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED // BLIS-api specific
+// Test when the conjugate flag 'c' is passed for both x and y.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        ddotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values('c'),                                          // c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::ddotvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        ddotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // use y, not conj(y) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::ddotvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED // this instantiation is compiled out for the BLIS typed API
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        ddotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Values('n'),                                          // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-2)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::ddotvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotv/dotv.h b/gtestsuite/testsuite/level1/dotv/dotv.h
new file mode 100644
index 0000000000..dad9802345
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotv/dotv.h
@@ -0,0 +1,117 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             rho := x^T * y
+ *          or rho := conjx(x)^T * conjy(y) (BLIS_TYPED only)
+ * @param[in] conjx denotes if x or conj(x) will be used for this operation (BLIS API specific)
+ * @param[in] conjy denotes if y or conj(y) will be used for this operation (BLIS API specific)
+ * @param[in] n vector length of x and y
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[in] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ * @param[out] rho pointer to the scalar that receives the result
+ */
+
+template<typename T>
+static void dotv_(gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy, T* rho) {
+  // BLAS (Fortran-style) wrapper: selects the routine for T at compile time; n and the increments are passed by address.
+  if constexpr (std::is_same<T, float>::value)
+    *rho = sdot_( &n, x, &incx, y, &incy );
+  else if constexpr (std::is_same<T, double>::value)
+    *rho = ddot_( &n, x, &incx, y, &incy );
+  else if constexpr (std::is_same<T, scomplex>::value)
+    *rho = cdotu_( &n, x, &incx, y, &incy );   // complex types go through the dotu routines
+  else if constexpr (std::is_same<T, dcomplex>::value)
+    *rho = zdotu_( &n, x, &incx, y, &incy );
+  else
+    throw std::runtime_error("Error in testsuite/level1/dotv.h: Invalid typename in dotv_().");
+}
+
+template<typename T>
+static void cblas_dotv(gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy, T* rho) {
+  // CBLAS wrapper: the real routines return the result, while the complex *_sub routines store it through rho.
+  if constexpr (std::is_same<T, float>::value)
+    *rho = cblas_sdot( n, x, incx, y, incy );
+  else if constexpr (std::is_same<T, double>::value)
+    *rho = cblas_ddot( n, x, incx, y, incy );
+  else if constexpr (std::is_same<T, scomplex>::value)
+    cblas_cdotu_sub( n, x, incx, y, incy, rho );
+  else if constexpr (std::is_same<T, dcomplex>::value)
+    cblas_zdotu_sub( n, x, incx, y, incy, rho );
+  else
+    throw std::runtime_error("Error in testsuite/level1/dotv.h: Invalid typename in cblas_dotv().");
+}
+
+template<typename T>
+static void typed_dotv(char conj_x, char conj_y, gtint_t n,
+  T* x, gtint_t incx, T* y, gtint_t incy, T* rho) {
+  // BLIS typed-API wrapper: converts the conj characters to conj_t values, then calls the bli_?dotv routine for T.
+  conj_t conjx, conjy;
+  // Map parameter characters to BLIS constants.
+  testinghelpers::char_to_blis_conj( conj_x, &conjx );
+  testinghelpers::char_to_blis_conj( conj_y, &conjy );
+  if constexpr (std::is_same<T, float>::value)
+    bli_sdotv( conjx, conjy, n, x, incx, y, incy, rho );
+  else if constexpr (std::is_same<T, double>::value)
+    bli_ddotv( conjx, conjy, n, x, incx, y, incy, rho );
+  else if constexpr (std::is_same<T, scomplex>::value)
+    bli_cdotv( conjx, conjy, n, x, incx, y, incy, rho );
+  else if constexpr (std::is_same<T, dcomplex>::value)
+    bli_zdotv( conjx, conjy, n, x, incx, y, incy, rho );
+  else
+    throw std::runtime_error("Error in testsuite/level1/dotv.h: Invalid typename in typed_dotv().");
+}
+
+template<typename T>
+static void dotv(char conjx, char conjy, gtint_t n,
+  T* x, gtint_t incx, T* y, gtint_t incy, T* rho)
+{   // Entry point used by the tests: forwards to the wrapper of the interface selected at compile time.
+#ifdef TEST_BLAS                   // BLAS interface; conjx/conjy are not forwarded
+    dotv_<T>(n, x, incx, y, incy, rho);
+#elif defined(TEST_CBLAS)          // CBLAS interface; conjx/conjy are not forwarded
+    cblas_dotv<T>(n, x, incx, y, incy, rho);
+#elif defined(TEST_BLIS_TYPED)     // BLIS typed API; conjx/conjy are forwarded
+    typed_dotv<T>(conjx, conjy, n, x, incx, y, incy, rho);
+#else                              // no interface macro defined at build time
+    throw std::runtime_error("Error in testsuite/level1/dotv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp
new file mode 100644
index 0000000000..ce57c4f59b
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotv/sdotv_generic.cpp
@@ -0,0 +1,177 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotv.h"
+
+class sdotvGenericTest : // parameterized fixture; the tuple slots are read in order by the TEST_P body
+        public ::testing::TestWithParam<std::tuple<char,       // conjx
+                                                   char,       // conjy
+                                                   gtint_t,    // n (vector length)
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   char>> {};  // datatype for the random generators
+
+// Tests sdotv on randomly generated vectors; the kind of random data is selected by the datatype parameter.
+TEST_P( sdotvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether vec x is n,c
+    char conjx = std::get<0>(GetParam());
+    // denotes whether vec y is n,c
+    char conjy = std::get<1>(GetParam());
+    // vector length:
+    gtint_t n = std::get<2>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<3>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<4>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (scales linearly with the vector length n):
+    double thresh = n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_dotv<T>(conjx, conjy, n, incx, incy, thresh, datatype);
+}
+
+// Name generator for the parameterized sdotv tests (used by INSTANTIATE_TEST_SUITE_P below).
+// Builds "<api>_<n>_<conjx>_<conjy>_<incx>_<incy>_<datatype>" from the parameter tuple.
+// Non-positive increments are printed as "m" + abs(inc) (e.g. -2 becomes "m2"), so a
+// '-' character never appears in the generated test name.
+class sdotvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        char conjy    = std::get<1>(str.param);
+        gtint_t n     = std::get<2>(str.param);
+        gtint_t incx  = std::get<3>(str.param);
+        gtint_t incy  = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "sdot_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_sdot";
+#else  // TEST_BLIS_TYPED (also the fallback when no interface macro is defined)
+        std::string str_name = "bli_sdotv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);    // one-character string holding conjx
+        str_name += "_" + std::string(&conjy, 1);    // one-character string holding conjy
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of sdotv (unit strides, n = 10, 20, ..., 100).
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::sdotvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED // BLIS-api specific
+// Test when the conjugate flag 'c' is passed for both x and y.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        sdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values('c'),                                          // c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::sdotvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        sdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::sdotvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED // this instantiation is compiled out for the BLIS typed API
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        sdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Values('n'),                                          // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-2)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype tested
+        ),
+        ::sdotvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotv/test_dotv.h b/gtestsuite/testsuite/level1/dotv/test_dotv.h
new file mode 100644
index 0000000000..1faf3120a2
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotv/test_dotv.h
@@ -0,0 +1,78 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "dotv.h"
+#include "level1/ref_dotv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for dotv: builds random x and y, computes rho with the reference and with the interface under test, then compares the two against thresh.
+ */
+
+template<typename T>
+static void test_dotv( char conjx, char conjy, gtint_t n, gtint_t incx,
+    gtint_t incy, double thresh, char datatype )
+{
+    // conjx/conjy: conjugation flags, n: vector length, incx/incy: strides of x and y,
+    // thresh: tolerance handed to computediff, datatype: forwarded to the random vector generator.
+    //----------------------------------------------------------
+    //  Initialize vectors with random numbers (-10, 10: presumably the value range; TODO confirm).
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    // Create a copy of y so that we can check reference results.
+    std::vector<T> y_ref(y);
+    T rho_ref;
+    if constexpr (testinghelpers::type_info<T>::is_real)   // real types: reference overload without conj flags
+        testinghelpers::ref_dotv<T>( n, x.data(), incx, y_ref.data(), incy, &rho_ref );
+    else                                                   // otherwise: reference overload taking conjx/conjy
+        testinghelpers::ref_dotv<T>(conjx, conjy, n, x.data(), incx, y_ref.data(), incy, &rho_ref);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    T rho;
+    dotv<T>(conjx, conjy, n, x.data(), incx, y.data(), incy, &rho);
+
+    //----------------------------------------------------------
+    //              Compute error.
+    //----------------------------------------------------------
+    computediff<T>( rho, rho_ref, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp b/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp
new file mode 100644
index 0000000000..4b0f3fbcdb
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotv/zdotv_generic.cpp
@@ -0,0 +1,174 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotv.h"
+
+class zdotvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conjx
+                                                   char,       // conjy
+                                                   gtint_t,    // n
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   char>> {};  // datatype
+
+// Tests using random values as vector elements; the datatype parameter is passed on to the generators.
+TEST_P( zdotvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // conjugation flag for x: 'n' uses x, 'c' uses conj(x)
+    char conjx = std::get<0>(GetParam());
+    // conjugation flag for y: 'n' uses y, 'c' uses conj(y)
+    char conjy = std::get<1>(GetParam());
+    // vector length:
+    gtint_t n = std::get<2>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<3>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<4>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors; it is proportional to the vector length n.
+    double thresh = 2*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_dotv<T>(conjx, conjy, n, incx, incy, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name.
+// The name is built as <api>_<n>_<conjx>_<conjy>_<incx>_<incy>_<datatype>,
+// where <api> depends on the interface selected at compile time and a
+// non-positive increment is written as "m" followed by its absolute value.
+class zdotvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conjx    = std::get<0>(str.param);
+        char conjy    = std::get<1>(str.param);
+        gtint_t n     = std::get<2>(str.param);
+        gtint_t incx  = std::get<3>(str.param);
+        gtint_t incy  = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zdotu_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zdotu_sub";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_zdotv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);   // one-character string holding the conj flag
+        str_name += "_" + std::string(&conjy, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of zdotv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+             ),                                                              // n: use y, c: use conj(y)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zdotvGenericTestPrint()
+    );
+
+// Test for non-unit (positive) increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        zdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use x, c: use conj(x)
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zdotvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED  // this instantiation is compiled out when TEST_BLIS_TYPED is defined
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zdotvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, c: use conj(x)
+            ::testing::Values('n'),                                          // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(-2)),                                  // stride size for x
+            ::testing::Values(gtint_t(-3)),                                  // stride size for y
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zdotvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp
new file mode 100644
index 0000000000..17377a7f0c
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotxv/cdotxv_generic.cpp
@@ -0,0 +1,166 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotxv.h"
+
+class cdotxvGenericTest : // tuple fields: n, conjx, conjy, incx, incy, alpha, beta, datatype
+        public ::testing::TestWithParam<std::tuple<gtint_t, char, char, gtint_t, gtint_t, scomplex, scomplex, char>> {};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cdotxvGenericTest); // the instantiations in this file are guarded by TEST_BLIS_TYPED
+
+// Tests using random values as vector elements; the datatype parameter is passed on to the generators.
+TEST_P( cdotxvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // conjugation flag for x: 'n' uses x, 'c' uses conj(x)
+    char conj_x = std::get<1>(GetParam());
+    // conjugation flag for y: 'n' uses y, 'c' uses conj(y)
+    char conj_y = std::get<2>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<3>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<4>(GetParam());
+    // scalar alpha
+    T alpha = std::get<5>(GetParam());
+    // scalar beta
+    T beta  = std::get<6>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype = std::get<7>(GetParam());
+
+    // Set the threshold for the errors; it is proportional to the vector length n.
+    double thresh = n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_dotxv<T>(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so the real and
+// imaginary parts of alpha and beta are printed as int(part), joined by "pi".
+// Any non-positive increment or part is written as "m" followed by its absolute value.
+class cdotxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,char,char,gtint_t,gtint_t,scomplex,scomplex,char>> str) const {
+        gtint_t n      = std::get<0>(str.param);
+        char conjx     = std::get<1>(str.param);
+        char conjy     = std::get<2>(str.param);
+        gtint_t incx   = std::get<3>(str.param);
+        gtint_t incy   = std::get<4>(str.param);
+        scomplex alpha = std::get<5>(str.param);
+        scomplex beta  = std::get<6>(str.param);
+        char datatype  = std::get<7>(str.param);
+        std::string str_name = "bli_cdotxv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);   // one-character string holding the conj flag
+        str_name += "_" + std::string(&conjy, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of cdotxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Values('n', 'c'),                                     // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(scomplex{1.0, -1.0}),                          // alpha
+            ::testing::Values(scomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::cdotxvGenericTestPrint()
+    );
+
+// Black box testing of cdotxv for small vector sizes.
+INSTANTIATE_TEST_SUITE_P(
+        SmallSizesBlackbox,
+        cdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(1), gtint_t(11), 1),                    // n: size of vector takes values from 1 to 10 with step size of 1.
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Values('n', 'c'),                                     // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(scomplex{1.0, -1.0}),                          // alpha
+            ::testing::Values(scomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::cdotxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        cdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Values('n', 'c'),                                     // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(scomplex{1.0, -1.0}),                          // alpha
+            ::testing::Values(scomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::cdotxvGenericTestPrint()
+    );
+
+#endif
diff --git a/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp
new file mode 100644
index 0000000000..8cd33a861e
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotxv/ddotxv_generic.cpp
@@ -0,0 +1,165 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotxv.h"
+
+class ddotxvGenericTest : // tuple fields: n, conjx, conjy, incx, incy, alpha, beta, datatype
+        public ::testing::TestWithParam<std::tuple<gtint_t, char, char, gtint_t, gtint_t, double, double, char>> {};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ddotxvGenericTest); // the instantiations in this file are guarded by TEST_BLIS_TYPED
+
+// Tests using random values as vector elements; the datatype parameter is passed on to the generators.
+TEST_P( ddotxvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // conjugation flag for x ('n' or 'c')
+    char conj_x = std::get<1>(GetParam());
+    // conjugation flag for y ('n' or 'c')
+    char conj_y = std::get<2>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<3>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<4>(GetParam());
+    // scalar alpha
+    T alpha = std::get<5>(GetParam());
+    // scalar beta
+    T beta  = std::get<6>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype = std::get<7>(GetParam());
+
+    // Set the threshold for the errors; it is proportional to the vector length n.
+    double thresh = n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_dotxv<T>(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names,
+// so alpha and beta are printed as int(value), e.g. int(2.3).
+// Any non-positive increment or scalar is written as "m" followed by its absolute value.
+class ddotxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,char,char,gtint_t,gtint_t,double,double,char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        char conjx    = std::get<1>(str.param);
+        char conjy    = std::get<2>(str.param);
+        gtint_t incx  = std::get<3>(str.param);
+        gtint_t incy  = std::get<4>(str.param);
+        double alpha  = std::get<5>(str.param);
+        double beta   = std::get<6>(str.param);
+        char datatype = std::get<7>(str.param);
+        std::string str_name = "bli_ddotxv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conjx, 1);   // one-character string holding the conj flag
+        str_name += "_" + std::string(&conjy, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of ddotxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ddotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(1.0, 2.0),                                     // alpha
+            ::testing::Values(2.0, 3.0),                                     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ddotxvGenericTestPrint()
+    );
+
+// Test when conjugation is requested for both x and y.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        ddotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values('c'),                                          // c: request conj(x)
+            ::testing::Values('c'),                                          // c: request conj(y)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(1.0, 2.0),                                     // alpha
+            ::testing::Values(2.0, 3.0),                                     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ddotxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        ddotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(1.0, 2.0),                                     // alpha
+            ::testing::Values(2.0, 3.0),                                     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ddotxvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotxv/dotxv.h b/gtestsuite/testsuite/level1/dotxv/dotxv.h
new file mode 100644
index 0000000000..3bb01ad0a0
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotxv/dotxv.h
@@ -0,0 +1,88 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             rho := beta * rho + alpha * conjx(x)^T * conjy(y), BLIS_TYPED only
+ * @param[in] conj_x denotes if x or conj(x) will be used for this operation
+ * @param[in] conj_y denotes if y or conj(y) will be used for this operation
+ * @param[in] n vector length of x and y
+ * @param[in] alpha pointer to the scalar alpha
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[in] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ * @param[in] beta pointer to the scalar beta
+ * @param[in,out] rho pointer to the scalar rho, which is updated by the operation
+ * @throws std::runtime_error if T is not float, double, scomplex or dcomplex
+ */
+template<typename T>
+static void typed_dotxv( char conj_x, char conj_y, gtint_t n, T* alpha,
+    T* x, gtint_t incx, T* y, gtint_t incy, T* beta, T* rho )
+{
+    conj_t conjx, conjy;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conj_x, &conjx );
+    testinghelpers::char_to_blis_conj( conj_y, &conjy );
+    if constexpr (std::is_same<T, float>::value)          // pick the typed BLIS routine matching T
+        bli_sdotxv( conjx, conjy, n, alpha, x, incx, y, incy, beta, rho );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_ddotxv( conjx, conjy, n, alpha, x, incx, y, incy, beta, rho );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cdotxv( conjx, conjy, n, alpha, x, incx, y, incy, beta, rho );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zdotxv( conjx, conjy, n, alpha, x, incx, y, incy, beta, rho );
+    else                                                  // any other T is reported under this file's and function's own name
+        throw std::runtime_error("Error in testsuite/level1/dotxv.h: Invalid typename in typed_dotxv().");
+}
+// Entry point for dotxv: forwards to the interface selected at compile time, or throws if none is usable.
+template<typename T>
+static void dotxv( char conjx, char conjy, gtint_t n, T* alpha,
+    T* x, gtint_t incx, T* y, gtint_t incy, T* beta, T* rho )
+{
+#ifdef TEST_BLAS
+    throw std::runtime_error("Error in testsuite/level1/dotxv.h: BLAS interface is not available.");
+#elif TEST_CBLAS
+    throw std::runtime_error("Error in testsuite/level1/dotxv.h: CBLAS interface is not available.");
+#elif TEST_BLIS_TYPED  // the only branch that performs the operation
+   typed_dotxv<T>( conjx, conjy, n, alpha, x, incx, y, incy, beta, rho );
+#else
+    throw std::runtime_error("Error in testsuite/level1/dotxv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp
new file mode 100644
index 0000000000..ea0ad22b6b
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotxv/sdotxv_generic.cpp
@@ -0,0 +1,165 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotxv.h"
+
+class sdotxvGenericTest :
+        public ::testing::TestWithParam<std::tuple<gtint_t, char, char, gtint_t, gtint_t, float, float, char>> {};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sdotxvGenericTest);
+
+// Runs the generic dotxv test body on randomly generated vectors.
+TEST_P( sdotxvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Unpack the parameters supplied by INSTANTIATE_TEST_SUITE_P:
+    //   n        - vector length
+    //   conj_x   - whether vec x is n,c
+    //   conj_y   - whether vec y is n,c
+    //   incx     - stride size for x
+    //   incy     - stride size for y
+    //   alpha    - scalar alpha
+    //   beta     - scalar beta
+    //   datatype - datatype for the random generators
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    const gtint_t n        = std::get<0>(params);
+    const char    conj_x   = std::get<1>(params);
+    const char    conj_y   = std::get<2>(params);
+    const gtint_t incx     = std::get<3>(params);
+    const gtint_t incy     = std::get<4>(params);
+    const T       alpha    = std::get<5>(params);
+    const T       beta     = std::get<6>(params);
+    const char    datatype = std::get<7>(params);
+
+    // The error threshold scales with the vector length.
+    const double thresh = n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the generic test body with these parameters.
+    //----------------------------------------------------------
+    test_dotxv<T>(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype);
+}
+
+// Generates a readable name for each test case from its parameters.
+// Floating-point values (e.g., 2.3) cannot be used in the names, so
+// scalars are printed truncated, i.e. as int(2.3). Values that are not
+// positive are printed as "m" followed by their magnitude.
+class sdotxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,char,char,gtint_t,gtint_t,float,float,char>> str) const {
+        const auto& p = str.param;
+        const gtint_t n        = std::get<0>(p);
+        const char    conjx    = std::get<1>(p);
+        const char    conjy    = std::get<2>(p);
+        const gtint_t incx     = std::get<3>(p);
+        const gtint_t incy     = std::get<4>(p);
+        const float   alpha    = std::get<5>(p);
+        const float   beta     = std::get<6>(p);
+        const char    datatype = std::get<7>(p);
+        // Name layout: bli_sdotxv_<n>_<conjx>_<conjy>_<incx>_<incy>_a<alpha>_b<beta>_<datatype>
+        std::string name = "bli_sdotxv";
+        name += "_" + std::to_string(n);
+        name += "_" + std::string(1, conjx);
+        name += "_" + std::string(1, conjy);
+        // Strides.
+        name += "_" + (( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        name += "_" + (( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)));
+        // Scalars, truncated to their integer part.
+        name += "_a" + (( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))));
+        name += "_b" + (( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))));
+        name += std::string("_") + datatype;
+        return name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of sdotxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(1.0, 2.0),                                     // alpha
+            ::testing::Values(2.0, 3.0),                                     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::sdotxvGenericTestPrint()
+    );
+
+// Test when the conjugates of both x and y are passed as arguments.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        sdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values('c'),                                          // c: use conj(y)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(1.0, 2.0),                                     // alpha
+            ::testing::Values(2.0, 3.0),                                     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::sdotxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        sdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values('n'),                                          // n: use y, not conj(y) (since it is real)
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(1.0, 2.0),                                     // alpha
+            ::testing::Values(2.0, 3.0),                                     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::sdotxvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/dotxv/test_dotxv.h b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h
new file mode 100644
index 0000000000..6d0f74d5f0
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotxv/test_dotxv.h
@@ -0,0 +1,75 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "dotxv.h"
+#include "level1/ref_dotxv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for dotxv operation: fills x and y with random
+ *        data, computes rho with both the reference implementation and the
+ *        tested interface, and compares the two results against thresh.
+ */
+
+template<typename T>
+static void test_dotxv( gtint_t n, char conjx, char conjy, T alpha,
+  gtint_t incx, gtint_t incy, T beta, double thresh, char datatype )
+{
+    //----------------------------------------------------------
+    //        Initialize vectors with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    T rho_ref;
+    testinghelpers::initone(rho_ref);
+    testinghelpers::ref_dotxv<T>(conjx, conjy, n, alpha, x.data(), incx, y.data(), incy, beta, &rho_ref);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    T rho;
+    testinghelpers::initone(rho);
+    dotxv(conjx, conjy, n, &alpha, x.data(), incx, y.data(), incy, &beta, &rho);
+
+    //----------------------------------------------------------
+    //              Compute error.
+    //----------------------------------------------------------
+    computediff<T>( rho, rho_ref, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp b/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp
new file mode 100644
index 0000000000..829532afde
--- /dev/null
+++ b/gtestsuite/testsuite/level1/dotxv/zdotxv_generic.cpp
@@ -0,0 +1,148 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_dotxv.h"
+
+class zdotxvGenericTest :
+        public ::testing::TestWithParam<std::tuple<gtint_t, char, char, gtint_t, gtint_t, dcomplex, dcomplex, char>> {};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zdotxvGenericTest);
+
+// Runs the generic dotxv test body on randomly generated vectors.
+TEST_P( zdotxvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Unpack the parameters supplied by INSTANTIATE_TEST_SUITE_P:
+    //   n        - vector length
+    //   conj_x   - whether vec x is n,c
+    //   conj_y   - whether vec y is n,c
+    //   incx     - stride size for x
+    //   incy     - stride size for y
+    //   alpha    - scalar alpha
+    //   beta     - scalar beta
+    //   datatype - datatype for the random generators
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    const gtint_t n        = std::get<0>(params);
+    const char    conj_x   = std::get<1>(params);
+    const char    conj_y   = std::get<2>(params);
+    const gtint_t incx     = std::get<3>(params);
+    const gtint_t incy     = std::get<4>(params);
+    const T       alpha    = std::get<5>(params);
+    const T       beta     = std::get<6>(params);
+    const char    datatype = std::get<7>(params);
+
+    // The error threshold scales with the vector length.
+    const double thresh = n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the generic test body with these parameters.
+    //----------------------------------------------------------
+    test_dotxv<T>(n, conj_x, conj_y, alpha, incx, incy, beta, thresh, datatype);
+}
+
+// Generates a readable name for each test case from its parameters.
+// Floating-point values (e.g., 2.3) cannot be used in the names, so
+// scalars are printed truncated, i.e. as int(2.3). Values that are not
+// positive are printed as "m" followed by their magnitude.
+class zdotxvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t,char,char,gtint_t,gtint_t,dcomplex,dcomplex,char>> str) const {
+        const auto& p = str.param;
+        const gtint_t  n        = std::get<0>(p);
+        const char     conjx    = std::get<1>(p);
+        const char     conjy    = std::get<2>(p);
+        const gtint_t  incx     = std::get<3>(p);
+        const gtint_t  incy     = std::get<4>(p);
+        const dcomplex alpha    = std::get<5>(p);
+        const dcomplex beta     = std::get<6>(p);
+        const char     datatype = std::get<7>(p);
+        // Name layout: bli_zdotxv_<n>_<conjx>_<conjy>_<incx>_<incy>_a<alpha>_b<beta>_<datatype>
+        std::string name = "bli_zdotxv";
+        name += "_" + std::to_string(n);
+        name += "_" + std::string(1, conjx);
+        name += "_" + std::string(1, conjy);
+        // Strides.
+        name += "_" + (( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        name += "_" + (( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)));
+        // Complex scalars are printed as <real>pi<imag>, each part truncated to an integer.
+        name += "_a" + (( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))));
+        name += "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        name += "_b" + (( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))));
+        name += "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        name += std::string("_") + datatype;
+        return name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of zdotxv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Values('n', 'c'),                                     // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(dcomplex{1.0, -1.0}),                          // alpha
+            ::testing::Values(dcomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::zdotxvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        zdotxvGenericTest,
+        ::testing::Combine(
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Values('n', 'c'),                                     // n: use y, c: use conj(y)
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      // stride size for y
+            ::testing::Values(dcomplex{1.0, -1.0}),                          // alpha
+            ::testing::Values(dcomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::zdotxvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp
new file mode 100644
index 0000000000..d25419606f
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scal2v/cscal2v_generic.cpp
@@ -0,0 +1,138 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scal2v.h"
+
+class cscal2vGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   scomplex,
+                                                   char>> {};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cscal2vGenericTest);
+
+// Runs the generic scal2v test body on randomly generated vectors.
+TEST_P( cscal2vGenericTest, RandomData )
+{
+  using T = scomplex;
+  //----------------------------------------------------------
+  // Unpack the parameters supplied by INSTANTIATE_TEST_SUITE_P:
+  //   conj_alpha - conjugation option ('n' or 'c')
+  //   n          - vector length
+  //   incx, incy - stride sizes for x and y
+  //   alpha      - scalar alpha
+  //   datatype   - datatype for the random generators
+  //----------------------------------------------------------
+  const auto& params = GetParam();
+  char    conj_alpha = std::get<0>(params);
+  gtint_t n          = std::get<1>(params);
+  gtint_t incx       = std::get<2>(params);
+  gtint_t incy       = std::get<3>(params);
+  T       alpha      = std::get<4>(params);
+  char    datatype   = std::get<5>(params);
+
+  // The error threshold is the value returned by getEpsilon<T>().
+  double thresh = testinghelpers::getEpsilon<T>();
+
+  //----------------------------------------------------------
+  //     Run the generic test body with these parameters.
+  //----------------------------------------------------------
+  test_scal2v<T>(conj_alpha, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Generates a readable name for each test case from its parameters.
+// Floating-point values (e.g., 2.3) cannot be used in the names, so
+// scalars are printed truncated, i.e. as int(2.3). Values that are not
+// positive are printed as "m" followed by their magnitude.
+class cscal2vGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, gtint_t, scomplex, char>> str) const {
+        const auto& p = str.param;
+        const char     conj     = std::get<0>(p);
+        const gtint_t  n        = std::get<1>(p);
+        const gtint_t  incx     = std::get<2>(p);
+        const gtint_t  incy     = std::get<3>(p);
+        const scomplex alpha    = std::get<4>(p);
+        const char     datatype = std::get<5>(p);
+        // Name layout: bli_cscal2v_<n>_<conj>_<incx>_<incy>_a<alpha>_<datatype>
+        std::string name = "bli_cscal2v";
+        name += "_" + std::to_string(n);
+        name += "_" + std::string(1, conj);
+        name += "_" + (( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        name += "_" + (( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)));
+        // The complex scalar is printed as <real>pi<imag>, each part truncated to an integer.
+        name += "_a" + (( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))));
+        name += "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        name += std::string("_") + datatype;
+        return name;
+    }
+};
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of cscal2v.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::cscal2vGenericTestPrint()
+    );
+
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        cscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(4)),                                   // stride size for y
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::cscal2vGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp
new file mode 100644
index 0000000000..396bf99ba1
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scal2v/dscal2v_generic.cpp
@@ -0,0 +1,153 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scal2v.h"
+
+class dscal2vGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   double,
+                                                   char>> {};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dscal2vGenericTest);
+
+// Runs the generic scal2v test body on randomly generated vectors.
+TEST_P( dscal2vGenericTest, RandomData )
+{
+  using T = double;
+  //----------------------------------------------------------
+  // Initialize values from the parameters passed through
+  // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+  //----------------------------------------------------------
+  // conjugation option ('n' or 'c'):
+  char conj_alpha = std::get<0>(GetParam());
+  // vector length:
+  gtint_t n = std::get<1>(GetParam());
+  // stride size for x:
+  gtint_t incx = std::get<2>(GetParam());
+  // stride size for y:
+  gtint_t incy = std::get<3>(GetParam());
+  // alpha
+  T alpha = std::get<4>(GetParam());
+  // specifies the datatype for randomgenerators
+  char datatype = std::get<5>(GetParam());
+
+  // Set the threshold for the errors (double, so T's epsilon is not narrowed to float):
+  double thresh = testinghelpers::getEpsilon<T>();
+  //----------------------------------------------------------
+  //     Call generic test body using those parameters
+  //----------------------------------------------------------
+  test_scal2v<T>(conj_alpha, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Generates a readable name for each test case from its parameters.
+// Floating-point values (e.g., 2.3) cannot be used in the names, so
+// scalars are printed truncated, i.e. as int(2.3). Values that are not
+// positive are printed as "m" followed by their magnitude.
+class dscal2vGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, gtint_t, double, char>> str) const {
+        const auto& p = str.param;
+        const char    conj     = std::get<0>(p);
+        const gtint_t n        = std::get<1>(p);
+        const gtint_t incx     = std::get<2>(p);
+        const gtint_t incy     = std::get<3>(p);
+        const double  alpha    = std::get<4>(p);
+        const char    datatype = std::get<5>(p);
+        // Name layout: bli_dscal2v_<n>_<conj>_<incx>_<incy>_a<alpha>_<datatype>
+        std::string name = "bli_dscal2v";
+        name += "_" + std::to_string(n);
+        name += "_" + std::string(1, conj);
+        name += "_" + (( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        name += "_" + (( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)));
+        // The scalar is truncated to its integer part.
+        name += "_a" + (( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))));
+        name += std::string("_") + datatype;
+        return name;
+    }
+};
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of dscal2v.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(2.0), double(-3.0)),                    // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::dscal2vGenericTestPrint()
+    );
+
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        Conjalpha,
+        dscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(-3.0)),                                 // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::dscal2vGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        dscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: size of vector
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(5)),                                   // stride size for y
+            ::testing::Values(double(3.0)),                                  // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float; datatype of the random vector elements
+        ),
+        ::dscal2vGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scal2v/scal2v.h b/gtestsuite/testsuite/level1/scal2v/scal2v.h
new file mode 100644
index 0000000000..ad1383b712
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scal2v/scal2v.h
@@ -0,0 +1,83 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Wrapper around the BLIS typed API for scal2v, which performs
+ *             y := alpha * conjx(x) (for BLIS interface only)
+ * @param[in] conj_x character selecting whether x or conj(x) is used
+ * @param[in] n vector length of x and y
+ * @param[in] alpha scalar
+ * @param[in] x pointer to the first element of x
+ * @param[in] incx increment of x
+ * @param[in,out] y pointer to the first element of y
+ * @param[in] incy increment of y
+ */
+
+template<typename T>
+static void typed_scal2v(char conj_x, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+    // Convert the character option into the matching BLIS conj_t constant.
+    conj_t blis_conjx;
+    testinghelpers::char_to_blis_conj( conj_x, &blis_conjx );
+    if constexpr (std::is_same_v<T, float>)
+        bli_sscal2v( blis_conjx, n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same_v<T, double>)
+        bli_dscal2v( blis_conjx, n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        bli_cscal2v( blis_conjx, n, &alpha, x, incx, y, incy );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        bli_zscal2v( blis_conjx, n, &alpha, x, incx, y, incy );
+    else
+        throw std::runtime_error("Error in testsuite/level1/scal2v.h: Invalid typename in typed_scal2v().");
+}
+
+// Compile-time interface dispatch for scal2v: only the BLIS typed API is forwarded, every other build throws.
+template<typename T>
+static void scal2v(char conjx, gtint_t n, T alpha, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+#ifdef TEST_BLAS
+    throw std::runtime_error("Error in testsuite/level1/scal2v.h: BLAS interface is not available.");
+#elif TEST_CBLAS
+    throw std::runtime_error("Error in testsuite/level1/scal2v.h: CBLAS interface is not available.");
+#elif TEST_BLIS_TYPED
+    typed_scal2v<T>( conjx, n, alpha, x, incx, y, incy );
+#else
+    throw std::runtime_error("Error in testsuite/level1/scal2v.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp
new file mode 100644
index 0000000000..ef02a4c225
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scal2v/sscal2v_generic.cpp
@@ -0,0 +1,153 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scal2v.h"
+
+class sscal2vGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: use x or conj(x)
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   float,      // alpha
+                                                   char>> {};  // datatype for the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sscal2vGenericTest);
+
+// Tests scal2v for float using randomly generated vector elements.
+TEST_P( sscal2vGenericTest, RandomData )
+{
+  using T = float;
+  //----------------------------------------------------------
+  // Initialize values from the parameters passed through
+  // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+  //----------------------------------------------------------
+  // denotes whether x or conj(x) will be used:
+  char conjx = std::get<0>(GetParam());
+  // vector length:
+  gtint_t n = std::get<1>(GetParam());
+  // stride size for x:
+  gtint_t incx = std::get<2>(GetParam());
+  // stride size for y:
+  gtint_t incy = std::get<3>(GetParam());
+  // alpha
+  T alpha = std::get<4>(GetParam());
+  // specifies the datatype for randomgenerators
+  char datatype = std::get<5>(GetParam());
+
+  // Set the threshold for the errors (double, the type test_scal2v takes):
+  double thresh = testinghelpers::getEpsilon<T>();
+  //----------------------------------------------------------
+  //     Call generic test body using those parameters
+  //----------------------------------------------------------
+  test_scal2v<T>(conjx, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Functor that gives every generated test case a readable name.
+// Floating-point text (e.g., 2.3) cannot be used in a test name, so alpha is
+// truncated to an integer (int(2.3)). This should be enough for debugging purposes.
+// Increments and alpha that are not positive are printed as "m" followed by their magnitude.
+class sscal2vGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, gtint_t, float, char>> str) const {
+        const auto& p = str.param;
+        const char conj = std::get<0>(p), datatype = std::get<5>(p);
+        const gtint_t n = std::get<1>(p), incx = std::get<2>(p), incy = std::get<3>(p);
+        const float alpha = std::get<4>(p);
+        // Text form of the signed parameters.
+        const std::string incx_str  = ( incx > 0 ) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        const std::string incy_str  = ( incy > 0 ) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        const std::string alpha_str = ( alpha > 0 ) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        // Layout: bli_sscal2v_<n>_<conj>_<incx>_<incy>_a<alpha>_<datatype>
+        std::string str_name = "bli_sscal2v";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(1, conj);
+        str_name += "_" + incx_str;
+        str_name += "_" + incy_str;
+        str_name += "_a" + alpha_str;
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of sscal2v.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(3.0), float(-5.0)),                      // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::sscal2vGenericTestPrint()
+    );
+
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only a few cases are tested, as a sanity check, since conj(x) = x for real types.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        Conjalpha,
+        sscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector lengths 3, 30 and 112
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(9.0)),                                   // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::sscal2vGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        sscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'), // n: use x
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),                  // n: vector lengths 3, 30 and 112
+            ::testing::Values(gtint_t(2), gtint_t(11)),                       // stride size for x
+            ::testing::Values(gtint_t(7)),                                  // stride size for y
+            ::testing::Values(float(2.0)),                              // alpha
+            ::testing::Values(ELEMENT_TYPE)                                          // datatype of the test data (i: integer, f: float)
+        ),
+        ::sscal2vGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scal2v/test_scal2v.h b/gtestsuite/testsuite/level1/scal2v/test_scal2v.h
new file mode 100644
index 0000000000..8edb967ab2
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scal2v/test_scal2v.h
@@ -0,0 +1,70 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "scal2v.h"
+#include "level1/ref_scal2v.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for the scal2v operation.
+ */
+
+template<typename T>
+static void test_scal2v(char conjx, gtint_t n, gtint_t incx, gtint_t incy, T alpha, double thresh, char datatype)
+{
+    //----------------------------------------------------------
+    //  Set up the operands: random x, and y pre-filled with -112.
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    const auto y_size = testinghelpers::buff_dim(n, incy);
+    std::vector<T> y( y_size, T{-112} );
+
+    //----------------------------------------------------------
+    //  Reference result, computed on a separate copy of y.
+    //----------------------------------------------------------
+    std::vector<T> y_expected = y;
+    testinghelpers::ref_scal2v<T>(conjx, n, alpha, x.data(), incx, y_expected.data(), incy);
+
+    //----------------------------------------------------------
+    //  Result of the interface under test, written into y.
+    //----------------------------------------------------------
+    scal2v<T>(conjx, n, alpha, x.data(), incx, y.data(), incy);
+
+    //----------------------------------------------------------
+    //  Compare both results component-wise using thresh.
+    //----------------------------------------------------------
+    computediff<T>( n, y.data(), y_expected.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp b/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp
new file mode 100644
index 0000000000..0308cbd10b
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scal2v/zscal2v_generic.cpp
@@ -0,0 +1,139 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scal2v.h"
+
+class zscal2vGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conjx: use x or conj(x)
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   gtint_t,    // incy: stride size for y
+                                                   dcomplex,   // alpha
+                                                   char>> {};  // datatype for the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zscal2vGenericTest);
+
+
+// Tests scal2v for dcomplex using randomly generated vector elements.
+TEST_P( zscal2vGenericTest, RandomData )
+{
+  using T = dcomplex;
+  //----------------------------------------------------------
+  // Initialize values from the parameters passed through
+  // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+  //----------------------------------------------------------
+  // denotes whether x or conj(x) will be used:
+  char conjx = std::get<0>(GetParam());
+  // vector length:
+  gtint_t n = std::get<1>(GetParam());
+  // stride size for x:
+  gtint_t incx = std::get<2>(GetParam());
+  // stride size for y:
+  gtint_t incy = std::get<3>(GetParam());
+  // alpha
+  T alpha = std::get<4>(GetParam());
+  // specifies the datatype for randomgenerators
+  char datatype = std::get<5>(GetParam());
+
+  // Set the threshold for the errors; kept in double so it is not narrowed to float:
+  double thresh = testinghelpers::getEpsilon<T>();
+  //----------------------------------------------------------
+  //     Call generic test body using those parameters
+  //----------------------------------------------------------
+  test_scal2v<T>(conjx, n, incx, incy, alpha, thresh, datatype);
+}
+
+// Functor that gives every generated test case a readable name.
+// Floating-point text (e.g., 2.3) cannot be used in a test name, so both parts of
+// alpha are truncated to integers (int(2.3)). This should be enough for debugging purposes.
+// Values that are not positive are printed as "m" followed by their magnitude.
+class zscal2vGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, gtint_t, dcomplex, char>> str) const {
+        const auto& p = str.param;
+        const char conj = std::get<0>(p), datatype = std::get<5>(p);
+        const gtint_t n = std::get<1>(p), incx = std::get<2>(p), incy = std::get<3>(p);
+        const dcomplex alpha = std::get<4>(p);
+        // Text form of the signed parameters; alpha is printed as <real>pi<imag>.
+        const std::string incx_str = ( incx > 0 ) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        const std::string incy_str = ( incy > 0 ) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        const std::string real_str = ( alpha.real > 0 ) ? std::to_string(int(alpha.real)) : "m" + std::to_string(int(std::abs(alpha.real)));
+        const std::string imag_str = ( alpha.imag > 0 ) ? std::to_string(int(alpha.imag)) : "m" + std::to_string(int(std::abs(alpha.imag)));
+        // Layout: bli_zscal2v_<n>_<conj>_<incx>_<incy>_a<real>pi<imag>_<datatype>
+        std::string str_name = "bli_zscal2v";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(1, conj);
+        str_name += "_" + incx_str;
+        str_name += "_" + incy_str;
+        str_name += "_a" + real_str + "pi" + imag_str;
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of zscal2v.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(dcomplex{3.0, -2.0}, dcomplex{-1.0, 4.0}),     // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::zscal2vGenericTestPrint()
+    );
+
+
+// Test for non-unit increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        zscal2vGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(gtint_t(3)),                                   // stride size for y
+            ::testing::Values(dcomplex{1.0, 2.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::zscal2vGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
new file mode 100644
index 0000000000..223fec91d7
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/cscalv_generic.cpp
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+class cscalvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_alpha: use alpha or conj(alpha)
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   scomplex,   // alpha
+                                                   char>> {};  // datatype for the random generators
+
+
+// Tests scalv for scomplex using randomly generated vector elements.
+TEST_P( cscalvGenericTest, RandomData )
+{
+  using T = scomplex;
+  //----------------------------------------------------------
+  // Unpack the parameter tuple supplied by the test suite
+  // instantiations (INSTANTIATE_TEST_SUITE_P).
+  //----------------------------------------------------------
+  const auto& params = GetParam();
+  // selects whether alpha or conj(alpha) is used:
+  char conj_alpha = std::get<0>(params);
+  // number of elements in x:
+  gtint_t n = std::get<1>(params);
+  // increment between consecutive elements of x:
+  gtint_t incx = std::get<2>(params);
+  // scaling factor:
+  T alpha = std::get<3>(params);
+  // datatype requested from the random generators:
+  char datatype = std::get<4>(params);
+  // error threshold handed to the generic test body:
+  double thresh = testinghelpers::getEpsilon<T>();
+  //----------------------------------------------------------
+  //     Run the generic test body with those values
+  //----------------------------------------------------------
+  test_scalv<T>(conj_alpha, n, incx, alpha, thresh, datatype);
+}
+
+// Functor that gives every generated test case a readable name.
+// Floating-point text (e.g., 2.3) cannot be used in a test name, so both parts of
+// alpha are truncated to integers (int(2.3)). This should be enough for debugging purposes.
+// Values that are not positive are printed as "m" followed by their magnitude.
+class cscalvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, scomplex, char>> str) const {
+        const auto& p = str.param;
+        const char conj = std::get<0>(p), datatype = std::get<4>(p);
+        const gtint_t n = std::get<1>(p), incx = std::get<2>(p);
+        const scomplex alpha = std::get<3>(p);
+        // Text form of the signed parameters; alpha is printed as <real>pi<imag>.
+        const std::string incx_str = ( incx > 0 ) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        const std::string real_str = ( alpha.real > 0 ) ? std::to_string(int(alpha.real)) : "m" + std::to_string(int(std::abs(alpha.real)));
+        const std::string imag_str = ( alpha.imag > 0 ) ? std::to_string(int(alpha.imag)) : "m" + std::to_string(int(std::abs(alpha.imag)));
+#ifdef TEST_BLAS
+        std::string str_name = "cscal_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_cscal";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_cscalv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(1, conj);
+        str_name += "_" + incx_str;
+        str_name += "_a" + real_str + "pi" + imag_str;
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of cscal.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use alpha, c: use conj(alpha)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::cscalvGenericTestPrint()
+    );
+
+
+// Test for non-unit positive increments.
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        cscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use alpha, c: use conj(alpha)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: vector length takes values from 10 to 30 with step size of 10.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::cscalvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (not instantiated when TEST_BLIS_TYPED is defined).
+// Only a few cases are tested, as a sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        cscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use alpha (no conjugation)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: vector length takes values from 10 to 30 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(scomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i: integer, f: float)
+        ),
+        ::cscalvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
new file mode 100644
index 0000000000..6410481560
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/dscalv_generic.cpp
@@ -0,0 +1,168 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+class dscalvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_alpha: whether alpha or conj(alpha) is used
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   double,     // alpha: scaling factor
+                                                   char>> {};  // datatype used by the random generators
+
+
+// Scales a randomly generated vector and compares it against the reference result.
+TEST_P( dscalvGenericTest, RandomData )
+{
+  using T = double;
+  //----------------------------------------------------------
+  // Unpack the parameter tuple supplied by the test suite
+  // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+  //   0: conj_alpha - whether alpha or conj(alpha) is used
+  //   1: n          - vector length
+  //   2: incx       - stride size for x
+  //   3: alpha      - scaling factor
+  //   4: datatype   - datatype for the random generators
+  //----------------------------------------------------------
+  const auto& params = GetParam();
+  const char    conj_alpha = std::get<0>(params);
+  const gtint_t n          = std::get<1>(params);
+  const gtint_t incx       = std::get<2>(params);
+  const T       alpha      = std::get<3>(params);
+  const char    datatype   = std::get<4>(params);
+
+  //----------------------------------------------------------
+  // Run the generic test body; the error threshold is the
+  // value returned by testinghelpers::getEpsilon<T>().
+  //----------------------------------------------------------
+  test_scalv<T>(conj_alpha, n, incx, alpha, testinghelpers::getEpsilon<T>(), datatype);
+}
+
+// Functor that produces a readable name for each generated test case.
+// Floating-point values (e.g., 2.3) cannot appear in the names, so only
+// int(alpha) is printed; non-positive values get an "m" prefix and are
+// printed by magnitude.
+class dscalvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, double, char>> str) const {
+        const auto& p = str.param;
+        const char    conj     = std::get<0>(p);
+        const gtint_t n        = std::get<1>(p);
+        const gtint_t incx     = std::get<2>(p);
+        const double  alpha    = std::get<3>(p);
+        const char    datatype = std::get<4>(p);
+        // The prefix names the routine of the interface under test.
+#ifdef TEST_BLAS
+        std::string str_name = "dscal_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dscal";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_dscalv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(1, conj);
+        str_name += "_" + (( incx > 0 ) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        str_name += "_a" + (( alpha > 0 ) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))));
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of dscal.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use alpha, not conj(alpha) (same value since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(double(2.0), double(-3.0)),                    // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dscalvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED
+// Test when conj(alpha) is requested. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(alpha) = alpha for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        Conjalpha,
+        dscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length takes values 3, 30 and 112.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(double(-3.0)),                                 // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dscalvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        dscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: no conjugation
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length takes values 3, 30 and 112.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // positive, non-unit stride size for x
+            ::testing::Values(double(3.0)),                                  // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dscalvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        dscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: no conjugation
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: vector length takes values 10, 20 and 30.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // negative stride size for x
+            ::testing::Values(double(3.0)),                                  // alpha, typed explicitly like the other instantiations
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dscalvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scalv/scalv.h b/gtestsuite/testsuite/level1/scalv/scalv.h
new file mode 100644
index 0000000000..0ae0125f52
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/scalv.h
@@ -0,0 +1,112 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             x := alpha * x
+ *          or x := conj(alpha) * x (for BLIS interface only)
+ * @param[in] conj_alpha denotes if alpha or conj(alpha) will be used for this operation
+ * @param[in] n vector length of x
+ * @param[in] alpha scalar
+ * @param[in,out] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ */
+
+// Dispatches to the BLAS routine matching T; every argument is forwarded by address.
+template<typename T>
+static void scalv_(gtint_t n, T alpha, T* x, gtint_t incx)
+{
+    constexpr bool is_s = std::is_same_v<T, float>;
+    constexpr bool is_d = std::is_same_v<T, double>;
+    constexpr bool is_c = std::is_same_v<T, scomplex>;
+    constexpr bool is_z = std::is_same_v<T, dcomplex>;
+    if constexpr (is_s)      sscal_( &n, &alpha, x, &incx );
+    else if constexpr (is_d) dscal_( &n, &alpha, x, &incx );
+    else if constexpr (is_c) cscal_( &n, &alpha, x, &incx );
+    else if constexpr (is_z) zscal_( &n, &alpha, x, &incx );
+    else throw std::runtime_error("Error in testsuite/level1/scalv.h: Invalid typename in scalv_().");
+}
+
+// Dispatches to the CBLAS routine matching T; real alpha goes by value, complex alpha by address.
+template<typename T>
+static void cblas_scalv(gtint_t n, T alpha, T* x, gtint_t incx)
+{
+    constexpr bool is_s = std::is_same_v<T, float>;
+    constexpr bool is_d = std::is_same_v<T, double>;
+    constexpr bool is_c = std::is_same_v<T, scomplex>;
+    constexpr bool is_z = std::is_same_v<T, dcomplex>;
+    if constexpr (is_s)      cblas_sscal( n, alpha, x, incx );
+    else if constexpr (is_d) cblas_dscal( n, alpha, x, incx );
+    else if constexpr (is_c) cblas_cscal( n, &alpha, x, incx );
+    else if constexpr (is_z) cblas_zscal( n, &alpha, x, incx );
+    else throw std::runtime_error("Error in testsuite/level1/scalv.h: Invalid typename in cblas_scalv().");
+}
+
+// Dispatches to the BLIS typed-API routine matching T, forwarding the conjugation flag.
+template<typename T>
+static void typed_scalv(char conj_alpha, gtint_t n, T alpha, T* x, gtint_t incx)
+{
+    // Translate the parameter character into the BLIS conj_t constant.
+    conj_t conjalpha;
+    testinghelpers::char_to_blis_conj( conj_alpha, &conjalpha );
+    constexpr bool is_s = std::is_same_v<T, float>;
+    constexpr bool is_d = std::is_same_v<T, double>;
+    constexpr bool is_c = std::is_same_v<T, scomplex>;
+    constexpr bool is_z = std::is_same_v<T, dcomplex>;
+    if constexpr (is_s)      bli_sscalv( conjalpha, n, &alpha, x, incx );
+    else if constexpr (is_d) bli_dscalv( conjalpha, n, &alpha, x, incx );
+    else if constexpr (is_c) bli_cscalv( conjalpha, n, &alpha, x, incx );
+    else if constexpr (is_z) bli_zscalv( conjalpha, n, &alpha, x, incx );
+    else throw std::runtime_error("Error in testsuite/level1/scalv.h: Invalid typename in typed_scalv().");
+}
+
+
+template<typename T>
+static void scalv(char conj_alpha, gtint_t n, T alpha, T* x, gtint_t incx)  // forwards to the interface selected at compile time
+{
+#ifdef TEST_BLAS
+    scalv_<T>( n, alpha, x, incx );                    // BLAS path: conj_alpha is not forwarded
+#elif TEST_CBLAS
+    cblas_scalv<T>( n, alpha, x, incx );               // CBLAS path: conj_alpha is not forwarded
+#elif TEST_BLIS_TYPED
+    typed_scalv<T>( conj_alpha, n, alpha, x, incx );   // BLIS typed path: the only one that uses conj_alpha
+#else
+    throw std::runtime_error("Error in testsuite/level1/scalv.h: No interfaces are set to be tested.");  // no interface macro is set
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp
new file mode 100644
index 0000000000..df350f91b5
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/scalv_extreme_cases.cpp
@@ -0,0 +1,90 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+template <typename T>
+class xscalv : public ::testing::Test {};               // typed fixture; T is the element type under test
+typedef ::testing::Types<float, double> xscalvTypes;    // own name, so it does not shadow the TypeParam visible inside TYPED_TEST bodies
+TYPED_TEST_SUITE(xscalv, xscalvTypes);
+
+TYPED_TEST(xscalv, zero_alpha_x_fp)
+{
+    using T = TypeParam;
+    gtint_t n = 10;
+    gtint_t incx = 1;
+    T alpha{0};
+
+    // Fill x with random data (generator datatype 'f') and keep a copy for the reference.
+    std::vector<T> x(n);
+    testinghelpers::datagenerators::randomgenerators(n, incx, x.data(), 'f');
+    std::vector<T> x_ref(x);
+
+    // Reference result: scale the copy by alpha = 0.
+    testinghelpers::ref_scalv<T>('n', n, alpha, x_ref.data(), incx);
+
+    // Result of the interface under test.
+    scalv<T>('n', n, alpha, x.data(), incx);
+
+    // Component-wise comparison; the threshold is the value
+    // returned by testinghelpers::getEpsilon<T>().
+    double thresh = testinghelpers::getEpsilon<T>();
+    computediff<T>( n, x.data(), x_ref.data(), incx, thresh );
+}
+
+TYPED_TEST(xscalv, zero_alpha_x_inf)
+{
+    using T = TypeParam;
+    gtint_t n = 10;
+    gtint_t incx = 1;
+    T alpha{0};
+
+    // Fill x with random data (generator datatype 'f'), then overwrite one
+    // element with 1.0/0.0 (intended to be +infinity) before taking the copy.
+    std::vector<T> x(n);
+    testinghelpers::datagenerators::randomgenerators(n, incx, x.data(), 'f');
+    x[3] = 1.0/0.0;
+    std::vector<T> x_ref(x);
+
+    // Reference result: scale the copy by alpha = 0.
+    testinghelpers::ref_scalv<T>('n', n, alpha, x_ref.data(), incx);
+
+    // Result of the interface under test.
+    scalv<T>('n', n, alpha, x.data(), incx);
+
+    // Component-wise comparison against the reference.
+    double thresh = testinghelpers::getEpsilon<T>();
+    computediff<T>( n, x.data(), x_ref.data(), incx, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
new file mode 100644
index 0000000000..7e37a0e8fc
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/sscalv_generic.cpp
@@ -0,0 +1,169 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+class sscalvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,       // conj_alpha: whether alpha or conj(alpha) is used
+                                                   gtint_t,    // n: vector length
+                                                   gtint_t,    // incx: stride size for x
+                                                   float,      // alpha: scaling factor
+                                                   char>> {};  // datatype used by the random generators
+
+
+// Scales a randomly generated vector and compares it against the reference result.
+TEST_P( sscalvGenericTest, RandomData )
+{
+  using T = float;
+  //----------------------------------------------------------
+  // Unpack the parameter tuple supplied by the test suite
+  // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+  //   0: conj_alpha - whether alpha or conj(alpha) is used
+  //   1: n          - vector length
+  //   2: incx       - stride size for x
+  //   3: alpha      - scaling factor
+  //   4: datatype   - datatype for the random generators
+  //----------------------------------------------------------
+  const auto& params = GetParam();
+  const char    conj_alpha = std::get<0>(params);
+  const gtint_t n          = std::get<1>(params);
+  const gtint_t incx       = std::get<2>(params);
+  const T       alpha      = std::get<3>(params);
+  const char    datatype   = std::get<4>(params);
+
+  //----------------------------------------------------------
+  // Run the generic test body; the error threshold is the
+  // value returned by testinghelpers::getEpsilon<T>().
+  //----------------------------------------------------------
+  test_scalv<T>(conj_alpha, n, incx, alpha, testinghelpers::getEpsilon<T>(), datatype);
+}
+
+// Functor that produces a readable name for each generated test case.
+// Floating-point values (e.g., 2.3) cannot appear in the names, so only
+// int(alpha) is printed; non-positive values get an "m" prefix and are
+// printed by magnitude.
+class sscalvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, float, char>> str) const {
+        const auto& p = str.param;
+        const char    conj     = std::get<0>(p);
+        const gtint_t n        = std::get<1>(p);
+        const gtint_t incx     = std::get<2>(p);
+        const float   alpha    = std::get<3>(p);
+        const char    datatype = std::get<4>(p);
+        // The prefix names the routine of the interface under test.
+#ifdef TEST_BLAS
+        std::string str_name = "sscal_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_sscal";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_sscalv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(1, conj);
+        str_name += "_" + (( incx > 0 ) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        str_name += "_a" + (( alpha > 0 ) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))));
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of sscal.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use alpha, not conj(alpha) (same value since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(float(3.0), float(-5.0)),                      // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sscalvGenericTestPrint()
+    );
+
+#ifdef TEST_BLIS_TYPED
+// Test when conj(alpha) is requested. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(alpha) = alpha for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        Conjalpha,
+        sscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conjugate
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length takes values 3, 30 and 112.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(float(9.0)),                                   // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sscalvGenericTestPrint()
+    );
+#endif
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        sscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: no conjugation
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length takes values 3, 30 and 112.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // positive, non-unit stride size for x
+            ::testing::Values(float(2.0)),                                   // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sscalvGenericTestPrint()
+    );
+
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        sscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: no conjugation
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: vector length takes values 10, 20 and 30.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // negative stride size for x
+            ::testing::Values(float(3.0)),                                   // alpha, typed explicitly like the other instantiations
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sscalvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/scalv/test_scalv.h b/gtestsuite/testsuite/level1/scalv/test_scalv.h
new file mode 100644
index 0000000000..bfe7f9bfde
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/test_scalv.h
@@ -0,0 +1,69 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "scalv.h"
+#include "level1/ref_scalv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for the scalv operation: runs the reference and
+ *        the tested implementation on the same input and compares them.
+ */
+template<typename T>
+static void test_scalv(char conja_alpha, gtint_t n, gtint_t incx, T alpha, double thresh, char datatype)
+{
+    //----------------------------------------------------------
+    //  Input: random vector (the -10, 10 arguments are presumably
+    //  the value range - confirm in get_random_vector).
+    //----------------------------------------------------------
+    std::vector<T> x_test = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+
+    //----------------------------------------------------------
+    //  Reference: scale a copy of the input, leaving x_test untouched.
+    //----------------------------------------------------------
+    std::vector<T> x_expected(x_test);
+    testinghelpers::ref_scalv<T>(conja_alpha, n, alpha, x_expected.data(), incx);
+
+    //----------------------------------------------------------
+    //  Tested interface: scale the input in place.
+    //----------------------------------------------------------
+    scalv<T>(conja_alpha, n, alpha, x_test.data(), incx);
+
+    //----------------------------------------------------------
+    //  Compare component-wise against the reference, using thresh.
+    //----------------------------------------------------------
+    computediff<T>( n, x_test.data(), x_expected.data(), incx, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
new file mode 100644
index 0000000000..6ddf2489d9
--- /dev/null
+++ b/gtestsuite/testsuite/level1/scalv/zscalv_generic.cpp
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_scalv.h"
+
+class zscalvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,        // conj_alpha: use alpha or conj(alpha)
+                                                   gtint_t,     // n: vector length
+                                                   gtint_t,     // incx: stride size for x
+                                                   dcomplex,    // alpha: scaling factor
+                                                   char>> {};   // datatype used by the random generators
+
+
+// Scales a randomly generated vector and checks it against the reference result.
+TEST_P( zscalvGenericTest, RandomData )
+{
+  using T = dcomplex;
+  //----------------------------------------------------------
+  // Initialize values from the parameters passed through
+  // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+  //----------------------------------------------------------
+  // denotes whether alpha or conj(alpha) will be used:
+  char conj_alpha = std::get<0>(GetParam());
+  // vector length:
+  gtint_t n = std::get<1>(GetParam());
+  // stride size for x:
+  gtint_t incx = std::get<2>(GetParam());
+  // scaling factor alpha:
+  T alpha = std::get<3>(GetParam());
+  // specifies the datatype for the random generators:
+  char datatype = std::get<4>(GetParam());
+
+  // Set the threshold for the errors (value returned by getEpsilon for T):
+  double thresh = testinghelpers::getEpsilon<T>();
+  //----------------------------------------------------------
+  //     Call generic test body using those parameters
+  //----------------------------------------------------------
+  test_scalv<T>(conj_alpha, n, incx, alpha, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name of the form
+//   <api>_<n>_<conj>_<incx>_a<re>pi<im>_<datatype>, where a value that is not > 0 (zero included) is printed as m<abs>.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class zscalvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, gtint_t, gtint_t, dcomplex, char>> str) const {
+        char conj = std::get<0>(str.param);
+        gtint_t n = std::get<1>(str.param);
+        gtint_t incx = std::get<2>(str.param);
+        dcomplex alpha = std::get<3>(str.param);
+        char datatype = std::get<4>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zscal_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zscal";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_zscalv";
+#endif
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing for generic and main use of zscal.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use alpha, c: use conj(alpha)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(dcomplex{3.0, -2.0}, dcomplex{-1.0, 4.0}),     // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zscalvGenericTestPrint()
+    );
+
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitPositiveIncrements,
+        zscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'
+#ifdef TEST_BLIS_TYPED
+            , 'c'                                                            // this option is BLIS-api specific.
+#endif
+            ),                                                               // n: use alpha, c: use conj(alpha)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: size of vector takes values from 10 to 30 with step size of 10.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      // stride size for x (positive, non-unit values only)
+            ::testing::Values(dcomplex{1.0, 2.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zscalvGenericTestPrint()
+    );
+
+#ifndef TEST_BLIS_TYPED
+// Test for negative increments (not compiled when the BLIS typed interface is tested).
+// Only test very few cases as sanity check.
+// The values can be adjusted based on implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NegativeIncrements,
+        zscalvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use alpha
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: size of vector takes values from 10 to 30 with step size of 10.
+            ::testing::Values(gtint_t(-2), gtint_t(-1)),                     // stride size for x
+            ::testing::Values(dcomplex{4.0, 3.1}),                           // alpha
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zscalvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/setv/csetv_generic.cpp b/gtestsuite/testsuite/level1/setv/csetv_generic.cpp
new file mode 100644
index 0000000000..2a2daf72fd
--- /dev/null
+++ b/gtestsuite/testsuite/level1/setv/csetv_generic.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_setv.h"
+
+class csetvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t>> {};   // (conjalpha, n, incx)
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csetvGenericTest);   // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( csetvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Unpack the (conjalpha, n, incx) tuple supplied through
+    // INSTANTIATE_TEST_SUITE_P for this test case.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // selects between alpha and conj(alpha):
+    const char conj_flag = std::get<0>(params);
+    // number of elements of x:
+    const gtint_t length = std::get<1>(params);
+    // increment between elements of x:
+    const gtint_t stride = std::get<2>(params);
+    const T value = {1.2, 2.0};   // fixed alpha handed to the test body
+    //----------------------------------------------------------
+    //     Run the generic setv test body on these inputs
+    //----------------------------------------------------------
+    test_setv<T>( conj_flag, length, value, stride );
+}
+
+// Builds the test-case name "bli_csetv_<n>_<conj>_<incx>".
+class csetvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t>> info) const {
+        const char    conj_flag = std::get<0>(info.param);
+        const gtint_t length    = std::get<1>(info.param);
+        const gtint_t stride    = std::get<2>(info.param);
+        // Strides that are not positive are written as "m<abs>" instead of using '-'.
+        std::string stride_tag = std::to_string(std::abs(stride));
+        if ( !(stride > 0) )
+            stride_tag = "m" + stride_tag;
+        const std::string conj_tag(1, conj_flag);
+        return "bli_csetv_" + std::to_string(length) + "_" + conj_tag + "_" + stride_tag;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; only compiled when the BLIS typed interface is tested.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        csetvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use alpha, c: use conj(alpha)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1))                                    // stride size for x
+        ),
+        ::csetvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp b/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp
new file mode 100644
index 0000000000..6051169bbc
--- /dev/null
+++ b/gtestsuite/testsuite/level1/setv/dsetv_generic.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_setv.h"
+
+class dsetvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t>> {};   // (conjalpha, n, incx)
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dsetvGenericTest);   // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( dsetvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Unpack the (conjalpha, n, incx) tuple supplied through
+    // INSTANTIATE_TEST_SUITE_P for this test case.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // selects between alpha and conj(alpha):
+    const char conj_flag = std::get<0>(params);
+    // number of elements of x:
+    const gtint_t length = std::get<1>(params);
+    // increment between elements of x:
+    const gtint_t stride = std::get<2>(params);
+    const T value = {1.2};   // fixed alpha handed to the test body
+    //----------------------------------------------------------
+    //     Run the generic setv test body on these inputs
+    //----------------------------------------------------------
+    test_setv<T>( conj_flag, length, value, stride );
+}
+
+// Builds the test-case name "bli_dsetv_<n>_<conj>_<incx>".
+class dsetvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t>> info) const {
+        const char    conj_flag = std::get<0>(info.param);
+        const gtint_t length    = std::get<1>(info.param);
+        const gtint_t stride    = std::get<2>(info.param);
+        // Strides that are not positive are written as "m<abs>" instead of using '-'.
+        std::string stride_tag = std::to_string(std::abs(stride));
+        if ( !(stride > 0) )
+            stride_tag = "m" + stride_tag;
+        const std::string conj_tag(1, conj_flag);
+        return "bli_dsetv_" + std::to_string(length) + "_" + conj_tag + "_" + stride_tag;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; only compiled when the BLIS typed interface is tested.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dsetvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use alpha as is
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1))                                    // stride size for x
+        ),
+        ::dsetvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/setv/setv.h b/gtestsuite/testsuite/level1/setv/setv.h
new file mode 100644
index 0000000000..651ec36b90
--- /dev/null
+++ b/gtestsuite/testsuite/level1/setv/setv.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Sets the elements of x to alpha or conj(alpha):
+ *             x := conjalpha(alpha) (BLIS_TYPED only)
+ * @param[in] conjalpha denotes if alpha or conj(alpha) will be used for this operation
+ * @param[in] n vector length of x
+ * @param[in] alpha pointer to the value to set in vector x.
+ * @param[in,out] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ */
+
+template<typename T>
+static void typed_setv(char conjalpha, gtint_t n, T* alpha, T* x, gtint_t incx)
+{
+    conj_t conjx;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conjalpha, &conjx );
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssetv( conjx, n, alpha, x, incx );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsetv( conjx, n, alpha, x, incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_csetv( conjx, n, alpha, x, incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zsetv( conjx, n, alpha, x, incx );
+    else   // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/setv.h: Invalid typename in typed_setv().");
+}
+
+template<typename T>   // interface-selecting wrapper: only TEST_BLIS_TYPED reaches typed_setv, every other configuration throws
+static void setv(char conjalpha, gtint_t n, T* alpha, T* x, gtint_t incx)
+{
+#ifdef TEST_BLAS
+    throw std::runtime_error("Error in testsuite/level1/setv.h: BLAS interface is not available.");
+#elif TEST_CBLAS
+    throw std::runtime_error("Error in testsuite/level1/setv.h: CBLAS interface is not available.");
+#elif TEST_BLIS_TYPED
+    typed_setv(conjalpha, n, alpha, x, incx);
+#else   // no interface macro defined
+    throw std::runtime_error("Error in testsuite/level1/setv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp b/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp
new file mode 100644
index 0000000000..2590619ea2
--- /dev/null
+++ b/gtestsuite/testsuite/level1/setv/ssetv_generic.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_setv.h"
+
+class ssetvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t>> {};   // (conjalpha, n, incx)
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ssetvGenericTest);   // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( ssetvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Unpack the (conjalpha, n, incx) tuple supplied through
+    // INSTANTIATE_TEST_SUITE_P for this test case.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // selects between alpha and conj(alpha):
+    const char conj_flag = std::get<0>(params);
+    // number of elements of x:
+    const gtint_t length = std::get<1>(params);
+    // increment between elements of x:
+    const gtint_t stride = std::get<2>(params);
+    const T value = {1.2};   // fixed alpha handed to the test body
+    //----------------------------------------------------------
+    //     Run the generic setv test body on these inputs
+    //----------------------------------------------------------
+    test_setv<T>( conj_flag, length, value, stride );
+}
+
+// Builds the test-case name "bli_ssetv_<n>_<conj>_<incx>".
+class ssetvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t>> info) const {
+        const char    conj_flag = std::get<0>(info.param);
+        const gtint_t length    = std::get<1>(info.param);
+        const gtint_t stride    = std::get<2>(info.param);
+        // Strides that are not positive are written as "m<abs>" instead of using '-'.
+        std::string stride_tag = std::to_string(std::abs(stride));
+        if ( !(stride > 0) )
+            stride_tag = "m" + stride_tag;
+        const std::string conj_tag(1, conj_flag);
+        return "bli_ssetv_" + std::to_string(length) + "_" + conj_tag + "_" + stride_tag;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; only compiled when the BLIS typed interface is tested.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ssetvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use alpha as is
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1))                                    // stride size for x
+        ),
+        ::ssetvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/setv/test_setv.h b/gtestsuite/testsuite/level1/setv/test_setv.h
new file mode 100644
index 0000000000..09bd121f6e
--- /dev/null
+++ b/gtestsuite/testsuite/level1/setv/test_setv.h
@@ -0,0 +1,74 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "setv.h"
+#include "common/testing_helpers.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for setv operation: calls setv on a sentinel-filled vector and checks each addressed element against alpha (or conj(alpha)).
+ */
+
+template<typename T>
+void test_setv( char conjalpha, gtint_t n, T alpha, gtint_t incx ) {
+    //----------------------------------------------------------
+    //     Allocate x and fill it with the sentinel value -1.
+    //----------------------------------------------------------
+    std::vector<T> x( testinghelpers::buff_dim(n, incx), T{-1} );
+
+    //----------------------------------------------------------
+    //   Compute the expected value: alpha or conj(alpha).
+    //----------------------------------------------------------
+    T alpha_ref = alpha;
+    if( testinghelpers::chkconj( conjalpha ) ) {
+        alpha_ref = testinghelpers::conj<T>( alpha );
+    }
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    setv( conjalpha, n, &alpha, x.data(), incx );
+
+    //----------------------------------------------------------
+    //   Check that each addressed element equals alpha_ref.
+    //----------------------------------------------------------
+    gtint_t i,idx;
+    for( idx = 0 ; idx < n ; idx++ )
+    {
+        i = (incx > 0) ? (idx * incx) : ( - ( n - idx - 1 ) * incx );   // non-positive incx: offset is (n-1-idx)*(-incx)
+        EXPECT_EQ(x[i], alpha_ref) << "blis_sol[" << i << "]="<< x[i] <<"   ref = "  << alpha_ref;
+    }
+}
diff --git a/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp b/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp
new file mode 100644
index 0000000000..d12271612f
--- /dev/null
+++ b/gtestsuite/testsuite/level1/setv/zsetv_generic.cpp
@@ -0,0 +1,93 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_setv.h"
+
+class zsetvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t>> {};   // (conjalpha, n, incx)
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zsetvGenericTest);   // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( zsetvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Unpack the (conjalpha, n, incx) tuple supplied through
+    // INSTANTIATE_TEST_SUITE_P for this test case.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // selects between alpha and conj(alpha):
+    const char conj_flag = std::get<0>(params);
+    // number of elements of x:
+    const gtint_t length = std::get<1>(params);
+    // increment between elements of x:
+    const gtint_t stride = std::get<2>(params);
+    const T value = {1.2, 2.0};   // fixed alpha handed to the test body
+    //----------------------------------------------------------
+    //     Run the generic setv test body on these inputs
+    //----------------------------------------------------------
+    test_setv<T>( conj_flag, length, value, stride );
+}
+
+// Builds the test-case name "bli_zsetv_<n>_<conj>_<incx>".
+class zsetvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t>> info) const {
+        const char    conj_flag = std::get<0>(info.param);
+        const gtint_t length    = std::get<1>(info.param);
+        const gtint_t stride    = std::get<2>(info.param);
+        // Strides that are not positive are written as "m<abs>" instead of using '-'.
+        std::string stride_tag = std::to_string(std::abs(stride));
+        if ( !(stride > 0) )
+            stride_tag = "m" + stride_tag;
+        const std::string conj_tag(1, conj_flag);
+        return "bli_zsetv_" + std::to_string(length) + "_" + conj_tag + "_" + stride_tag;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; only compiled when the BLIS typed interface is tested.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zsetvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use alpha, c: use conj(alpha)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: size of vector takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1))                                    // stride size for x
+        ),
+        ::zsetvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/subv/csubv_generic.cpp b/gtestsuite/testsuite/level1/subv/csubv_generic.cpp
new file mode 100644
index 0000000000..7b98a8ebfb
--- /dev/null
+++ b/gtestsuite/testsuite/level1/subv/csubv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_subv.h"
+
+class csubvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {}; // <conj_x, n, incx, incy, datatype>
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(csubvGenericTest); // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( csubvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be subtracted from y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<4>(GetParam());
+
+    // Set the threshold for the errors:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_subv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the gtest case name bli_csubv_<n>_<conj>_<incx>_<incy>_<datatype>; a non-positive stride is printed as "m" followed by its absolute value.
+class csubvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        char datatype  = std::get<4>(str.param);
+        std::string str_name = "bli_csubv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        csubvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, 10 to 100 in steps of 10
+            ::testing::Values(gtint_t(1), gtint_t(4)),                       // stride size for x
+            ::testing::Values(gtint_t(1), gtint_t(7)),                       // stride size for y
+            ::testing::Values(ELEMENT_TYPE,'f')                              // datatype of the random data (i: integer, f: float)
+        ),
+        ::csubvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp b/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp
new file mode 100644
index 0000000000..9b31bcb102
--- /dev/null
+++ b/gtestsuite/testsuite/level1/subv/dsubv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_subv.h"
+
+class dsubvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {}; // <conj_x, n, incx, incy, datatype>
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dsubvGenericTest); // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( dsubvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be subtracted from y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<4>(GetParam());
+
+    // Set the threshold for the errors:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_subv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the gtest case name bli_dsubv_<n>_<conj>_<incx>_<incy>_<datatype>; a non-positive stride is printed as "m" followed by its absolute value.
+class dsubvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        char datatype  = std::get<4>(str.param);
+        std::string str_name = "bli_dsubv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dsubvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x (no conjugation)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, 10 to 100 in steps of 10
+            ::testing::Values(gtint_t(1), gtint_t(4)),                       // stride size for x
+            ::testing::Values(gtint_t(1), gtint_t(7)),                       // stride size for y
+            ::testing::Values(ELEMENT_TYPE,'f')                              // datatype of the random data (i: integer, f: float)
+        ),
+        ::dsubvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp b/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp
new file mode 100644
index 0000000000..4d96efc4e1
--- /dev/null
+++ b/gtestsuite/testsuite/level1/subv/ssubv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_subv.h"
+
+class ssubvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {}; // <conj_x, n, incx, incy, datatype>
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ssubvGenericTest); // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( ssubvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be subtracted from y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<4>(GetParam());
+
+    // Set the threshold for the errors:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_subv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the gtest case name bli_ssubv_<n>_<conj>_<incx>_<incy>_<datatype>; a non-positive stride is printed as "m" followed by its absolute value.
+class ssubvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        char datatype  = std::get<4>(str.param);
+        std::string str_name = "bli_ssubv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ssubvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x (no conjugation)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, 10 to 100 in steps of 10
+            ::testing::Values(gtint_t(1), gtint_t(4)),                       // stride size for x
+            ::testing::Values(gtint_t(1), gtint_t(7)),                       // stride size for y
+            ::testing::Values(ELEMENT_TYPE,'f')                              // datatype of the random data (i: integer, f: float)
+        ),
+        ::ssubvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/subv/subv.h b/gtestsuite/testsuite/level1/subv/subv.h
new file mode 100644
index 0000000000..ff5059d6ff
--- /dev/null
+++ b/gtestsuite/testsuite/level1/subv/subv.h
@@ -0,0 +1,81 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation
+ *             y := y - x or y := y - conj(x) (BLIS_TYPED only)
+ * @param[in] conj_x denotes if x or conj(x) will be used for this operation
+ * @param[in] n vector length of x and y
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[in, out] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ */
+
+template<typename T>
+static void typed_subv(char conj_x, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+    conj_t conjx;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conj_x, &conjx );
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssubv( conjx, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsubv( conjx, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_csubv( conjx, n, x, incx, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zsubv( conjx, n, x, incx, y, incy );
+    else // reached only when T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/subv.h: Invalid typename in typed_subv().");
+}
+
+template<typename T>
+static void subv(char conjx, gtint_t n, T* x, gtint_t incx, T* y, gtint_t incy)
+{
+#if defined(TEST_BLAS)         // interface selection: subv is not available through the BLAS interface
+    throw std::runtime_error("Error in testsuite/level1/subv.h: BLAS interface is not available.");
+#elif defined(TEST_CBLAS)      // defined() keeps the check valid even when the macro is defined with an empty value
+    throw std::runtime_error("Error in testsuite/level1/subv.h: CBLAS interface is not available.");
+#elif defined(TEST_BLIS_TYPED) // forward to the BLIS typed API (bli_?subv)
+    typed_subv(conjx, n, x, incx, y, incy);
+#else
+    throw std::runtime_error("Error in testsuite/level1/subv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/subv/test_subv.h b/gtestsuite/testsuite/level1/subv/test_subv.h
new file mode 100644
index 0000000000..9406823bd3
--- /dev/null
+++ b/gtestsuite/testsuite/level1/subv/test_subv.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "subv.h"
+#include "level1/ref_subv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for subv: runs ref_subv and subv on identical random x and y, then compares y component-wise against thresh.
+ */
+
+template<typename T>
+void test_subv( char conjx, gtint_t n, gtint_t incx, gtint_t incy,
+               double thresh, char datatype ) {
+    //----------------------------------------------------------
+    //        Initialize vectors with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    // Copy y before it is overwritten so the reference runs on the same input.
+    std::vector<T> y_ref(y);
+    testinghelpers::ref_subv<T>(conjx, n, x.data(), incx, y_ref.data(), incy);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    subv(conjx, n, x.data(), incx, y.data(), incy);
+
+    //----------------------------------------------------------
+    //              Compute component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( n, y.data(), y_ref.data(), incy, thresh );
+
+}
diff --git a/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp b/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp
new file mode 100644
index 0000000000..2fa7236e64
--- /dev/null
+++ b/gtestsuite/testsuite/level1/subv/zsubv_generic.cpp
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_subv.h"
+
+class zsubvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char, gtint_t, gtint_t, gtint_t, char>> {}; // <conj_x, n, incx, incy, datatype>
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zsubvGenericTest); // the only instantiation in this file is guarded by TEST_BLIS_TYPED
+
+TEST_P( zsubvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be subtracted from y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<4>(GetParam());
+
+    // Set the threshold for the errors:
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_subv<T>(conj_x, n, incx, incy, thresh, datatype);
+}
+
+// Builds the gtest case name bli_zsubv_<n>_<conj>_<incx>_<incy>_<datatype>; a non-positive stride is printed as "m" followed by its absolute value.
+class zsubvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char conj      = std::get<0>(str.param);
+        gtint_t n      = std::get<1>(str.param);
+        gtint_t incx   = std::get<2>(str.param);
+        gtint_t incy   = std::get<3>(str.param);
+        char datatype  = std::get<4>(str.param);
+        std::string str_name = "bli_zsubv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zsubvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n','c'),                                      // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, 10 to 100 in steps of 10
+            ::testing::Values(gtint_t(1), gtint_t(4)),                       // stride size for x
+            ::testing::Values(gtint_t(1), gtint_t(7)),                       // stride size for y
+            ::testing::Values(ELEMENT_TYPE,'f')                              // datatype of the random data (i: integer, f: float)
+        ),
+        ::zsubvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp
new file mode 100644
index 0000000000..7af0647138
--- /dev/null
+++ b/gtestsuite/testsuite/level1/xpbyv/cxpbyv_generic.cpp
@@ -0,0 +1,138 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_xpbyv.h"
+
+class cxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,        // conj_x
+                                                   gtint_t,     // n
+                                                   gtint_t,     // incx
+                                                   gtint_t,     // incy
+                                                   scomplex,    // beta
+                                                   char>> {};   // datatype
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cxpbyvGenericTest); // every instantiation in this file is guarded by TEST_BLIS_TYPED
+
+// Tests using random vector elements; the datatype parameter selects the kind of random data.
+TEST_P( cxpbyvGenericTest, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // beta
+    T beta = std::get<4>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors:
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_xpbyv<T>(conj_x, n, incx, incy, beta, thresh, datatype);
+}
+
+// Used to generate a test case with a sensible name: bli_cxpbyv_<n>_<conj>_<incx>_<incy>_b<re>pi<im>_<datatype>.
+// Non-positive strides and beta components are printed as "m" followed by their absolute value.
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class cxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,scomplex,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        scomplex beta = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+        std::string str_name = "bli_cxpbyv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                  beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of cxpbyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, 10 to 100 in steps of 10
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the random data (i: integer, f: float)
+        ),
+        ::cxpbyvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        cxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length, 10 to 100 in steps of 10
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      /*(gtint_t(-5), gtint_t(-17))*/  // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      /*(gtint_t(-12), gtint_t(-4))*/  // stride size for y
+            ::testing::Values(scomplex{4.0, 3.1}),                           // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the random data (i: integer, f: float)
+        ),
+        ::cxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp
new file mode 100644
index 0000000000..15e06808c0
--- /dev/null
+++ b/gtestsuite/testsuite/level1/xpbyv/dxpbyv_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_xpbyv.h"
+
+class dxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,      // conj_x: use x ('n') or conj(x) ('c')
+                                                   gtint_t,   // n: vector length
+                                                   gtint_t,   // incx: stride size for x
+                                                   gtint_t,   // incy: stride size for y
+                                                   double,    // beta
+                                                   char>> {}; // datatype handed to the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dxpbyvGenericTest); // all instantiations below are guarded by TEST_BLIS_TYPED
+
+// Runs the generic xpbyv test body on randomly generated vectors for each parameter combination.
+TEST_P( dxpbyvGenericTest, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to beta*y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // scalar beta:
+    T beta = std::get<4>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (twice the value returned by getEpsilon<T>()):
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_xpbyv<T>(conj_x, n, incx, incy, beta, thresh, datatype);
+}
+
+// Used to generate a test case name of the form bli_dxpbyv_<n>_<conj>_<incx>_<incy>_b<beta>_<datatype>.
+// Non-positive values are printed as "m" followed by the absolute value (so 0 is printed as "m0").
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class dxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,double,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        double beta   = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+        std::string str_name = "bli_dxpbyv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of dxpbyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(2.0), double(-2.0)),                    // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dxpbyvGenericTestPrint()
+    );
+
+
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        dxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(double(2.0)),                                  // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dxpbyvGenericTestPrint()
+    );
+
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        dxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      /*(gtint_t(-5), gtint_t(-17))*/  // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      /*(gtint_t(-12), gtint_t(-4))*/  // stride size for y
+            ::testing::Values(double(4.0)),                                  // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp
new file mode 100644
index 0000000000..b424025ce7
--- /dev/null
+++ b/gtestsuite/testsuite/level1/xpbyv/sxpbyv_generic.cpp
@@ -0,0 +1,156 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_xpbyv.h"
+
+class sxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,      // conj_x: use x ('n') or conj(x) ('c')
+                                                   gtint_t,   // n: vector length
+                                                   gtint_t,   // incx: stride size for x
+                                                   gtint_t,   // incy: stride size for y
+                                                   float,     // beta
+                                                   char>> {}; // datatype handed to the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sxpbyvGenericTest); // all instantiations below are guarded by TEST_BLIS_TYPED
+
+// Runs the generic xpbyv test body on randomly generated vectors for each parameter combination.
+TEST_P( sxpbyvGenericTest, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to beta*y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // scalar beta:
+    T beta = std::get<4>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (twice the value returned by getEpsilon<T>()):
+    float thresh = 2*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_xpbyv<T>(conj_x, n, incx, incy, beta, thresh, datatype);
+}
+
+// Used to generate a test case name of the form bli_sxpbyv_<n>_<conj>_<incx>_<incy>_b<beta>_<datatype>.
+// Non-positive values are printed as "m" followed by the absolute value (so 0 is printed as "m0").
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class sxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,float,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        float beta    = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+        std::string str_name = "bli_sxpbyv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of sxpbyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(2.0), float(-2.0)),                      // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sxpbyvGenericTestPrint()
+    );
+
+// Test when conjugate of x is used as an argument. This option is BLIS-api specific.
+// Only test very few cases as sanity check since conj(x) = x for real types.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        ConjX,
+        sxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('c'),                                          // c: use conj(x)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(float(2.0)),                                   // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sxpbyvGenericTestPrint()
+    );
+
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        sxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n'),                                          // n: use x, not conj(x) (since it is real)
+            ::testing::Values(gtint_t(3), gtint_t(30), gtint_t(112)),        // n: vector length
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      /*(gtint_t(-5), gtint_t(-17))*/  // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      /*(gtint_t(-12), gtint_t(-4))*/  // stride size for y
+            ::testing::Values(float(4.0)),                                   // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::sxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h b/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h
new file mode 100644
index 0000000000..46af04c30e
--- /dev/null
+++ b/gtestsuite/testsuite/level1/xpbyv/test_xpbyv.h
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "xpbyv.h"
+#include "level1/ref_xpbyv.h"
+#include "inc/check_error.h"
+
+/**
+ * @brief Generic test body for the xpbyv operation: runs the tested and the reference xpbyv on the same random x, y and compares the resulting y.
+ */
+
+template<typename T>
+static void test_xpbyv(char conjx, gtint_t n, gtint_t incx, gtint_t incy,
+    T beta, double thresh, char datatype ) {
+
+    //----------------------------------------------------------
+    //        Initialize vectors with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-10, 10, n, incy, datatype);
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    // Create a copy of y so that the reference and the tested call start from the same input.
+    std::vector<T> y_ref(y);
+    testinghelpers::ref_xpbyv<T>(conjx, n, x.data(), incx, beta, y_ref.data(), incy);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    xpbyv<T>(conjx, n, x.data(), incx, beta, y.data(), incy);
+
+    //----------------------------------------------------------
+    //     Compute component-wise error of y against y_ref.
+    //----------------------------------------------------------
+    computediff<T>( n, y.data(), y_ref.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level1/xpbyv/xpbyv.h b/gtestsuite/testsuite/level1/xpbyv/xpbyv.h
new file mode 100644
index 0000000000..2b3a15fbd5
--- /dev/null
+++ b/gtestsuite/testsuite/level1/xpbyv/xpbyv.h
@@ -0,0 +1,82 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *             y := beta * y + x or y := beta * y + conj(x) (BLIS_TYPED only)
+ * @param[in] conjx denotes if x or conj(x) will be used for this operation
+ * @param[in] n vector length of x and y
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @param[in] beta scalar
+ * @param[in, out] y pointer which points to the first element of y
+ * @param[in] incy increment of y
+ */
+
+template<typename T>
+static void typed_xpbyv(char conj_x, gtint_t n, T* x, gtint_t incx, T beta, T* y, gtint_t incy)
+{
+    conj_t conjx;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conj_x, &conjx );
+    // Select the BLIS typed API routine that matches T; beta is passed by address.
+    if constexpr (std::is_same<T, float>::value)
+        bli_sxpbyv( conjx, n, x, incx, &beta, y, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dxpbyv( conjx, n, x, incx, &beta, y, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cxpbyv( conjx, n, x, incx, &beta, y, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zxpbyv( conjx, n, x, incx, &beta, y, incy );
+    else // T is none of float, double, scomplex, dcomplex
+        throw std::runtime_error("Error in testsuite/level1/xpbyv.h: Invalid typename in typed_xpbyv().");
+}
+
+template<typename T>
+static void xpbyv(char conj_x, gtint_t n, T* x, gtint_t incx, T beta, T* y, gtint_t incy)
+{   // Forwards to the interface selected at compile time; only the BLIS typed API is wired up, the others throw.
+#if defined(TEST_BLAS)
+    throw std::runtime_error("Error in testsuite/level1/xpbyv.h: BLAS interface is not available.");
+#elif defined(TEST_CBLAS)   // defined() instead of a bare macro: no -Wundef warning, and a valueless define still compiles
+    throw std::runtime_error("Error in testsuite/level1/xpbyv.h: CBLAS interface is not available.");
+#elif defined(TEST_BLIS_TYPED)   // same test the *_generic.cpp files use (#ifdef TEST_BLIS_TYPED)
+    typed_xpbyv<T>( conj_x, n, x, incx, beta, y, incy );
+#else
+    throw std::runtime_error("Error in testsuite/level1/xpbyv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp b/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp
new file mode 100644
index 0000000000..cea3e8a086
--- /dev/null
+++ b/gtestsuite/testsuite/level1/xpbyv/zxpbyv_generic.cpp
@@ -0,0 +1,138 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_xpbyv.h"
+
+class zxpbyvGenericTest :
+        public ::testing::TestWithParam<std::tuple<char,      // conj_x: use x ('n') or conj(x) ('c')
+                                                   gtint_t,   // n: vector length
+                                                   gtint_t,   // incx: stride size for x
+                                                   gtint_t,   // incy: stride size for y
+                                                   dcomplex,  // beta
+                                                   char>> {}; // datatype handed to the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zxpbyvGenericTest); // all instantiations below are guarded by TEST_BLIS_TYPED
+
+// Runs the generic xpbyv test body on randomly generated vectors for each parameter combination.
+TEST_P( zxpbyvGenericTest, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // denotes whether x or conj(x) will be added to beta*y:
+    char conj_x = std::get<0>(GetParam());
+    // vector length:
+    gtint_t n = std::get<1>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<2>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<3>(GetParam());
+    // scalar beta:
+    T beta = std::get<4>(GetParam());
+    // specifies the datatype for the random generators:
+    char datatype = std::get<5>(GetParam());
+
+    // Set the threshold for the errors (twice the value returned by getEpsilon<T>()):
+    double thresh = 2*testinghelpers::getEpsilon<T>();
+    //----------------------------------------------------------
+    //     Call generic test body using those parameters
+    //----------------------------------------------------------
+    test_xpbyv<T>(conj_x, n, incx, incy, beta, thresh, datatype);
+}
+
+// Used to generate a test case name of the form bli_zxpbyv_<n>_<conj>_<incx>_<incy>_b<re>pi<im>_<datatype>.
+// Non-positive values are printed as "m" followed by the absolute value (so 0 is printed as "m0").
+// Beware that we cannot use fp numbers (e.g., 2.3) in the names, so we are only printing int(2.3).
+// This should be enough for debugging purposes. If this poses an issue, please reach out.
+class zxpbyvGenericTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,gtint_t,gtint_t,gtint_t,dcomplex,char>> str) const {
+        char conj     = std::get<0>(str.param);
+        gtint_t n     = std::get<1>(str.param);
+        gtint_t incx  = std::get<2>(str.param);
+        gtint_t incy  = std::get<3>(str.param);
+        dcomplex beta = std::get<4>(str.param);
+        char datatype = std::get<5>(str.param);
+        std::string str_name = "bli_zxpbyv";
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + std::string(&conj, 1);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name += "_" + incx_str;
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name += "_" + incy_str;
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                  beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing for generic and main use of zxpbyv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1)),                                   /*(gtint_t(-5), gtint_t(-17))*/  // stride size for x
+            ::testing::Values(gtint_t(1)),                                   /*(gtint_t(-12), gtint_t(-4))*/  // stride size for y
+            ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}),     // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zxpbyvGenericTestPrint()
+    );
+
+// Test for non-unit increments.
+// Only test very few cases as sanity check.
+// We can modify the values using implementation details.
+INSTANTIATE_TEST_SUITE_P(
+        NonUnitIncrements,
+        zxpbyvGenericTest,
+        ::testing::Combine(
+            ::testing::Values('n', 'c'),                                     // n: use x, c: use conj(x)
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(2), gtint_t(11)),                      /*(gtint_t(-5), gtint_t(-17))*/  // stride size for x
+            ::testing::Values(gtint_t(3), gtint_t(33)),                      /*(gtint_t(-12), gtint_t(-4))*/  // stride size for y
+            ::testing::Values(dcomplex{4.0, 3.1}),                           // beta
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::zxpbyvGenericTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp
new file mode 100644
index 0000000000..8c0cb5200a
--- /dev/null
+++ b/gtestsuite/testsuite/level2/gemv/cgemv_generic.cpp
@@ -0,0 +1,156 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemv.h"
+
+class cgemvTest :   // Value-parameterized gtest fixture for cgemv; the tuple fields are, in order:
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // transa
+                                                   char,      // conjx
+                                                   gtint_t,   // m
+                                                   gtint_t,   // n
+                                                   scomplex,  // alpha
+                                                   scomplex,  // beta
+                                                   gtint_t,   // incx
+                                                   gtint_t,   // incy
+                                                   gtint_t,   // lda increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(cgemvTest, RandomData) {
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameters supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major or column major)
+    char storage = std::get<0>(GetParam());
+    // form of op(A) applied to matrix a: one of n, c, t, h
+    char transa = std::get<1>(GetParam());
+    // form of vector x: n or c
+    char conjx = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // alpha scalar
+    T alpha = std::get<5>(GetParam());
+    // beta scalar
+    T beta = std::get<6>(GetParam());
+    // stride size for x
+    gtint_t incx = std::get<7>(GetParam());
+    // stride size for y
+    gtint_t incy = std::get<8>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    // datatype used by the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Error threshold: 2 * max(m,n) * the value returned by getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call the test body with these parameters
+    //----------------------------------------------------------
+    test_gemv<T>(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype);
+}
+
+class cgemvTestPrint {   // Functor passed to INSTANTIATE_TEST_SUITE_P to build the name of each cgemvTest case.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,scomplex,scomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);   // storage format
+        char transa    = std::get<1>(str.param);
+        char conjx     = std::get<2>(str.param);
+        gtint_t m      = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        scomplex alpha = std::get<5>(str.param);
+        scomplex beta  = std::get<6>(str.param);
+        gtint_t incx   = std::get<7>(str.param);
+        gtint_t incy   = std::get<8>(str.param);
+        gtint_t ld_inc = std::get<9>(str.param);   // lda increment
+        char datatype  = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "cgemv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_cgemv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_cgemv";
+#endif
+        str_name    = str_name + "_" + sfm;   // the prefix above names the interface under test
+        str_name    = str_name + "_" + transa+conjx;
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name    = str_name + "_" + incx_str;   // non-positive strides are written as "m" + |stride|
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name    = str_name + "_a" + alpha_str;   // <re>pi<im>: parts truncated to int, non-positive parts as "m" + |part|
+        str_name    = str_name + "_b" + beta_str;    // same encoding as alpha
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing of cgemv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cgemvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(scomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0)),                       // increment to the leading dimension of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested (i : integer, f : float)
+        ),
+        ::cgemvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp
new file mode 100644
index 0000000000..4fc91b1f46
--- /dev/null
+++ b/gtestsuite/testsuite/level2/gemv/dgemv_generic.cpp
@@ -0,0 +1,154 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemv.h"
+
+class dgemvTest :   // Value-parameterized gtest fixture for dgemv; the tuple fields are, in order:
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // transa
+                                                   char,      // conjx
+                                                   gtint_t,   // m
+                                                   gtint_t,   // n
+                                                   double,    // alpha
+                                                   double,    // beta
+                                                   gtint_t,   // incx
+                                                   gtint_t,   // incy
+                                                   gtint_t,   // lda increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(dgemvTest, RandomData) {
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameters supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major or column major)
+    char storage = std::get<0>(GetParam());
+    // form of op(A) applied to matrix a: one of n, c, t, h
+    char transa = std::get<1>(GetParam());
+    // form of vector x: n or c
+    char conjx = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // alpha scalar
+    T alpha = std::get<5>(GetParam());
+    // beta scalar
+    T beta = std::get<6>(GetParam());
+    // stride size for x
+    gtint_t incx = std::get<7>(GetParam());
+    // stride size for y
+    gtint_t incy = std::get<8>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    // datatype used by the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Error threshold: 2 * max(m,n) * the value returned by getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call the test body with these parameters
+    //----------------------------------------------------------
+    test_gemv<T>(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype);
+}
+
+class dgemvTestPrint {   // Functor passed to INSTANTIATE_TEST_SUITE_P to build the name of each dgemvTest case.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,double,double,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);   // storage format
+        char transa    = std::get<1>(str.param);
+        char conjx     = std::get<2>(str.param);
+        gtint_t m      = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        double alpha   = std::get<5>(str.param);
+        double beta    = std::get<6>(str.param);
+        gtint_t incx   = std::get<7>(str.param);
+        gtint_t incy   = std::get<8>(str.param);
+        gtint_t ld_inc = std::get<9>(str.param);   // lda increment
+        char datatype  = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dgemv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dgemv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dgemv";
+#endif
+        str_name    = str_name + "_" + sfm;   // the prefix above names the interface under test
+        str_name    = str_name + "_" + transa+conjx;
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name    = str_name + "_" + incx_str;   // non-positive strides are written as "m" + |stride|
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name    = str_name + "_a" + alpha_str;   // truncated to int; non-positive values as "m" + |value|
+        str_name    = str_name + "_b" + beta_str;    // same encoding as alpha
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing of dgemv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dgemvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0 ),                                        // alpha
+            ::testing::Values(-1.0 ),                                        // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0)),                       // increment to the leading dimension of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested (i : integer, f : float)
+        ),
+        ::dgemvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/gemv/gemv.h b/gtestsuite/testsuite/level2/gemv/gemv.h
new file mode 100644
index 0000000000..d6cc12f2db
--- /dev/null
+++ b/gtestsuite/testsuite/level2/gemv/gemv.h
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *   y := alpha*A*x + beta*y,   or   y := alpha*A**T*x + beta*y,   or
+ *   y := alpha*A**H*x + beta*y,
+ *
+ * or y := beta * y + alpha * transa(A) * conjx(x) (BLIS_TYPED only)
+ *
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     conjx  specifies the form of xp to be used in
+                         the vector multiplication (BLIS_TYPED only)
+ * @param[in]     m      specifies  the number  of rows  of the  matrix A
+ * @param[in]     n      specifies the number  of columns of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ * @param[in]     xp     specifies pointer which points to the first element of xp
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] yp     specifies pointer which points to the first element of yp
+ * @param[in]     incy   specifies storage spacing between elements of yp.
+ */
+
+template<typename T>   // BLAS wrapper: forwards to sgemv_/dgemv_/cgemv_/zgemv_ according to T.
+static void gemv_( char transa, gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+  T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{
+    if constexpr (std::is_same<T, float>::value)   // transa, sizes and strides are passed by address
+        sgemv_( &transa, &m, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else if constexpr (std::is_same<T, double>::value)
+        dgemv_( &transa, &m, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        cgemv_( &transa, &m, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zgemv_( &transa, &m, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else   // any other T throws std::runtime_error when called
+        throw std::runtime_error("Error in testsuite/level2/gemv.h: Invalid typename in gemv_().");
+}
+
+template<typename T>   // CBLAS wrapper: forwards to cblas_sgemv/dgemv/cgemv/zgemv according to T.
+static void cblas_gemv( char storage, char trans, gtint_t m, gtint_t n, T* alpha,
+    T* ap, gtint_t lda,  T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_TRANSPOSE cblas_trans;
+
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );   // storage char -> CBLAS_ORDER
+    testinghelpers::char_to_cblas_trans( trans, &cblas_trans );     // trans char   -> CBLAS_TRANSPOSE
+
+    if constexpr (std::is_same<T, float>::value)   // real types: alpha and beta are passed by value
+        cblas_sgemv( cblas_order, cblas_trans, m, n, *alpha, ap, lda, xp, incx, *beta, yp, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_dgemv( cblas_order, cblas_trans, m, n, *alpha, ap, lda, xp, incx, *beta, yp, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)   // complex types: alpha and beta are passed by pointer
+        cblas_cgemv( cblas_order, cblas_trans, m, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zgemv( cblas_order, cblas_trans, m, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+    else   // any other T throws std::runtime_error when called
+        throw std::runtime_error("Error in testsuite/level2/gemv.h: Invalid typename in cblas_gemv().");
+}
+
+template<typename T>   // BLIS typed-API wrapper: forwards to bli_sgemv/dgemv/cgemv/zgemv according to T.
+static void typed_gemv(char storage, char trans, char conj_x,
+    gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy)
+{
+    trans_t transa;
+    conj_t  conjx;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_trans( trans, &transa );
+    testinghelpers::char_to_blis_conj ( conj_x, &conjx );
+
+    // a is m x n: lda is the column stride for column-major storage and the row stride for row-major storage.
+    inc_t rsa = 1, csa = 1;
+    if( (storage == 'c') || (storage == 'C') )
+        csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') )
+        rsa = lda ;
+    else   // reject an unknown storage character instead of calling BLIS with rsa == csa == 1
+        throw std::runtime_error("Error in testsuite/level2/gemv.h: Invalid storage in typed_gemv().");
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_sgemv( transa, conjx, m, n, alpha, ap, rsa, csa, xp, incx, beta, yp, incy );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dgemv( transa, conjx, m, n, alpha, ap, rsa, csa, xp, incx, beta, yp, incy );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cgemv( transa, conjx, m, n, alpha, ap, rsa, csa, xp, incx, beta, yp, incy );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zgemv( transa, conjx, m, n, alpha, ap, rsa, csa, xp, incx, beta, yp, incy );
+    else   // any other T throws std::runtime_error when called
+        throw std::runtime_error("Error in testsuite/level2/gemv.h: Invalid typename in typed_gemv().");
+}
+
+template<typename T>   // Test entry point: calls the one interface selected at compile time (TEST_BLAS / TEST_CBLAS / TEST_BLIS_TYPED).
+static void gemv( char storage, char trans, char conj_x, gtint_t m, gtint_t n,
+    T* alpha, T* ap, gtint_t lda, T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )   // the BLAS wrapper is only called for column-major storage
+        gemv_<T>( trans, m, n, alpha, ap, lda, xp, incx, beta, yp, incy );   // conj_x is not forwarded here
+    else
+        throw std::runtime_error("Error in testsuite/level2/gemv.h: BLAS interface cannot be tested for row-major order.");
+#elif TEST_CBLAS
+    cblas_gemv<T>( storage, trans, m, n, alpha, ap, lda, xp, incx, beta, yp, incy );   // conj_x is not forwarded here
+#elif TEST_BLIS_TYPED
+    typed_gemv<T>( storage, trans, conj_x, m, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+#else
+    throw std::runtime_error("Error in testsuite/level2/gemv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp
new file mode 100644
index 0000000000..a6906559eb
--- /dev/null
+++ b/gtestsuite/testsuite/level2/gemv/sgemv_generic.cpp
@@ -0,0 +1,154 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemv.h"
+
+class sgemvTest :   // Value-parameterized gtest fixture for sgemv; the tuple fields are, in order:
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // transa
+                                                   char,      // conjx
+                                                   gtint_t,   // m
+                                                   gtint_t,   // n
+                                                   float,     // alpha
+                                                   float,     // beta
+                                                   gtint_t,   // incx
+                                                   gtint_t,   // incy
+                                                   gtint_t,   // lda increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(sgemvTest, RandomData) {
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameters supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major or column major)
+    char storage = std::get<0>(GetParam());
+    // form of op(A) applied to matrix a: one of n, c, t, h
+    char transa = std::get<1>(GetParam());
+    // form of vector x: n or c
+    char conjx = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // alpha scalar
+    T alpha = std::get<5>(GetParam());
+    // beta scalar
+    T beta = std::get<6>(GetParam());
+    // stride size for x
+    gtint_t incx = std::get<7>(GetParam());
+    // stride size for y
+    gtint_t incy = std::get<8>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    // datatype used by the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Error threshold: 2 * max(m,n) * the value returned by getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call the test body with these parameters
+    //----------------------------------------------------------
+    test_gemv<T>(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype);
+}
+
+class sgemvTestPrint {   // Functor passed to INSTANTIATE_TEST_SUITE_P to build the name of each sgemvTest case.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,float,float,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);   // storage format
+        char transa    = std::get<1>(str.param);
+        char conjx     = std::get<2>(str.param);
+        gtint_t m      = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        float alpha    = std::get<5>(str.param);
+        float beta     = std::get<6>(str.param);
+        gtint_t incx   = std::get<7>(str.param);
+        gtint_t incy   = std::get<8>(str.param);
+        gtint_t ld_inc = std::get<9>(str.param);   // lda increment
+        char datatype  = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "sgemv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_sgemv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_sgemv";
+#endif
+        str_name    = str_name + "_" + sfm;   // the prefix above names the interface under test
+        str_name    = str_name + "_" + transa+conjx;
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name    = str_name + "_" + incx_str;   // non-positive strides are written as "m" + |stride|
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name    = str_name + "_a" + alpha_str;   // truncated to int; non-positive values as "m" + |value|
+        str_name    = str_name + "_b" + beta_str;    // same encoding as alpha
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing of sgemv.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sgemvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0 ),                                        // alpha
+            ::testing::Values(-1.0 ),                                        // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0)),           // increment to the leading dimension of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested (i : integer, f : float)
+        ),
+        ::sgemvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/gemv/test_gemv.h b/gtestsuite/testsuite/level2/gemv/test_gemv.h
new file mode 100644
index 0000000000..7d3dfc14d6
--- /dev/null
+++ b/gtestsuite/testsuite/level2/gemv/test_gemv.h
@@ -0,0 +1,81 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "gemv.h"
+#include "level2/ref_gemv.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+// Runs gemv on randomly generated A, x and y, then checks the resulting y against testinghelpers::ref_gemv.
+void test_gemv( char storage, char trnsa, char conjx, gtint_t m, gtint_t n,
+    T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy,
+    double thresh, char datatype ) {
+
+    // Leading dimension of A; 'n' is passed because A is generated untransposed (m x n) below.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', m, n, lda_inc);
+
+    // Vector lengths: x has n and y has m elements when chknotrans(trnsa) is true, swapped otherwise.
+    gtint_t lenx = ( testinghelpers::chknotrans( trnsa ) ) ? n : m ;
+    gtint_t leny = ( testinghelpers::chknotrans( trnsa ) ) ? m : n ;
+
+    //----------------------------------------------------------
+    //   Initialize the matrix and the vectors with random data.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(1, 5, storage, 'n', m, n, lda, datatype); // NOTE(review): 1, 5 presumably bound the values - confirm
+    std::vector<T> x = testinghelpers::get_random_vector<T>(1, 3, lenx, incx, datatype); // NOTE(review): 1, 3 presumably bound the values - confirm
+    std::vector<T> y = testinghelpers::get_random_vector<T>(1, 3, leny, incy, datatype);
+
+    // Create a copy of y so that we can check reference results.
+    std::vector<T> y_ref(y);
+    //----------------------------------------------------------
+    //   Call the function under test (the gemv wrapper).
+    //----------------------------------------------------------
+    gemv( storage, trnsa, conjx, m, n, &alpha, a.data(), lda,
+                         x.data(), incx, &beta, y.data(), incy );
+
+    //----------------------------------------------------------
+    //   Call the reference implementation on the copy y_ref.
+    //----------------------------------------------------------
+    testinghelpers::ref_gemv( storage, trnsa, conjx, m, n, alpha, a.data(),
+                         lda, x.data(), incx, beta, y_ref.data(), incy );
+
+    //----------------------------------------------------------
+    //   Check the component-wise error of y against y_ref using thresh.
+    //----------------------------------------------------------
+    computediff<T>( leny, y.data(), y_ref.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp
new file mode 100644
index 0000000000..74d95b5b13
--- /dev/null
+++ b/gtestsuite/testsuite/level2/gemv/zgemv_generic.cpp
@@ -0,0 +1,156 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemv.h"
+
+class zgemvTest :                                             // value-parameterized fixture for dcomplex gemv
+        public ::testing::TestWithParam<std::tuple<char,      // 0: storage format
+                                                   char,      // 1: transa
+                                                   char,      // 2: conjx
+                                                   gtint_t,   // 3: m
+                                                   gtint_t,   // 4: n
+                                                   dcomplex,  // 5: alpha
+                                                   dcomplex,  // 6: beta
+                                                   gtint_t,   // 7: incx
+                                                   gtint_t,   // 8: incy
+                                                   gtint_t,   // 9: lda increment
+                                                   char>> {}; // 10: datatype for the random generators
+
+TEST_P(zgemvTest, RandomData) { // unpacks one parameter tuple and forwards it to test_gemv<dcomplex>
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<1>(GetParam());
+    // denotes whether vector x is n,c
+    char conjx = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // specifies beta value
+    T beta = std::get<6>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<7>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<8>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Error threshold: grows with the larger of m and n, scaled by testinghelpers::getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_gemv<T>(storage, transa, conjx, m, n, alpha, lda_inc, incx, beta, incy, thresh, datatype);
+}
+
+class zgemvTestPrint { // functor given to INSTANTIATE_TEST_SUITE_P; builds the test-name string for one parameter tuple
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,dcomplex,dcomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);    // storage format
+        char transa    = std::get<1>(str.param);    // transa
+        char conjx     = std::get<2>(str.param);    // conjx
+        gtint_t m      = std::get<3>(str.param);    // matrix size m
+        gtint_t n      = std::get<4>(str.param);    // matrix size n
+        dcomplex alpha = std::get<5>(str.param);    // alpha
+        dcomplex beta  = std::get<6>(str.param);    // beta
+        gtint_t incx   = std::get<7>(str.param);    // stride of x
+        gtint_t incy   = std::get<8>(str.param);    // stride of y
+        gtint_t ld_inc = std::get<9>(str.param);    // lda increment
+        char datatype  = std::get<10>(str.param);   // datatype for the random generators
+#ifdef TEST_BLAS
+        std::string str_name = "zgemv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zgemv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zgemv";
+#endif
+        str_name    = str_name + "_" + sfm;                   // every field below is appended after a "_" separator
+        str_name    = str_name + "_" + transa+conjx;          // transa and conjx form one two-character field
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); // "m" replaces the minus sign; note that 0 also yields "m0"
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); // same encoding as incx
+        str_name    = str_name + "_" + incx_str;
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); // real part, truncated to int
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); // "pi" joins real and imaginary parts
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real)))); // same encoding as alpha
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name    = str_name + "_a" + alpha_str;            // e.g. alpha = {1.0, -2.0} gives "_a1pim2"
+        str_name    = str_name + "_b" + beta_str;             // e.g. beta = {-1.0, 1.0} gives "_bm1pi1"
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black-box instantiation: runs zgemvTest over every combination of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zgemvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is added only when TEST_BLAS is not defined)
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m = 10, 20, 30 (end of Range is exclusive)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n = 10, 20, 30 (end of Range is exclusive)
+            ::testing::Values(dcomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(dcomplex{-1.0, 1.0}),                          // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested (i: integer, f: float)
+        ),
+        ::zgemvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/ger/cger_generic.cpp b/gtestsuite/testsuite/level2/ger/cger_generic.cpp
new file mode 100644
index 0000000000..7dcd4fea70
--- /dev/null
+++ b/gtestsuite/testsuite/level2/ger/cger_generic.cpp
@@ -0,0 +1,148 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_ger.h"
+
+class cgerTest :                                              // value-parameterized fixture for scomplex ger
+        public ::testing::TestWithParam<std::tuple<char,      // 0: storage format
+                                                   char,      // 1: conjx
+                                                   char,      // 2: conjy
+                                                   gtint_t,   // 3: m
+                                                   gtint_t,   // 4: n
+                                                   scomplex,  // 5: alpha
+                                                   gtint_t,   // 6: incx
+                                                   gtint_t,   // 7: incy
+                                                   gtint_t,   // 8: lda increment
+                                                   char>> {}; // 9: datatype for the random generators
+
+TEST_P(cgerTest, RandomData) { // unpacks one parameter tuple and forwards it to test_ger<scomplex>
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // denotes whether vector x is n,c
+    char conjx = std::get<1>(GetParam());
+    // denotes whether vector y is n,c
+    char conjy = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<6>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<7>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Error threshold: grows with the larger of m and n, scaled by testinghelpers::getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_ger<T>(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+class cgerTestPrint { // functor given to INSTANTIATE_TEST_SUITE_P; builds the test-name string for one parameter tuple
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,scomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);    // storage format
+        char conjx     = std::get<1>(str.param);    // conjx
+        char conjy     = std::get<2>(str.param);    // conjy
+        gtint_t m      = std::get<3>(str.param);    // matrix size m
+        gtint_t n      = std::get<4>(str.param);    // matrix size n
+        scomplex alpha = std::get<5>(str.param);    // alpha
+        gtint_t incx   = std::get<6>(str.param);    // stride of x
+        gtint_t incy   = std::get<7>(str.param);    // stride of y
+        gtint_t ld_inc = std::get<8>(str.param);    // lda increment
+        char datatype  = std::get<9>(str.param);    // datatype for the random generators
+#ifdef TEST_BLAS
+        std::string str_name = "cger_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_cger";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_cger";
+#endif
+        str_name    = str_name + "_" + sfm;                   // every field below is appended after a "_" separator
+        str_name    = str_name + "_" + conjx+conjy;           // conjx and conjy form one two-character field
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); // "m" replaces the minus sign; note that 0 also yields "m0"
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); // same encoding as incx
+        str_name    = str_name + "_" + incx_str;
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real)))); // real part, truncated to int
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag))))); // "pi" joins real and imaginary parts
+        str_name    = str_name + "_a" + alpha_str;            // e.g. alpha = {1.0, -2.0} gives "_a1pim2"
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black-box instantiation: runs cgerTest over every combination of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cgerTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is added only when TEST_BLAS is not defined)
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n','c'),                                      // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m = 10, 20, 30 (end of Range is exclusive)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n = 10, 20, 30 (end of Range is exclusive)
+            ::testing::Values(scomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested (i: integer, f: float)
+        ),
+        ::cgerTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/ger/dger_generic.cpp b/gtestsuite/testsuite/level2/ger/dger_generic.cpp
new file mode 100644
index 0000000000..043a165407
--- /dev/null
+++ b/gtestsuite/testsuite/level2/ger/dger_generic.cpp
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_ger.h"
+
+class dgerTest :                                              // value-parameterized fixture for double ger
+        public ::testing::TestWithParam<std::tuple<char,      // 0: storage format
+                                                   char,      // 1: conjx
+                                                   char,      // 2: conjy
+                                                   gtint_t,   // 3: m
+                                                   gtint_t,   // 4: n
+                                                   double,    // 5: alpha
+                                                   gtint_t,   // 6: incx
+                                                   gtint_t,   // 7: incy
+                                                   gtint_t,   // 8: lda increment
+                                                   char>> {}; // 9: datatype for the random generators
+
+TEST_P(dgerTest, RandomData) { // unpacks one parameter tuple and forwards it to test_ger<double>
+    using T = double;
+
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // denotes whether vector x is n,c
+    char conjx = std::get<1>(GetParam());
+    // denotes whether vector y is n,c
+    char conjy = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<6>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<7>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Error threshold: grows with the larger of m and n, scaled by testinghelpers::getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_ger<T>(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+class dgerTestPrint { // functor given to INSTANTIATE_TEST_SUITE_P; builds the test-name string for one parameter tuple
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,double,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);    // storage format
+        char conjx     = std::get<1>(str.param);    // conjx
+        char conjy     = std::get<2>(str.param);    // conjy
+        gtint_t m      = std::get<3>(str.param);    // matrix size m
+        gtint_t n      = std::get<4>(str.param);    // matrix size n
+        double alpha   = std::get<5>(str.param);    // alpha
+        gtint_t incx   = std::get<6>(str.param);    // stride of x
+        gtint_t incy   = std::get<7>(str.param);    // stride of y
+        gtint_t ld_inc = std::get<8>(str.param);    // lda increment
+        char datatype  = std::get<9>(str.param);    // datatype for the random generators
+#ifdef TEST_BLAS
+        std::string str_name = "dger_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dger";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dger";
+#endif
+        str_name    = str_name + "_" + sfm;                   // every field below is appended after a "_" separator
+        str_name    = str_name + "_" + conjx+conjy;           // conjx and conjy form one two-character field
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)); // "m" replaces the minus sign; note that 0 also yields "m0"
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)); // same encoding as incx
+        str_name    = str_name + "_" + incx_str;
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))); // truncated to int; "m" replaces the minus sign
+        str_name    = str_name + "_a" + alpha_str;            // e.g. alpha = 1.0 gives "_a1"
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black-box instantiation: runs dgerTest over every combination of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dgerTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is added only when TEST_BLAS is not defined)
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n'),                                          // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m = 10, 20, 30 (end of Range is exclusive)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n = 10, 20, 30 (end of Range is exclusive)
+            ::testing::Values( 1.0 ),                                        // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested (i: integer, f: float)
+        ),
+        ::dgerTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/ger/ger.h b/gtestsuite/testsuite/level2/ger/ger.h
new file mode 100644
index 0000000000..c6747f6c7a
--- /dev/null
+++ b/gtestsuite/testsuite/level2/ger/ger.h
@@ -0,0 +1,158 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *          A := alpha*x*y**T + A,
+ *       or A := A + alpha * conjx(x) * conjy(y)^T (BLIS_TYPED only)
+ * @param[in]     storage  matrix storage format, 'c'/'C' or 'r'/'R' (not a parameter of the BLAS wrapper ger_)
+ * @param[in]     conjx  specifies the form of xp to be used in the vector multiplication (BLIS_TYPED only)
+ * @param[in]     conjy  specifies the form of yp to be used in the vector multiplication;
+                         the BLAS/CBLAS complex paths use it to choose gerc over geru
+ * @param[in]     m      specifies  the number  of rows  of the  matrix A
+ * @param[in]     n      specifies the number  of columns of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     xp     specifies pointer which points to the first element of xp
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ * @param[in]     yp     specifies pointer which points to the first element of yp
+ * @param[in]     incy   specifies storage spacing between elements of yp.
+ * @param[in,out] ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ */
+
+template<typename T>
+static void ger_( char conjy, gtint_t m, gtint_t n, T* alpha,
+    T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    if constexpr (std::is_same<T, float>::value)   // BLAS entry points take every scalar argument by address
+        sger_( &m, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    else if constexpr (std::is_same<T, double>::value)
+        dger_( &m, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    else if constexpr (std::is_same<T, scomplex>::value) {
+      if( testinghelpers::chkconj( conjy ) )   // complex types: cgerc_ when chkconj(conjy) holds, cgeru_ otherwise
+        cgerc_( &m, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+      else
+        cgeru_( &m, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    }
+    else if constexpr (std::is_same<T, dcomplex>::value) {
+      if( testinghelpers::chkconj( conjy ) )   // same selection for double complex: zgerc_ vs. zgeru_
+        zgerc_( &m, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+      else
+        zgeru_( &m, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    }
+    else   // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level2/ger.h: Invalid typename in ger_().");
+}
+
+template<typename T>   // CBLAS wrapper: dispatches on T to cblas_{s,d}ger or cblas_{c,z}ger{c,u}
+static void cblas_ger( char storage, char conjy, gtint_t m, gtint_t n,
+    T* alpha, T* xp, gtint_t incx,T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    enum CBLAS_ORDER cblas_order;   // filled in from the storage character by the helper below
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );
+
+    if constexpr (std::is_same<T, float>::value)   // real types: alpha is dereferenced and passed by value
+        cblas_sger( cblas_order, m, n, *alpha, xp, incx, yp, incy, ap, lda );
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_dger( cblas_order, m, n, *alpha, xp, incx, yp, incy, ap, lda );
+    else if constexpr (std::is_same<T, scomplex>::value) {   // complex types: alpha is passed as a pointer
+      if( testinghelpers::chkconj( conjy ) )   // cblas_cgerc when chkconj(conjy) holds, cblas_cgeru otherwise
+        cblas_cgerc( cblas_order, m, n, alpha, xp, incx, yp, incy, ap, lda );
+      else
+        cblas_cgeru( cblas_order, m, n, alpha, xp, incx, yp, incy, ap, lda );
+    }
+    else if constexpr (std::is_same<T, dcomplex>::value) {
+      if( testinghelpers::chkconj( conjy ) )   // same selection for double complex: cblas_zgerc vs. cblas_zgeru
+        cblas_zgerc( cblas_order, m, n, alpha, xp, incx, yp, incy, ap, lda );
+      else
+        cblas_zgeru( cblas_order, m, n, alpha, xp, incx, yp, incy, ap, lda );
+    }
+    else   // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level2/ger.h: Invalid typename in cblas_ger().");
+}
+
+template<typename T>   // BLIS typed-API wrapper: dispatches on T to bli_{s,d,c,z}ger
+static void typed_ger(char storage, char conj_x, char conj_y, gtint_t m, gtint_t n,
+         T* alpha, T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    conj_t  conjx;
+    conj_t  conjy;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj ( conj_x, &conjx );
+    testinghelpers::char_to_blis_conj ( conj_y, &conjy );
+
+    dim_t rsa,csa;   // row and column strides of A handed to bli_?ger
+
+    rsa=csa=1;       // default both strides to 1; exactly one is replaced by lda below
+    /* a = m x n   */
+    if( (storage == 'c') || (storage == 'C') )   // column-major: column stride is lda
+        csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') )   // row-major: row stride is lda
+        rsa = lda ;   // NOTE(review): any other storage character silently leaves rsa = csa = 1
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_sger( conjx, conjy, m, n, alpha, xp, incx, yp, incy, ap, rsa, csa );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dger( conjx, conjy, m, n, alpha, xp, incx, yp, incy, ap, rsa, csa );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cger( conjx, conjy, m, n, alpha, xp, incx, yp, incy, ap, rsa, csa );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zger( conjx, conjy, m, n, alpha, xp, incx, yp, incy, ap, rsa, csa );
+    else   // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level2/ger.h: Invalid typename in typed_ger().");
+}
+
+template<typename T>   // entry point used by the tests: forwards to the interface selected at compile time
+static void ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n,
+    T* alpha, T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )   // the BLAS path accepts column-major only; conjx is not forwarded
+        ger_<T>( conjy, m, n, alpha, xp, incx, yp, incy, ap, lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/ger.h: BLAS interface cannot be tested for row-major order.");
+#elif TEST_CBLAS
+    cblas_ger<T>( storage, conjy, m, n, alpha, xp, incx, yp, incy, ap, lda );   // conjx is not forwarded here either
+#elif TEST_BLIS_TYPED
+    typed_ger<T>( storage, conjx, conjy, m, n, alpha, xp, incx, yp, incy, ap, lda );   // only this path uses conjx
+#else
+    throw std::runtime_error("Error in testsuite/level2/ger.h: No interfaces are set to be tested.");   // none of the three macros is set
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/ger/sger_generic.cpp b/gtestsuite/testsuite/level2/ger/sger_generic.cpp
new file mode 100644
index 0000000000..113dee0342
--- /dev/null
+++ b/gtestsuite/testsuite/level2/ger/sger_generic.cpp
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_ger.h"
+
+class sgerTest : // Parameterized gtest fixture; the tuple fields are read by index in TEST_P(sgerTest, RandomData).
+        public ::testing::TestWithParam<std::tuple<char,      // 0: storage
+                                                   char,      // 1: conjx
+                                                   char,      // 2: conjy
+                                                   gtint_t,   // 3: m
+                                                   gtint_t,   // 4: n
+                                                   float,     // 5: alpha
+                                                   gtint_t,   // 6: incx
+                                                   gtint_t,   // 7: incy
+                                                   gtint_t,   // 8: lda_inc
+                                                   char>> {}; // 9: datatype
+
+TEST_P(sgerTest, RandomData) { // Unpacks one parameter tuple and runs the shared ger test body with T = float.
+    using T = float;
+
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // denotes whether vector x is n,c
+    char conjx = std::get<1>(GetParam());
+    // denotes whether vector y is n,c
+    char conjy = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<6>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<7>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors: scales with max(m,n) and testinghelpers::getEpsilon<T>().
+    double thresh = 4*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_ger<T>(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+class sgerTestPrint {
+public:
+    // Builds the test name: <api>_<storage>_<conjx><conjy>_<m>_<n>_<incx>_<incy>_a<alpha>_<lda_inc>_<datatype>.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,float,gtint_t,gtint_t,gtint_t,char>> str) const {
+        const char    storage_tag = std::get<0>(str.param);
+        const char    conjx_tag   = std::get<1>(str.param);
+        const char    conjy_tag   = std::get<2>(str.param);
+        const gtint_t rows        = std::get<3>(str.param);
+        const gtint_t cols        = std::get<4>(str.param);
+        const float   alpha_val   = std::get<5>(str.param);
+        const gtint_t x_stride    = std::get<6>(str.param);
+        const gtint_t y_stride    = std::get<7>(str.param);
+        const gtint_t lda_extra   = std::get<8>(str.param);
+        const char    data_tag    = std::get<9>(str.param);
+        // The prefix identifies the interface selected at compile time.
+#ifdef TEST_BLAS
+        std::string name = "sger_";
+#elif TEST_CBLAS
+        std::string name = "cblas_sger";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_sger";
+#endif
+        // Non-positive strides (and alpha, truncated to an integer) are rendered as 'm' followed by their magnitude.
+        auto stride_tag = [](gtint_t inc) { return ( inc > 0 ) ? std::to_string(inc) : "m" + std::to_string(std::abs(inc)); };
+        const std::string alpha_tag = ( alpha_val > 0 ) ? std::to_string(int(alpha_val)) : "m" + std::to_string(int(std::abs(alpha_val)));
+        name += std::string("_") + storage_tag;
+        name += std::string("_") + conjx_tag + conjy_tag;
+        name += "_" + std::to_string(rows) + "_" + std::to_string(cols);
+        name += "_" + stride_tag(x_stride) + "_" + stride_tag(y_stride);
+        name += "_a" + alpha_tag;
+        name += "_" + std::to_string(lda_extra);
+        name += std::string("_") + data_tag;
+        return name;
+    }
+};
+
+// Black box testing: column-major is always exercised, row-major too unless TEST_BLAS is defined.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sgerTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n'),                                          // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0 ),                                        // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ),
+        ::sgerTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/ger/test_ger.h b/gtestsuite/testsuite/level2/ger/test_ger.h
new file mode 100644
index 0000000000..a85a13a7e9
--- /dev/null
+++ b/gtestsuite/testsuite/level2/ger/test_ger.h
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "ger.h"
+#include "level2/ref_ger.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+// Runs ger on randomly generated a, x and y, then hands the result and the reference result to computediff with thresh.
+void test_ger( char storage, char conjx, char conjy, gtint_t m, gtint_t n,
+    T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh,
+    char datatype ) {
+
+    // Compute the leading dimensions for matrix size calculation.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', m, n, lda_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrix and vectors with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 5, storage, 'n', m, n, lda, datatype);
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-3, 3, m, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-3, 3, n, incy, datatype);
+
+    // Create a copy of a so that we can check reference results.
+    std::vector<T> a_ref(a);
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    ger( storage, conjx, conjy, m, n, &alpha, x.data(), incx,
+                                              y.data(), incy, a.data(), lda );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_ger( storage, conjx, conjy, m, n, alpha,
+                          x.data(), incx, y.data(), incy, a_ref.data(), lda );
+
+    //----------------------------------------------------------
+    //              check component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, n, a.data(), a_ref.data(), lda, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/ger/zger_generic.cpp b/gtestsuite/testsuite/level2/ger/zger_generic.cpp
new file mode 100644
index 0000000000..0f32161eaa
--- /dev/null
+++ b/gtestsuite/testsuite/level2/ger/zger_generic.cpp
@@ -0,0 +1,148 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_ger.h"
+
+class zgerTest : // Parameterized gtest fixture; the tuple fields are read by index in TEST_P(zgerTest, RandomData).
+        public ::testing::TestWithParam<std::tuple<char,      // 0: storage
+                                                   char,      // 1: conjx
+                                                   char,      // 2: conjy
+                                                   gtint_t,   // 3: m
+                                                   gtint_t,   // 4: n
+                                                   dcomplex,  // 5: alpha
+                                                   gtint_t,   // 6: incx
+                                                   gtint_t,   // 7: incy
+                                                   gtint_t,   // 8: lda_inc
+                                                   char>> {}; // 9: datatype
+
+TEST_P(zgerTest, RandomData) { // Unpacks one parameter tuple and runs the shared ger test body with T = dcomplex.
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // denotes whether vector x is n,c
+    char conjx = std::get<1>(GetParam());
+    // denotes whether vector y is n,c
+    char conjy = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<6>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<7>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors: scales with max(m,n) and testinghelpers::getEpsilon<T>().
+    double thresh = 2*std::max(m,n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_ger<T>(storage, conjx, conjy, m, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+class zgerTestPrint { // Name generator passed to INSTANTIATE_TEST_SUITE_P: turns a parameter tuple into a test name.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,dcomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param); // storage format
+        char conjx     = std::get<1>(str.param);
+        char conjy     = std::get<2>(str.param);
+        gtint_t m      = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        dcomplex alpha = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t incy   = std::get<7>(str.param);
+        gtint_t ld_inc = std::get<8>(str.param);
+        char datatype  = std::get<9>(str.param);
+#ifdef TEST_BLAS // the name prefix identifies the interface selected at compile time
+        std::string str_name = "zger_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zger";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zger";
+#endif
+        str_name    = str_name + "_" + sfm;
+        str_name    = str_name + "_" + conjx+conjy;
+        str_name    = str_name + "_" + std::to_string(m);
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name    = str_name + "_" + incx_str; // non-positive strides appear as 'm' followed by the magnitude
+        str_name    = str_name + "_" + incy_str;
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name    = str_name + "_a" + alpha_str; // alpha appears as <real>pi<imag>, each part truncated to an integer
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: column-major is always exercised, row-major too unless TEST_BLAS is defined.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zgerTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n','c'),                                      // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(dcomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ),
+        ::zgerTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp
new file mode 100644
index 0000000000..ed650d0229
--- /dev/null
+++ b/gtestsuite/testsuite/level2/hemv/chemv_generic.cpp
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_hemv.h"
+
+class chemvTest : // Parameterized gtest fixture; the tuple fields are read by index in TEST_P(chemvTest, RandomData).
+        public ::testing::TestWithParam<std::tuple<char,      // 0: storage
+                                                   char,      // 1: uploa
+                                                   char,      // 2: conja
+                                                   char,      // 3: conjx
+                                                   gtint_t,   // 4: n
+                                                   scomplex,  // 5: alpha
+                                                   scomplex,  // 6: beta
+                                                   gtint_t,   // 7: incx
+                                                   gtint_t,   // 8: incy
+                                                   gtint_t,   // 9: lda_inc
+                                                   char>> {}; // 10: datatype
+
+TEST_P(chemvTest, RandomData) { // Unpacks one parameter tuple and runs the shared hemv test body with T = scomplex.
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // denotes whether matrix a is u,l
+    char uploa = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char conja = std::get<2>(GetParam());
+    // denotes whether vector x is n,c
+    char conjx = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // specifies beta value
+    T beta = std::get<6>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<7>(GetParam());
+    // stride size for y:
+    gtint_t incy = std::get<8>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors: scales with sqrt(n) and testinghelpers::getEpsilon<T>().
+    double thresh = 4*std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_hemv<T>(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype); // note the order: lda_inc precedes incx, beta precedes incy
+}
+
+class chemvTestPrint { // Name generator passed to INSTANTIATE_TEST_SUITE_P: turns a parameter tuple into a test name.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,scomplex,scomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param); // storage format
+        char uploa     = std::get<1>(str.param);
+        char conja     = std::get<2>(str.param);
+        char conjx     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        scomplex alpha = std::get<5>(str.param);
+        scomplex beta  = std::get<6>(str.param);
+        gtint_t incx   = std::get<7>(str.param);
+        gtint_t incy   = std::get<8>(str.param);
+        gtint_t ld_inc = std::get<9>(str.param);
+        char datatype  = std::get<10>(str.param);
+#ifdef TEST_BLAS // the name prefix identifies the interface selected at compile time
+        std::string str_name = "chemv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_chemv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_chemv";
+#endif
+        str_name    = str_name + "_" + sfm;
+        str_name    = str_name + "_" + uploa+conja+conjx;
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name    = str_name + "_a" + alpha_str; // scalars appear as <real>pi<imag>, each part truncated to an integer
+        str_name    = str_name + "_b" + beta_str;  // beta gets its own "_b" tag so it is not mistaken for a second alpha
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name    = str_name + "_" + incx_str; // non-positive strides appear as 'm' followed by the magnitude
+        str_name    = str_name + "_" + incy_str;
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: column-major is always exercised, row-major too unless TEST_BLAS is defined.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        chemvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(scomplex{2.0, -1.0}),                          // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ),
+        ::chemvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/hemv/hemv.h b/gtestsuite/testsuite/level2/hemv/hemv.h
new file mode 100644
index 0000000000..90086336a7
--- /dev/null
+++ b/gtestsuite/testsuite/level2/hemv/hemv.h
@@ -0,0 +1,138 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        y := alpha*A*x + beta*y
+ *     or y := beta * y + alpha * conja(A) * conjx(x)
+ * @param[in]     conja  specifies the form of A to be used in
+                         the matrix-vector multiplication (BLIS_TYPED only)
+ * @param[in]     conjx  specifies the form of xp to be used in
+                         the matrix-vector multiplication (BLIS_TYPED only)
+ * @param[in]     storage specifies the form of storage in the memory matrix A
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of the array A is referenced
+ * @param[in]     n      specifies the number  of rows  of the  matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ * @param[in]     xp     specifies pointer which points to the first element of xp
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] yp     specifies pointer which points to the first element of yp
+ * @param[in]     incy   specifies storage spacing between elements of yp.
+ */
+
+template<typename T> // BLAS-interface hemv wrapper: selects chemv_ or zhemv_ from T at compile time.
+static void hemv_( char uploa, gtint_t n, T* alpha, T* ap, gtint_t lda,
+                    T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{
+    if constexpr (std::is_same<T, scomplex>::value)
+        chemv_( &uploa, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy ); // uploa, n, lda, incx and incy are passed by address
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zhemv_( &uploa, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else // any T other than scomplex/dcomplex ends up here
+        throw std::runtime_error("Error in testsuite/level2/hemv.h: Invalid typename in hemv_().");
+}
+
+template<typename T>
+static void cblas_hemv( char storage, char uploa, gtint_t n, T* alpha,
+    T* ap, gtint_t lda, T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{
+    // Translate the character flags into CBLAS enumerators, then dispatch on T.
+    enum CBLAS_ORDER layout;
+    testinghelpers::char_to_cblas_order( storage, &layout );
+    enum CBLAS_UPLO triangle;
+    testinghelpers::char_to_cblas_uplo( uploa, &triangle );
+
+    if constexpr ( std::is_same<T, scomplex>::value )
+        cblas_chemv( layout, triangle, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+    else if constexpr ( std::is_same<T, dcomplex>::value )
+        cblas_zhemv( layout, triangle, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+    else
+        throw std::runtime_error("Error in testsuite/level2/hemv.h: Invalid typename in cblas_hemv().");
+}
+
+template<typename T>
+static void typed_hemv( char storage, char uplo, char conj_a, char conj_x,
+    gtint_t n, T* alpha, T* a, gtint_t lda, T* x, gtint_t incx, T* beta,
+    T* y, gtint_t incy )
+{
+    // Convert the character flags into the BLIS constants expected by bli_?hemv.
+    uplo_t blis_uplo;
+    conj_t blis_conja;
+    conj_t blis_conjx;
+    testinghelpers::char_to_blis_uplo ( uplo, &blis_uplo );
+    testinghelpers::char_to_blis_conj ( conj_a, &blis_conja );
+    testinghelpers::char_to_blis_conj ( conj_x, &blis_conjx );
+
+    // a is n x n. The stride matching the storage order takes lda;
+    // the other one (or both, for an unrecognised storage character) stays 1.
+    const bool is_col_major = ( storage == 'c' ) || ( storage == 'C' );
+    const bool is_row_major = ( storage == 'r' ) || ( storage == 'R' );
+    dim_t rsa = 1;
+    dim_t csa = 1;
+    if( is_col_major )      csa = lda;
+    else if( is_row_major ) rsa = lda;
+
+    if constexpr ( std::is_same<T, scomplex>::value )
+        bli_chemv( blis_uplo, blis_conja, blis_conjx, n, alpha, a, rsa, csa, x, incx, beta, y, incy );
+    else if constexpr ( std::is_same<T, dcomplex>::value )
+        bli_zhemv( blis_uplo, blis_conja, blis_conjx, n, alpha, a, rsa, csa, x, incx, beta, y, incy );
+    else
+        throw std::runtime_error("Error in testsuite/level2/hemv.h: Invalid typename in typed_hemv().");
+}
+
+template<typename T> // Dispatches a hemv call to the single interface chosen at compile time via TEST_* macros.
+static void hemv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T* alpha, T* ap, gtint_t lda, T* xp, gtint_t incx, T* beta, T* yp,
+    gtint_t incy )
+{
+#ifdef TEST_BLAS // BLAS path: accepts column-major storage only; storage, conja and conjx are not forwarded.
+    if( storage == 'c' || storage == 'C' )
+        hemv_<T>( uploa, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+    else
+        throw std::runtime_error("Error in testsuite/level2/hemv.h: BLAS interface cannot be tested for row-major order.");
+#elif TEST_CBLAS // CBLAS path: storage is forwarded; conja and conjx are not.
+    cblas_hemv<T>( storage, uploa, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+#elif TEST_BLIS_TYPED // BLIS typed path: storage, conja and conjx are all forwarded.
+    typed_hemv<T>( storage, uploa, conja, conjx, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+#else // No interface macro defined: report it at run time.
+    throw std::runtime_error("Error in testsuite/level2/hemv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/hemv/test_hemv.h b/gtestsuite/testsuite/level2/hemv/test_hemv.h
new file mode 100644
index 0000000000..8f8357e96e
--- /dev/null
+++ b/gtestsuite/testsuite/level2/hemv/test_hemv.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "hemv.h"
+#include "level2/ref_hemv.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_hemv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy,
+    double thresh, char datatype ) {
+
+    // Leading dimension of the n x n matrix a, derived from n and lda_inc.
+    const gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc );
+
+    //----------------------------------------------------------
+    //   Fill a, x and y with random data (datatype is forwarded).
+    //----------------------------------------------------------
+    std::vector<T> a_mat = testinghelpers::get_random_matrix<T>( -2, 5, storage, 'n', n, n, lda, datatype );
+    std::vector<T> x_vec = testinghelpers::get_random_vector<T>( -3, 3, n, incx, datatype );
+    std::vector<T> y_out = testinghelpers::get_random_vector<T>( -3, 3, n, incy, datatype );
+
+    mkherm<T>( storage, uploa, n, a_mat.data(), lda );
+    mktrim<T>( storage, uploa, n, a_mat.data(), lda );
+
+    // Keep a copy of y for the reference implementation to update.
+    std::vector<T> y_expected( y_out );
+    //----------------------------------------------------------
+    //                  Call the routine under test
+    //----------------------------------------------------------
+    hemv<T>( storage, uploa, conja, conjx, n, &alpha, a_mat.data(), lda,
+             x_vec.data(), incx, &beta, y_out.data(), incy );
+
+    //----------------------------------------------------------
+    //                  Call the reference implementation
+    //----------------------------------------------------------
+    testinghelpers::ref_hemv<T>( storage, uploa, conja, conjx, n, &alpha,
+             a_mat.data(), lda, x_vec.data(), incx, &beta, y_expected.data(), incy );
+
+    //----------------------------------------------------------
+    //        Compare both y vectors against the threshold
+    //----------------------------------------------------------
+    computediff<T>( n, y_out.data(), y_expected.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp
new file mode 100644
index 0000000000..1f60f25468
--- /dev/null
+++ b/gtestsuite/testsuite/level2/hemv/zhemv_generic.cpp
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_hemv.h"
+
+class zhemvTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage
+                                                   char,       // uploa
+                                                   char,       // conja
+                                                   char,       // conjx
+                                                   gtint_t,    // n
+                                                   dcomplex,   // alpha
+                                                   dcomplex,   // beta
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // incy
+                                                   gtint_t,    // lda_inc
+                                                   char>> {};  // datatype
+
+TEST_P(zhemvTest, RandomData) {
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Unpack the tuple supplied by INSTANTIATE_TEST_SUITE_P.
+    //----------------------------------------------------------
+    const auto& param = GetParam();
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(param);
+    // upper/lower selector for matrix a (u,l)
+    char uploa = std::get<1>(param);
+    // conjugation flag for matrix a (n,c)
+    char conja = std::get<2>(param);
+    // conjugation flag for vector x (n,c)
+    char conjx = std::get<3>(param);
+    // matrix size n
+    gtint_t n  = std::get<4>(param);
+    // scalar alpha
+    T alpha = std::get<5>(param);
+    // scalar beta
+    T beta = std::get<6>(param);
+    // stride of x
+    gtint_t incx = std::get<7>(param);
+    // stride of y
+    gtint_t incy = std::get<8>(param);
+    // lda increment:
+    //   zero     -> the array size matches the matrix size,
+    //   positive -> the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(param);
+    // datatype selector for the random generators
+    char datatype   = std::get<10>(param);
+
+    // Error threshold, scaled with sqrt(n) and the epsilon of T.
+    double thresh = 8*std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_hemv<T>(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype);
+}
+
+// Generates the test-name suffix for one zhemv parameter combination;
+// non-positive numbers are written with an "m" prefix instead of a sign.
+class zhemvTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,dcomplex,dcomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char conja     = std::get<2>(str.param);
+        char conjx     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        dcomplex alpha = std::get<5>(str.param);
+        dcomplex beta  = std::get<6>(str.param);
+        gtint_t incx   = std::get<7>(str.param);
+        gtint_t incy   = std::get<8>(str.param);
+        gtint_t ld_inc = std::get<9>(str.param);
+        char datatype  = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zhemv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zhemv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zhemv";
+#endif
+        str_name    = str_name + "_" + sfm;
+        str_name    = str_name + "_" + uploa+conja+conjx;
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        // alpha is tagged "_a" and beta "_b" so the two scalars can be told apart in the name.
+        str_name    = str_name + "_a" + alpha_str + "_b" + beta_str;
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        std::string incy_str = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+        str_name    = str_name + "_" + incx_str + "_" + incy_str;
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zhemvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format: 'c' always, 'r' only when TEST_BLAS is not defined
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(dcomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(dcomplex{2.0, -1.0}),                          // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::zhemvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/her/cher_generic.cpp b/gtestsuite/testsuite/level2/her/cher_generic.cpp
new file mode 100644
index 0000000000..2805f17f23
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her/cher_generic.cpp
@@ -0,0 +1,134 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_her.h"
+
+class cherTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage
+                                                   char,       // uploa
+                                                   char,       // conjx
+                                                   gtint_t,    // n
+                                                   float,      // alpha
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // lda_inc
+                                                   char>> {};  // datatype
+
+TEST_P(cherTest, RandomData) {
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Unpack the tuple supplied by INSTANTIATE_TEST_SUITE_P.
+    //----------------------------------------------------------
+    const auto& param = GetParam();
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(param);
+    // upper/lower selector for matrix a (u,l)
+    char uploa = std::get<1>(param);
+    // conjugation flag for vector x (n,c)
+    char conjx = std::get<2>(param);
+    // matrix size n
+    gtint_t n  = std::get<3>(param);
+    // scalar alpha
+    float alpha = std::get<4>(param);
+    // stride of x
+    gtint_t incx = std::get<5>(param);
+    // lda increment:
+    //   zero     -> the array size matches the matrix size,
+    //   positive -> the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<6>(param);
+    // datatype selector for the random generators
+    char datatype   = std::get<7>(param);
+
+    // Error threshold, scaled with sqrt(n) and the epsilon of T.
+    double thresh = 4*std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_her<T, float>(storage, uploa, conjx, n, alpha, incx, lda_inc, thresh, datatype);
+}
+
+class cherTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,float,gtint_t,gtint_t,char>> str) const {
+        const char    storage  = std::get<0>(str.param);
+        const char    uplo     = std::get<1>(str.param);
+        const char    conj     = std::get<2>(str.param);
+        const gtint_t size     = std::get<3>(str.param);
+        const float   scale    = std::get<4>(str.param);
+        const gtint_t stride_x = std::get<5>(str.param);
+        const gtint_t lda_pad  = std::get<6>(str.param);
+        const char    elem     = std::get<7>(str.param);
+#ifdef TEST_BLAS
+        std::string test_name = "cher_";
+#elif TEST_CBLAS
+        std::string test_name = "cblas_cher";
+#else  //#elif TEST_BLIS_TYPED
+        std::string test_name = "blis_cher";
+#endif
+        test_name += std::string("_") + storage;
+        test_name += std::string("_") + uplo + conj;
+        test_name += "_" + std::to_string(size);
+        const std::string stride_tag = ( stride_x > 0 ) ? std::to_string(stride_x) : "m" + std::to_string(std::abs(stride_x));  // "m" marks non-positive
+        test_name += "_" + stride_tag;
+        const std::string scale_tag = ( scale > 0 ) ? std::to_string(int(scale)) : ("m" + std::to_string(int(std::abs(scale))));
+        test_name += "_a" + scale_tag;
+        test_name += "_" + std::to_string(lda_pad);
+        test_name += std::string("_") + elem;
+        return test_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cherTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format: 'c' always, 'r' only when TEST_BLAS is not defined
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(1.0),                                          // alpha (the fixture stores it as float)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::cherTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/her/her.h b/gtestsuite/testsuite/level2/her/her.h
new file mode 100644
index 0000000000..ea7d3008c7
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her/her.h
@@ -0,0 +1,126 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *           A := alpha*x*x**H + A
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of the array A is used
+ * @param[in]     n      specifies the order (number of rows and columns) of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     xp     specifies pointer which points to the first element of xp
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ * @param[in,out] ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ */
+
+template<typename T, typename Tr>
+static void her_( char uploa, gtint_t n, Tr* alpha, T* xp, gtint_t incx,
+                                                  T* ap, gtint_t lda )
+{   // uploa, n, incx and lda are handed to the BLAS routine by address.
+    if constexpr (std::is_same_v<T, dcomplex>)
+        zher_( &uploa, &n, alpha, xp, &incx, ap, &lda );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cher_( &uploa, &n, alpha, xp, &incx, ap, &lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/her.h: Invalid typename in her_().");
+}
+
+template<typename T, typename Tr>
+static void cblas_her( char storage, char uploa, gtint_t n, Tr* alpha,
+                            T* xp, gtint_t incx, T* ap, gtint_t lda )
+{
+    // Convert the character flags into the CBLAS enumerations first.
+    enum CBLAS_ORDER order;
+    testinghelpers::char_to_cblas_order( storage, &order );
+    enum CBLAS_UPLO triangle;
+    testinghelpers::char_to_cblas_uplo( uploa, &triangle );
+
+    if constexpr (std::is_same_v<T, dcomplex>)
+        cblas_zher( order, triangle, n, *alpha, xp, incx, ap, lda );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cblas_cher( order, triangle, n, *alpha, xp, incx, ap, lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/her.h: Invalid typename in cblas_her().");
+}
+
+template<typename T, typename Tr>
+static void typed_her( char storage, char uplo, char conj_x, gtint_t n,
+                    Tr* alpha, T* x, gtint_t incx, T* a, gtint_t lda )
+{
+    // Translate the character flags into their BLIS enum counterparts.
+    uplo_t blis_uplo;
+    conj_t blis_conjx;
+    testinghelpers::char_to_blis_uplo ( uplo, &blis_uplo );
+    testinghelpers::char_to_blis_conj ( conj_x, &blis_conjx );
+
+    // a is n x n: both strides start at 1 and the storage character decides
+    // which one becomes lda (neither does for an unrecognised character).
+    const bool col_major = ( storage == 'c' ) || ( storage == 'C' );
+    const bool row_major = ( storage == 'r' ) || ( storage == 'R' );
+    dim_t rsa = 1;
+    dim_t csa = 1;
+    if( col_major )
+        csa = lda ;
+    else if( row_major )
+        rsa = lda ;
+
+    if constexpr (std::is_same_v<T, scomplex>)
+        bli_cher( blis_uplo, blis_conjx, n, alpha, x, incx, a, rsa, csa );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        bli_zher( blis_uplo, blis_conjx, n, alpha, x, incx, a, rsa, csa );
+    else
+        throw std::runtime_error("Error in testsuite/level2/her.h: Invalid typename in typed_her().");
+}
+
+template<typename T, typename Tr>
+static void her( char storage, char uploa, char conj_x, gtint_t n,
+                    Tr* alpha, T* xp, gtint_t incx, T* ap, gtint_t lda )
+{   // Forward to whichever interface was selected at compile time.
+#ifdef TEST_BLAS
+    // her_ is called without a storage argument, so only column-major is accepted.
+    if( storage != 'c' && storage != 'C' )
+        throw std::runtime_error("Error in testsuite/level2/her.h: BLAS interface cannot be tested for row-major order.");
+    her_<T>( uploa, n, alpha, xp, incx, ap, lda );
+#elif TEST_CBLAS
+    cblas_her<T>( storage, uploa, n, alpha, xp, incx, ap, lda );
+#elif TEST_BLIS_TYPED
+    typed_her<T>( storage, uploa, conj_x, n, alpha, xp, incx, ap, lda );
+#else
+    throw std::runtime_error("Error in testsuite/level2/her.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/her/test_her.h b/gtestsuite/testsuite/level2/her/test_her.h
new file mode 100644
index 0000000000..ad8a351eb1
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her/test_her.h
@@ -0,0 +1,76 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "her.h"
+#include "level2/ref_her.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T, typename Tr>
+void test_her( char storage, char uploa, char conjx, gtint_t n, Tr alpha,
+               gtint_t incx, gtint_t lda_inc, double thresh, char datatype ) {
+
+    // Leading dimension of the n x n matrix a, derived from n and lda_inc.
+    const gtint_t lda = testinghelpers::get_leading_dimension( storage, 'n', n, n, lda_inc );
+
+    //----------------------------------------------------------
+    //   Fill a and x with random data (datatype is forwarded).
+    //----------------------------------------------------------
+    std::vector<T> a_mat = testinghelpers::get_random_matrix<T>( -2, 5, storage, 'n', n, n, lda, datatype );
+    std::vector<T> x_vec = testinghelpers::get_random_vector<T>( -3, 3, n, incx, datatype );
+
+    mktrim<T>( storage, uploa, n, a_mat.data(), lda );
+
+    // Keep a copy of a for the reference implementation to update.
+    std::vector<T> a_expected( a_mat );
+    //----------------------------------------------------------
+    //                  Call the routine under test
+    //----------------------------------------------------------
+    her<T,Tr>( storage, uploa, conjx, n, &alpha, x_vec.data(), incx, a_mat.data(), lda );
+
+    //----------------------------------------------------------
+    //                  Call the reference implementation
+    //----------------------------------------------------------
+    testinghelpers::ref_her<T,Tr>( storage, uploa, conjx, n, alpha,
+                                   x_vec.data(), incx, a_expected.data(), lda );
+
+    //----------------------------------------------------------
+    //        Compare both matrices against the threshold
+    //----------------------------------------------------------
+    computediff<T>( storage, n, n, a_mat.data(), a_expected.data(), lda, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/her/zher_generic.cpp b/gtestsuite/testsuite/level2/her/zher_generic.cpp
new file mode 100644
index 0000000000..902820d3ca
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her/zher_generic.cpp
@@ -0,0 +1,134 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_her.h"
+
+class zherTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage
+                                                   char,       // uploa
+                                                   char,       // conjx
+                                                   gtint_t,    // n
+                                                   double,     // alpha
+                                                   gtint_t,    // incx
+                                                   gtint_t,    // lda_inc
+                                                   char>> {};  // datatype
+
+TEST_P(zherTest, RandomData) {
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Unpack the tuple supplied by INSTANTIATE_TEST_SUITE_P.
+    //----------------------------------------------------------
+    const auto& param = GetParam();
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(param);
+    // upper/lower selector for matrix a (u,l)
+    char uploa = std::get<1>(param);
+    // conjugation flag for vector x (n,c)
+    char conjx = std::get<2>(param);
+    // matrix size n
+    gtint_t n  = std::get<3>(param);
+    // scalar alpha
+    double alpha = std::get<4>(param);
+    // stride of x
+    gtint_t incx = std::get<5>(param);
+    // lda increment:
+    //   zero     -> the array size matches the matrix size,
+    //   positive -> the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<6>(param);
+    // datatype selector for the random generators
+    char datatype   = std::get<7>(param);
+
+    // Error threshold, scaled with sqrt(n) and the epsilon of T.
+    double thresh = 4*std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_her<T, double>(storage, uploa, conjx, n, alpha, incx, lda_inc, thresh, datatype);
+}
+
+class zherTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,double,gtint_t,gtint_t,char>> str) const {
+        const char    storage  = std::get<0>(str.param);
+        const char    uplo     = std::get<1>(str.param);
+        const char    conj     = std::get<2>(str.param);
+        const gtint_t size     = std::get<3>(str.param);
+        const double  scale    = std::get<4>(str.param);
+        const gtint_t stride_x = std::get<5>(str.param);
+        const gtint_t lda_pad  = std::get<6>(str.param);
+        const char    elem     = std::get<7>(str.param);
+#ifdef TEST_BLAS
+        std::string test_name = "zher_";
+#elif TEST_CBLAS
+        std::string test_name = "cblas_zher";
+#else  //#elif TEST_BLIS_TYPED
+        std::string test_name = "blis_zher";
+#endif
+        test_name += std::string("_") + storage;
+        test_name += std::string("_") + uplo + conj;
+        test_name += "_" + std::to_string(size);
+        const std::string stride_tag = ( stride_x > 0 ) ? std::to_string(stride_x) : "m" + std::to_string(std::abs(stride_x));  // "m" marks non-positive
+        test_name += "_" + stride_tag;
+        const std::string scale_tag = ( scale > 0 ) ? std::to_string(int(scale)) : ("m" + std::to_string(int(std::abs(scale))));
+        test_name += "_a" + scale_tag;
+        test_name += "_" + std::to_string(lda_pad);
+        test_name += std::string("_") + elem;
+        return test_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zherTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format: 'c' always, 'r' only when TEST_BLAS is not defined
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(1.0),                                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::zherTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/her2/cher2_generic.cpp b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp
new file mode 100644
index 0000000000..7c7f16bf72
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her2/cher2_generic.cpp
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_her2.h"
+
+class cher2Test :                                                      // value-parameterized fixture for the cher2 tests
+        public ::testing::TestWithParam<std::tuple<char,              // storage format
+                                                   char,              // uploa
+                                                   char,              // conjx
+                                                   char,              // conjy
+                                                   gtint_t,           // n
+                                                   scomplex,          // alpha
+                                                   gtint_t,           // incx
+                                                   gtint_t,           // incy
+                                                   gtint_t,           // lda increment
+                                                   char>> {};         // datatype
+
+TEST_P(cher2Test, RandomData) {
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format ('r' : row major, 'c' : column major)
+    char storage = std::get<0>(GetParam());
+    // uplo selector for matrix a ('u' or 'l')
+    char uploa = std::get<1>(GetParam());
+    // conjugation selector for vector x ('n' or 'c')
+    char conjx = std::get<2>(GetParam());
+    // conjugation selector for vector y ('n' or 'c')
+    char conjy = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // scalar alpha
+    T alpha = std::get<5>(GetParam());
+    // stride size for x
+    gtint_t incx = std::get<6>(GetParam());
+    // stride size for y
+    gtint_t incy = std::get<7>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    // datatype character forwarded to the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Error threshold: 4*n times the epsilon reported for T.
+    double thresh = 4*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_her2<T>(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+class cher2TestPrint {
+public:
+    // Generates the readable name of one cher2 test instance from its parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,scomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Non-positive values are printed as "m" followed by their magnitude.
+        auto real_tag = [](float v) -> std::string {
+            if ( v > 0 ) return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+        auto int_tag = [](gtint_t v) -> std::string {
+            if ( v > 0 ) return std::to_string(v);
+            return "m" + std::to_string(std::abs(v));
+        };
+        const auto& p = str.param;
+        const scomplex alpha = std::get<5>(p);
+        // The prefix identifies the interface under test.
+#ifdef TEST_BLAS
+        std::string name = "cher2_";
+#elif TEST_CBLAS
+        std::string name = "cblas_cher2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_cher2";
+#endif
+        name += std::string("_") + std::get<0>(p);                                      // storage format
+        name += std::string("_") + std::get<1>(p) + std::get<2>(p) + std::get<3>(p);    // uploa, conjx, conjy
+        name += "_" + std::to_string(std::get<4>(p));                                   // n
+        name += "_a" + real_tag(alpha.real) + "pi" + real_tag(alpha.imag);              // alpha as <real>pi<imag>
+        name += "_" + int_tag(std::get<6>(p));                                          // incx
+        name += "_" + int_tag(std::get<7>(p));                                          // incy
+        name += "_" + std::to_string(std::get<8>(p));                                   // lda increment
+        name += std::string("_") + std::get<9>(p);                                      // datatype
+        return name;
+    }
+};
+
+// Black box testing: Cartesian product (::testing::Combine) of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cher2Test,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n'),                                          // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::cher2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/her2/her2.h b/gtestsuite/testsuite/level2/her2/her2.h
new file mode 100644
index 0000000000..759b2d90d2
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her2/her2.h
@@ -0,0 +1,131 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *   A := alpha*x*y**H + conj(alpha)*y*x**H + A,
+ * @param[in]     storage specifies the form of storage in the memory matrix A
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of the array A
+ * @param[in]     n      specifies the number  of rows  of the  matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     xp     specifies pointer which points to the first element of xp
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ * @param[in]     yp     specifies pointer which points to the first element of yp
+ * @param[in]     incy   specifies storage spacing between elements of yp.
+ * @param[in,out] ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ */
+
+// BLAS (Fortran-style) interface: forwards to cher2_/zher2_, passing scalar arguments by address.
+template<typename T>
+static void her2_( char uploa, gtint_t n, T* alpha, T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    if constexpr (std::is_same<T, dcomplex>::value) {
+        zher2_( &uploa, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    } else if constexpr (std::is_same<T, scomplex>::value) {
+        cher2_( &uploa, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    } else
+        throw std::runtime_error("Error in testsuite/level2/her2.h: Invalid typename in her2_().");
+}
+
+// CBLAS interface: maps the storage/uplo characters through testinghelpers, then forwards.
+template<typename T>
+static void cblas_her2( char storage, char uploa, gtint_t n, T* alpha,
+       T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    enum CBLAS_ORDER order_enum;
+    enum CBLAS_UPLO  uplo_enum;
+    testinghelpers::char_to_cblas_order( storage, &order_enum );
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo_enum );
+
+    if constexpr (std::is_same<T, dcomplex>::value) {
+        cblas_zher2( order_enum, uplo_enum, n, alpha, xp, incx, yp, incy, ap, lda );
+    } else if constexpr (std::is_same<T, scomplex>::value) {
+        cblas_cher2( order_enum, uplo_enum, n, alpha, xp, incx, yp, incy, ap, lda );
+    } else
+        throw std::runtime_error("Error in testsuite/level2/her2.h: Invalid typename in cblas_her2().");
+}
+
+// BLIS typed interface: maps the option characters to BLIS constants, turns lda into a
+// (row stride, column stride) pair and forwards to bli_cher2/bli_zher2.
+template<typename T>
+static void typed_her2( char storage, char uplo, char conj_x, char conj_y,
+    gtint_t n, T* alpha, T* x, gtint_t incx, T* y, gtint_t incy,
+    T* a, gtint_t lda )
+{
+    uplo_t uploa;
+    conj_t conjx;
+    conj_t conjy;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_uplo ( uplo, &uploa );
+    testinghelpers::char_to_blis_conj ( conj_x, &conjx );
+    testinghelpers::char_to_blis_conj ( conj_y, &conjy );
+
+    // a is n x n. Strides use inc_t, the stride type of the BLIS typed API. lda becomes the
+    // column stride for 'c'/'C' and the row stride for 'r'/'R'; otherwise both stay 1.
+    inc_t rsa = 1, csa = 1;
+    if( (storage == 'c') || (storage == 'C') )      csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') ) rsa = lda ;
+
+    if constexpr (std::is_same<T, scomplex>::value)
+        bli_cher2( uploa, conjx, conjy, n, alpha, x, incx, y, incy, a, rsa, csa );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zher2( uploa, conjx, conjy, n, alpha, x, incx, y, incy, a, rsa, csa );
+    else
+        throw std::runtime_error("Error in testsuite/level2/her2.h: Invalid typename in typed_her2().");
+}
+
+template<typename T>
+static void her2( char storage, char uploa, char conj_x, char conj_y, gtint_t n,
+      T* alpha, T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{   // Entry point used by the tests: routes to the interface selected at compile time.
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )   // the BLAS wrapper is only called for column-major storage
+        her2_<T>( uploa, n, alpha, xp, incx, yp, incy, ap, lda );   // storage, conj_x and conj_y are not forwarded
+    else
+        throw std::runtime_error("Error in testsuite/level2/her2.h: BLAS interface cannot be tested for row-major order.");
+#elif TEST_CBLAS
+    cblas_her2<T>( storage, uploa, n, alpha, xp, incx, yp, incy, ap, lda );   // conj_x and conj_y are not forwarded
+#elif TEST_BLIS_TYPED
+    typed_her2<T>( storage, uploa, conj_x, conj_y, n, alpha, xp, incx, yp, incy, ap, lda );
+#else
+    throw std::runtime_error("Error in testsuite/level2/her2.h: No interfaces are set to be tested.");   // no TEST_* macro is set
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/her2/test_her2.h b/gtestsuite/testsuite/level2/her2/test_her2.h
new file mode 100644
index 0000000000..10814b90db
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her2/test_her2.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "her2.h"
+#include "level2/ref_her2.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_her2( char storage, char uploa, char conjx, char conjy, gtint_t n,
+    T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh,
+    char datatype ) {
+
+    // Compute the leading dimension of a.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc);
+
+    //----------------------------------------------------------
+    //        Initialize the matrix and vectors with random data.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 5, storage, 'n', n, n, lda, datatype);
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-3, 3, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-2, 5, n, incy, datatype);
+
+    mkherm<T>( storage, uploa, n, a.data(), lda );   // presumably makes a Hermitian - confirm against inc/utils.h
+    mktrim<T>( storage, uploa, n, a.data(), lda );   // presumably keeps only the uploa triangle - confirm
+
+    // Create a copy of a so that we can check reference results.
+    std::vector<T> a_ref(a);
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    her2<T>( storage, uploa, conjx, conjy, n, &alpha, x.data(), incx,
+                                              y.data(), incy, a.data(), lda );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_her2<T>( storage, uploa, conjx, conjy, n, &alpha,
+                           x.data(), incx, y.data(), incy, a_ref.data(), lda );
+
+    //----------------------------------------------------------
+    //              Check the component-wise error of a against a_ref.
+    //----------------------------------------------------------
+    computediff<T>( storage, n, n, a.data(), a_ref.data(), lda, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/her2/zher2_generic.cpp b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp
new file mode 100644
index 0000000000..c7bc0bcd9a
--- /dev/null
+++ b/gtestsuite/testsuite/level2/her2/zher2_generic.cpp
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_her2.h"
+
+class zher2Test :                                                      // value-parameterized fixture for the zher2 tests
+        public ::testing::TestWithParam<std::tuple<char,              // storage format
+                                                   char,              // uploa
+                                                   char,              // conjx
+                                                   char,              // conjy
+                                                   gtint_t,           // n
+                                                   dcomplex,          // alpha
+                                                   gtint_t,           // incx
+                                                   gtint_t,           // incy
+                                                   gtint_t,           // lda increment
+                                                   char>> {};         // datatype
+
+TEST_P(zher2Test, RandomData) {
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format ('r' : row major, 'c' : column major)
+    char storage = std::get<0>(GetParam());
+    // uplo selector for matrix a ('u' or 'l')
+    char uploa = std::get<1>(GetParam());
+    // conjugation selector for vector x ('n' or 'c')
+    char conjx = std::get<2>(GetParam());
+    // conjugation selector for vector y ('n' or 'c')
+    char conjy = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // scalar alpha
+    T alpha = std::get<5>(GetParam());
+    // stride size for x
+    gtint_t incx = std::get<6>(GetParam());
+    // stride size for y
+    gtint_t incy = std::get<7>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    // datatype character forwarded to the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Error threshold: 6*sqrt(n) times the epsilon reported for T.
+    double thresh = 6*std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_her2<T>(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+class zher2TestPrint {
+public:
+    // Generates the readable name of one zher2 test instance from its parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,dcomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Non-positive values are printed as "m" followed by their magnitude.
+        auto real_tag = [](double v) -> std::string {
+            if ( v > 0 ) return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+        auto int_tag = [](gtint_t v) -> std::string {
+            if ( v > 0 ) return std::to_string(v);
+            return "m" + std::to_string(std::abs(v));
+        };
+        const auto& p = str.param;
+        const dcomplex alpha = std::get<5>(p);
+        // The prefix identifies the interface under test.
+#ifdef TEST_BLAS
+        std::string name = "zher2_";
+#elif TEST_CBLAS
+        std::string name = "cblas_zher2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_zher2";
+#endif
+        name += std::string("_") + std::get<0>(p);                                      // storage format
+        name += std::string("_") + std::get<1>(p) + std::get<2>(p) + std::get<3>(p);    // uploa, conjx, conjy
+        name += "_" + std::to_string(std::get<4>(p));                                   // n
+        name += "_a" + real_tag(alpha.real) + "pi" + real_tag(alpha.imag);              // alpha as <real>pi<imag>
+        name += "_" + int_tag(std::get<6>(p));                                          // incx
+        name += "_" + int_tag(std::get<7>(p));                                          // incy
+        name += "_" + std::to_string(std::get<8>(p));                                   // lda increment
+        name += std::string("_") + std::get<9>(p);                                      // datatype
+        return name;
+    }
+};
+
+// Black box testing: Cartesian product (::testing::Combine) of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zher2Test,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n'),                                          // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(dcomplex{1.0, -2.0}),                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::zher2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp
new file mode 100644
index 0000000000..a8ca008deb
--- /dev/null
+++ b/gtestsuite/testsuite/level2/symv/dsymv_generic.cpp
@@ -0,0 +1,153 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_symv.h"
+
+class dsymvTest :                                                      // value-parameterized fixture for the dsymv tests
+        public ::testing::TestWithParam<std::tuple<char,              // storage format
+                                                   char,              // uploa
+                                                   char,              // conja
+                                                   char,              // conjx
+                                                   gtint_t,           // n
+                                                   double,            // alpha
+                                                   double,            // beta
+                                                   gtint_t,           // incx
+                                                   gtint_t,           // incy
+                                                   gtint_t,           // lda increment
+                                                   char>> {};         // datatype
+
+TEST_P(dsymvTest, RandomData) {
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format ('r' : row major, 'c' : column major)
+    char storage = std::get<0>(GetParam());
+    // uplo selector for matrix a ('u' or 'l')
+    char uploa = std::get<1>(GetParam());
+    // conjugation selector for matrix a ('n' or 'c')
+    char conja = std::get<2>(GetParam());
+    // conjugation selector for vector x ('n' or 'c')
+    char conjx = std::get<3>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<4>(GetParam());
+    // scalar alpha
+    T alpha = std::get<5>(GetParam());
+    // scalar beta
+    T beta = std::get<6>(GetParam());
+    // stride size for x
+    gtint_t incx = std::get<7>(GetParam());
+    // stride size for y
+    gtint_t incy = std::get<8>(GetParam());
+    // lda increment.
+    // If the increment is zero, the array size matches the matrix size.
+    // If the increment is positive, the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    // datatype character forwarded to the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Error threshold: 10*n times the epsilon reported for T.
+    double thresh = 10*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_symv<T>(storage, uploa, conja, conjx, n, alpha, lda_inc, incx, beta, incy, thresh, datatype);
+}
+
+class dsymvTestPrint {
+public:
+    // Builds the readable name of one dsymv test instance from its parameter tuple;
+    // non-positive scalars and strides are printed as "m" followed by their magnitude.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,double,double,gtint_t,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char conja     = std::get<2>(str.param);
+        char conjx     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        double alpha   = std::get<5>(str.param);
+        double beta    = std::get<6>(str.param);
+        gtint_t incx   = std::get<7>(str.param);
+        gtint_t incy   = std::get<8>(str.param);
+        gtint_t ld_inc = std::get<9>(str.param);
+        char datatype  = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dsymv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dsymv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dsymv";
+#endif
+        str_name    = str_name + "_" + sfm;
+        str_name    = str_name + "_" + uploa+conja+conjx;
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : ("m" + std::to_string(int(std::abs(beta))));
+        str_name    = str_name + "_a" + alpha_str;
+        str_name    = str_name + "_b" + beta_str;   // beta has its own tag so it is not read as a second alpha
+        str_name    = str_name + "_" + (( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        str_name    = str_name + "_" + (( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy)));
+        str_name    = str_name + "_" + std::to_string(ld_inc);
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: Cartesian product (::testing::Combine) of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dsymvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0, -2.0 ),                                  // alpha
+            ::testing::Values( 2.0, -1.0 ),                                  // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the test data (i : integer, f : float)
+        ),
+        ::dsymvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp
new file mode 100644
index 0000000000..498a7b89c9
--- /dev/null
+++ b/gtestsuite/testsuite/level2/symv/ssymv_generic.cpp
@@ -0,0 +1,153 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_symv.h"
+
+// Value-parameterized fixture for the ssymv tests. The parameter is a
+// tuple whose fields are read, in this order, by the RandomData test
+// and by ssymvTestPrint.
+class ssymvTest :
+        public ::testing::TestWithParam<
+            std::tuple<char, char, char, char,   // storage, uploa, conja, conjx
+                       gtint_t,                  // n
+                       float, float,             // alpha, beta
+                       gtint_t, gtint_t,         // incx, incy
+                       gtint_t,                  // lda_inc
+                       char>                     // datatype
+        > {};
+
+TEST_P(ssymvTest, RandomData) {
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameters passed through test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   storage  - matrix storage format (row major, column major)
+    //   uploa    - denotes whether matrix a is u,l
+    //   conja    - denotes whether matrix a is n,c
+    //   conjx    - denotes whether vector x is n,c
+    //   n        - matrix size
+    //   alpha    - alpha value
+    //   beta     - beta value
+    //   incx     - stride size for x
+    //   incy     - stride size for y
+    //   lda_inc  - increment to the leading dimension of a:
+    //              zero     -> the array size matches the matrix size,
+    //              positive -> the array size is bigger than the matrix size
+    //   datatype - datatype used by the random generators
+    //----------------------------------------------------------
+    const auto [storage, uploa, conja, conjx, n, alpha, beta,
+                incx, incy, lda_inc, datatype] = GetParam();
+
+    // Error threshold: proportional to the matrix size n and to
+    // the epsilon that testinghelpers::getEpsilon reports for the
+    // tested type T.
+    const double thresh = 10*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    // Run the test body with these parameters. test_symv takes
+    // them in a different order than the tuple stores them:
+    // lda_inc and incx directly follow alpha.
+    //----------------------------------------------------------
+    test_symv<T>(
+        storage, uploa, conja, conjx, n,
+        alpha, lda_inc, incx,
+        beta, incy,
+        thresh, datatype
+    );
+}
+
+// Generates the name of each ssymv test instance from its parameters.
+class ssymvTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,float,float,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Unpack the tuple in declaration order.
+        const auto [sfm, uploa, conja, conjx, n, alpha, beta,
+                    incx, incy, ld_inc, datatype] = str.param;
+
+        // The prefix names the interface under test.
+#ifdef TEST_BLAS
+        std::string str_name = "ssymv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ssymv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ssymv";
+#endif
+        // Values that are not strictly positive are written with an "m"
+        // prefix followed by their magnitude; alpha and beta are
+        // truncated to integers.
+        const std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        const std::string beta_str  = ( beta > 0) ? std::to_string(int(beta)) : ("m" + std::to_string(int(std::abs(beta))));
+        const std::string incx_str  = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        const std::string incy_str  = ( incy > 0) ? std::to_string(incy) : "m" + std::to_string(std::abs(incy));
+
+        str_name += std::string("_") + sfm;
+        str_name += std::string("_") + uploa + conja + conjx;
+        str_name += "_" + std::to_string(n);
+        str_name += "_a" + alpha_str;
+        // Tag beta with "_b" so that it is distinguishable from alpha.
+        str_name += "_b" + beta_str;
+        str_name += "_" + incx_str;
+        str_name += "_" + incy_str;
+        str_name += "_" + std::to_string(ld_inc);
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing. The storage list is built outside the macro call: directives inside macro arguments are undefined behavior.
+#ifdef TEST_BLAS
+static const auto ssymv_storage_formats = ::testing::Values('c');      // BLAS interface: column-major only
+#else
+static const auto ssymv_storage_formats = ::testing::Values('c','r');
+#endif
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox, ssymvTest,
+        ::testing::Combine(
+            ssymv_storage_formats,                                           // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values( 1.0, -2.0 ),                                  // alpha
+            ::testing::Values( 2.0, -1.0 ),                                  // beta
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ssymvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/symv/symv.h b/gtestsuite/testsuite/level2/symv/symv.h
new file mode 100644
index 0000000000..2d77b25de4
--- /dev/null
+++ b/gtestsuite/testsuite/level2/symv/symv.h
@@ -0,0 +1,133 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        y := alpha*A*x + beta*y
+ * @param[in]     storage specifies the form of storage in the memory matrix A
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of the array A is referenced
+ * @param[in]     n      specifies the order (number of rows and columns) of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ * @param[in]     xp     specifies pointer which points to the first element of xp
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] yp     specifies pointer which points to the first element of yp
+ * @param[in]     incy   specifies storage spacing between elements of yp.
+ */
+
+template<typename T>
+static void symv_( char uploa, gtint_t n, T* alpha, T* ap, gtint_t lda,
+                    T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{   // Dispatch on T to the BLAS routine; uploa, n, lda, incx and incy are passed by address.
+    if constexpr (std::is_same_v<T, double>)
+        dsymv_( &uploa, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else if constexpr (std::is_same_v<T, float>)
+        ssymv_( &uploa, &n, alpha, ap, &lda, xp, &incx, beta, yp, &incy );
+    else  // any other element type is rejected at run time
+        throw std::runtime_error("Error in testsuite/level2/symv.h: Invalid typename in symv_().");
+}
+
+template<typename T>
+static void cblas_symv( char storage, char uploa, gtint_t n, T* alpha,
+    T* ap, gtint_t lda, T* xp, gtint_t incx, T* beta, T* yp, gtint_t incy )
+{
+    // Translate the character flags into their CBLAS enum counterparts.
+    enum CBLAS_ORDER order;
+    testinghelpers::char_to_cblas_order( storage, &order );
+    enum CBLAS_UPLO uplo;
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo );
+
+    // The CBLAS calls receive alpha and beta by value, hence the dereference.
+    if constexpr (std::is_same_v<T, double>)
+        cblas_dsymv( order, uplo, n, *alpha, ap, lda, xp, incx, *beta, yp, incy );
+    else if constexpr (std::is_same_v<T, float>)
+        cblas_ssymv( order, uplo, n, *alpha, ap, lda, xp, incx, *beta, yp, incy );
+    else throw std::runtime_error("Error in testsuite/level2/symv.h: Invalid typename in cblas_symv().");
+}
+
+template<typename T>
+static void typed_symv( char storage, char uplo, char conj_a, char conj_x,
+    gtint_t n, T* alpha, T* a, gtint_t lda, T* x, gtint_t incx, T* beta,
+    T* y, gtint_t incy )
+{
+    // Convert the character flags to the corresponding BLIS constants.
+    uplo_t blis_uplo;
+    testinghelpers::char_to_blis_uplo ( uplo, &blis_uplo );
+    conj_t blis_conja;
+    testinghelpers::char_to_blis_conj ( conj_a, &blis_conja );
+    conj_t blis_conjx;
+    testinghelpers::char_to_blis_conj ( conj_x, &blis_conjx );
+
+    // a is n x n and is described by a row stride and a column stride:
+    // lda becomes the column stride for column-major storage and the
+    // row stride for row-major storage; the other stride (and both, for
+    // any other storage character) stays 1.
+    const bool is_col_major = (storage == 'c') || (storage == 'C');
+    const bool is_row_major = (storage == 'r') || (storage == 'R');
+    const dim_t csa = is_col_major ? lda : 1;
+    const dim_t rsa = is_row_major ? lda : 1;
+
+    if constexpr (std::is_same_v<T, double>)
+        bli_dsymv( blis_uplo, blis_conja, blis_conjx, n, alpha, a, rsa, csa, x, incx, beta, y, incy );
+    else if constexpr (std::is_same_v<T, float>)
+        bli_ssymv( blis_uplo, blis_conja, blis_conjx, n, alpha, a, rsa, csa, x, incx, beta, y, incy );
+    else
+        throw std::runtime_error("Error in testsuite/level2/symv.h: Invalid typename in typed_symv().");
+}
+
+template<typename T>
+static void symv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T* alpha, T* ap, gtint_t lda, T* xp, gtint_t incx, T* beta, T* yp,
+    gtint_t incy )
+{   // Forward to the single interface selected at compile time.
+#ifdef TEST_BLAS
+    // Guard first: only column-major storage is accepted for the BLAS interface.
+    if( !(storage == 'c' || storage == 'C') )
+        throw std::runtime_error("Error in testsuite/level2/symv.h: BLAS interface cannot be tested for row-major order.");
+    symv_<T>( uploa, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+#elif TEST_CBLAS
+    cblas_symv<T>( storage, uploa, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+#elif TEST_BLIS_TYPED
+    typed_symv<T>( storage, uploa, conja, conjx, n, alpha, ap, lda, xp, incx, beta, yp, incy );
+#else
+    throw std::runtime_error("Error in testsuite/level2/symv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/symv/test_symv.h b/gtestsuite/testsuite/level2/symv/test_symv.h
new file mode 100644
index 0000000000..22c556d346
--- /dev/null
+++ b/gtestsuite/testsuite/level2/symv/test_symv.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "symv.h"
+#include "level2/ref_symv.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+// Runs one symv test case for type T: fills a, x and y with random
+// values, calls the interface under test and the reference
+// implementation with the same arguments, and compares both y vectors.
+// lda_inc is the increment handed to get_leading_dimension() for a.
+template<typename T>
+void test_symv( char storage, char uploa, char conja, char conjx, gtint_t n,
+    T alpha, gtint_t lda_inc, gtint_t incx, T beta, gtint_t incy,
+    double thresh, char datatype ) {
+
+    // Compute the leading dimension of a.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc);
+
+    // Initialize the matrix and the vectors with random numbers; the
+    // generators receive a value range and the requested datatype.
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 5, storage, 'n', n, n, lda, datatype);
+    std::vector<T> x = testinghelpers::get_random_vector<T>(-3, 3, n, incx, datatype);
+    std::vector<T> y = testinghelpers::get_random_vector<T>(-2, 5, n, incy, datatype);
+
+    // NOTE(review): presumably these make a symmetric and then keep only
+    // the uploa triangle -- confirm against the helpers' definitions.
+    mksymm<T>( storage, uploa, n, a.data(), lda );
+    mktrim<T>( storage, uploa, n, a.data(), lda );
+
+    // Create a copy of y so that we can check reference results.
+    std::vector<T> y_ref(y);
+    // Call the interface under test through the symv() wrapper.
+    symv<T>( storage, uploa, conja, conjx, n, &alpha, a.data(), lda,
+                                  x.data(), incx, &beta, y.data(), incy );
+
+    // Call the reference implementation on the copy of y.
+    testinghelpers::ref_symv<T>( storage, uploa, conja, conjx, n, &alpha,
+                 a.data(), lda, x.data(), incx, &beta, y_ref.data(), incy );
+
+    // Check the component-wise error between the computed y and the
+    // reference y against thresh.
+    computediff<T>( n, y.data(), y_ref.data(), incy, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp
new file mode 100644
index 0000000000..d80e990298
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr/dsyr_generic.cpp
@@ -0,0 +1,134 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr.h"
+
+// Value-parameterized fixture for the dsyr tests. Tuple fields, in order:
+//   0 storage, 1 uploa, 2 conjx   (char)
+//   3 n                           (gtint_t)
+//   4 alpha                       (double)
+//   5 incx, 6 lda_inc             (gtint_t)
+//   7 datatype                    (char)
+class dsyrTest :
+        public ::testing::TestWithParam<
+            std::tuple<char, char, char, gtint_t, double, gtint_t, gtint_t, char>> {};
+
+TEST_P(dsyrTest, RandomData) {
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameters passed through test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   storage  - matrix storage format (row major, column major)
+    //   uploa    - denotes whether matrix a is u,l
+    //   conjx    - denotes whether vector x is n,c
+    //   n        - matrix size
+    //   alpha    - alpha value
+    //   incx     - stride size for x
+    //   lda_inc  - increment to the leading dimension of a:
+    //              zero     -> the array size matches the matrix size,
+    //              positive -> the array size is bigger than the matrix size
+    //   datatype - datatype used by the random generators
+    //----------------------------------------------------------
+    const auto [storage, uploa, conjx, n, alpha,
+                incx, lda_inc, datatype] = GetParam();
+
+    // Error threshold: proportional to the matrix size n and to
+    // the epsilon that testinghelpers::getEpsilon reports for the
+    // tested type T.
+    const double thresh = 2*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    // Run the test body with these parameters.
+    //----------------------------------------------------------
+    test_syr<T>(
+        storage, uploa, conjx, n,
+        alpha, incx, lda_inc,
+        thresh, datatype
+    );
+}
+
+// Generates the name of each dsyr test instance from its parameters.
+class dsyrTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,double,gtint_t,gtint_t,char>> str) const {
+        // Unpack the tuple in declaration order.
+        const auto [sfm, uploa, conjx, n, alpha, incx, ld_inc, datatype] = str.param;
+        // The prefix names the interface under test.
+#ifdef TEST_BLAS
+        std::string str_name = "dsyr_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dsyr";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dsyr";
+#endif
+        // Values that are not strictly positive are written with an
+        // "m" prefix followed by their magnitude; alpha is truncated
+        // to an integer.
+        const std::string incx_str  = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        const std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+
+        str_name += std::string("_") + sfm;
+        str_name += std::string("_") + uploa + conjx;
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + incx_str;
+        str_name += "_a" + alpha_str;
+        str_name += "_" + std::to_string(ld_inc);
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing. The storage list is built outside the macro call: directives inside macro arguments are undefined behavior.
+#ifdef TEST_BLAS
+static const auto dsyr_storage_formats = ::testing::Values('c');
+#else
+static const auto dsyr_storage_formats = ::testing::Values('c','r');
+#endif
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox, dsyrTest,
+        ::testing::Combine(
+            dsyr_storage_formats,                                            // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values(1.0),                                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dsyrTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp
new file mode 100644
index 0000000000..9e44b518f6
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr/ssyr_generic.cpp
@@ -0,0 +1,134 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr.h"
+
+// Value-parameterized fixture for the ssyr tests. Tuple fields, in order:
+//   0 storage, 1 uploa, 2 conjx   (char)
+//   3 n                           (gtint_t)
+//   4 alpha                       (float)
+//   5 incx, 6 lda_inc             (gtint_t)
+//   7 datatype                    (char)
+class ssyrTest :
+        public ::testing::TestWithParam<
+            std::tuple<char, char, char, gtint_t, float, gtint_t, gtint_t, char>> {};
+
+TEST_P(ssyrTest, RandomData) {
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameters passed through test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), in tuple order:
+    //   storage  - matrix storage format (row major, column major)
+    //   uploa    - denotes whether matrix a is u,l
+    //   conjx    - denotes whether vector x is n,c
+    //   n        - matrix size
+    //   alpha    - alpha value
+    //   incx     - stride size for x
+    //   lda_inc  - increment to the leading dimension of a:
+    //              zero     -> the array size matches the matrix size,
+    //              positive -> the array size is bigger than the matrix size
+    //   datatype - datatype used by the random generators
+    //----------------------------------------------------------
+    const auto [storage, uploa, conjx, n, alpha,
+                incx, lda_inc, datatype] = GetParam();
+
+    // Error threshold: proportional to the matrix size n and to
+    // the epsilon that testinghelpers::getEpsilon reports for the
+    // tested type T.
+    const double thresh = 2*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    // Run the test body with these parameters.
+    //----------------------------------------------------------
+    test_syr<T>(
+        storage, uploa, conjx, n,
+        alpha, incx, lda_inc,
+        thresh, datatype
+    );
+}
+
+// Generates the name of each ssyr test instance from its parameters.
+class ssyrTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,float,gtint_t,gtint_t,char>> str) const {
+        // Unpack the tuple in declaration order.
+        const auto [sfm, uploa, conjx, n, alpha, incx, ld_inc, datatype] = str.param;
+        // The prefix names the interface under test.
+#ifdef TEST_BLAS
+        std::string str_name = "ssyr_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ssyr";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ssyr";
+#endif
+        // Values that are not strictly positive are written with an
+        // "m" prefix followed by their magnitude; alpha is truncated
+        // to an integer.
+        const std::string incx_str  = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        const std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+
+        str_name += std::string("_") + sfm;
+        str_name += std::string("_") + uploa + conjx;
+        str_name += "_" + std::to_string(n);
+        str_name += "_" + incx_str;
+        str_name += "_a" + alpha_str;
+        str_name += "_" + std::to_string(ld_inc);
+        str_name += std::string("_") + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing. The storage list is built outside the macro call: directives inside macro arguments are undefined behavior.
+#ifdef TEST_BLAS
+static const auto ssyr_storage_formats = ::testing::Values('c');
+#else
+static const auto ssyr_storage_formats = ::testing::Values('c','r');
+#endif
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox, ssyrTest,
+        ::testing::Combine(
+            ssyr_storage_formats,                                            // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values(1.0),                                          // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ssyrTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/syr/syr.h b/gtestsuite/testsuite/level2/syr/syr.h
new file mode 100644
index 0000000000..e16d5c5322
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr/syr.h
@@ -0,0 +1,128 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *          A := alpha*x*x**T + A
+ * @param[in]     storage specifies the storage format of matrix A in memory
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of A is referenced
+ * @param[in]     n      specifies the order (number of rows and columns) of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     xp     specifies pointer which points to the first element of x
+ * @param[in]     incx   specifies storage spacing between elements of x.
+ * @param[in,out] ap     specifies pointer which points to the first element of A
+ * @param[in]     lda    specifies leading dimension of the matrix A.
+ */
+
+
+// BLAS (Fortran-style) wrapper: forwards to ssyr_/dsyr_ according to T; every argument is passed by address.
+template<typename T>
+static void syr_( char uploa, gtint_t n, T* alpha, T* xp, gtint_t incx, T* ap, gtint_t lda )
+{
+    if constexpr (std::is_same_v<T, double>)
+        dsyr_( &uploa, &n, alpha, xp, &incx, ap, &lda );
+    else if constexpr (std::is_same_v<T, float>)
+        ssyr_( &uploa, &n, alpha, xp, &incx, ap, &lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/syr.h: Invalid typename in syr_().");
+}
+
+// CBLAS wrapper: obtains the order/uplo enums via testinghelpers, then calls cblas_ssyr or cblas_dsyr by T.
+template<typename T>
+static void cblas_syr( char storage, char uploa, gtint_t n, T* alpha, T* xp, gtint_t incx, T* ap, gtint_t lda )
+{
+    enum CBLAS_UPLO  uplo_cblas;
+    enum CBLAS_ORDER order_cblas;
+
+    testinghelpers::char_to_cblas_order( storage, &order_cblas );
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo_cblas );
+
+    if constexpr (std::is_same_v<T, double>)
+        cblas_dsyr( order_cblas, uplo_cblas, n, *alpha, xp, incx, ap, lda );
+    else if constexpr (std::is_same_v<T, float>)
+        cblas_ssyr( order_cblas, uplo_cblas, n, *alpha, xp, incx, ap, lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/syr.h: Invalid typename in cblas_syr().");
+}
+
+// BLIS typed-API wrapper: derives the row/column strides of A from storage and lda, then calls bli_ssyr/bli_dsyr.
+template<typename T>
+static void typed_syr( char storage, char uplo, char conj_x, gtint_t n,
+                    T* alpha, T* x, gtint_t incx, T* a, gtint_t lda )
+{
+    uplo_t uploa;
+    conj_t conjx;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_uplo ( uplo, &uploa );
+    testinghelpers::char_to_blis_conj ( conj_x, &conjx );
+
+    // a = n x n: one stride stays 1, the other is lda. Strides are inc_t in the typed API.
+    inc_t rsa = 1, csa = 1;
+    if( (storage == 'c') || (storage == 'C') )
+        csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') )
+        rsa = lda ;
+    else  // Neither row- nor column-major: reject rather than call BLIS with rs = cs = 1.
+        throw std::runtime_error("Error in testsuite/level2/syr.h: Invalid storage in typed_syr().");
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssyr( uploa, conjx, n, alpha, x, incx, a, rsa, csa );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsyr( uploa, conjx, n, alpha, x, incx, a, rsa, csa );
+    else
+        throw std::runtime_error("Error in testsuite/level2/syr.h: Invalid typename in typed_syr().");
+}
+
+// Entry point used by the tests: routes to the interface selected at build time (TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED).
+template<typename T>
+static void syr( char storage, char uploa, char conj_x, gtint_t n, T* alpha, T* xp, gtint_t incx, T* ap, gtint_t lda )
+{
+#if defined(TEST_BLAS)
+    // The BLAS wrapper takes no storage argument, so only column-major requests are forwarded.
+    if( storage != 'c' && storage != 'C' )
+        throw std::runtime_error("Error in testsuite/level2/syr.h: BLAS interface cannot be tested for row-major order.");
+    syr_<T>( uploa, n, alpha, xp, incx, ap, lda );
+#elif TEST_CBLAS
+    cblas_syr<T>( storage, uploa, n, alpha, xp, incx, ap, lda );
+#elif TEST_BLIS_TYPED
+    typed_syr<T>( storage, uploa, conj_x, n, alpha, xp, incx, ap, lda );
+#else
+    throw std::runtime_error("Error in testsuite/level2/syr.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/syr/test_syr.h b/gtestsuite/testsuite/level2/syr/test_syr.h
new file mode 100644
index 0000000000..d8cc9e9ada
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr/test_syr.h
@@ -0,0 +1,76 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "syr.h"
+#include "level2/ref_syr.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_syr( char storage, char uploa, char conjx, gtint_t n, T alpha,
+               gtint_t incx, gtint_t lda_inc, double thresh, char datatype ) {
+
+    // Leading dimension of the n x n matrix, obtained from n and lda_inc.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc);
+
+    //----------------------------------------------------------
+    //   Generate the operands: matrix A first, then vector x.
+    //----------------------------------------------------------
+    std::vector<T> a_test = testinghelpers::get_random_matrix<T>(-2, 5, storage, 'n', n, n, lda, datatype);
+    std::vector<T> x_vec  = testinghelpers::get_random_vector<T>(-3, 3, n, incx, datatype);
+
+    mktrim<T>( storage, uploa, n, a_test.data(), lda );
+
+    // Snapshot A before the update; the reference routine works on this copy.
+    std::vector<T> a_expected( a_test );
+    //----------------------------------------------------------
+    //        Run the interface under test on a_test.
+    //----------------------------------------------------------
+    syr<T>( storage, uploa, conjx, n, &alpha, x_vec.data(), incx, a_test.data(), lda );
+
+    //----------------------------------------------------------
+    //        Run the reference implementation on the copy.
+    //----------------------------------------------------------
+    testinghelpers::ref_syr<T>( storage, uploa, conjx, n, alpha,
+                                      x_vec.data(), incx, a_expected.data(), lda );
+
+    //----------------------------------------------------------
+    //   Compare both results element by element against thresh.
+    //----------------------------------------------------------
+    computediff<T>( storage, n, n, a_test.data(), a_expected.data(), lda, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp
new file mode 100644
index 0000000000..896323648c
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr2/dsyr2_generic.cpp
@@ -0,0 +1,146 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr2.h"
+
+// Fixture for the dsyr2 parameterized tests. The parameter tuple carries, in order:
+//   storage format   (char)
+//   uploa            (char)
+//   conjx, conjy     (char, char)
+//   n                (gtint_t)
+//   alpha            (double)
+//   incx, incy       (gtint_t, gtint_t)
+//   lda increment    (gtint_t)
+//   datatype         (char)
+class dsyr2Test : public ::testing::TestWithParam<
+        std::tuple<char,char,char,char,gtint_t,double,gtint_t,gtint_t,gtint_t,char>> {};
+
+TEST_P(dsyr2Test, RandomData) {
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test
+    // suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // matrix storage format (row major, column major)
+    const char storage    = std::get<0>(params);
+    // denotes whether matrix a is u,l
+    const char uploa      = std::get<1>(params);
+    // denotes whether vector x is n,c
+    const char conjx      = std::get<2>(params);
+    // denotes whether vector y is n,c
+    const char conjy      = std::get<3>(params);
+    // matrix size n
+    const gtint_t n       = std::get<4>(params);
+    // scalar alpha
+    const T alpha         = std::get<5>(params);
+    // stride size for x
+    const gtint_t incx    = std::get<6>(params);
+    // stride size for y
+    const gtint_t incy    = std::get<7>(params);
+    // lda increment: zero makes the array size match the matrix size,
+    // a positive value makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<8>(params);
+    // datatype used by the random generators
+    const char datatype   = std::get<9>(params);
+
+    // Error threshold handed to the test body, scaled by n.
+    const double thresh = 3*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the shared test body with these parameters
+    //----------------------------------------------------------
+    test_syr2<T>(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+// Builds the gtest name of each dsyr2 test case from the tested interface and the parameter values.
+class dsyr2TestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,double,gtint_t,gtint_t,gtint_t,char>> str) const {
+        const auto& p = str.param;
+        // Strides that are not positive are written as "m" followed by their magnitude.
+        auto stride_str = [](gtint_t inc) {
+            return ( inc > 0 ) ? std::to_string(inc) : "m" + std::to_string(std::abs(inc));
+        };
+        const double alpha = std::get<5>(p);
+#ifdef TEST_BLAS
+        std::string name = "dsyr2_";
+#elif TEST_CBLAS
+        std::string name = "cblas_dsyr2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_dsyr2";
+#endif
+        name += '_';
+        name += std::get<0>(p);                              // storage format
+        name += '_';
+        name += std::get<1>(p);                              // uploa
+        name += std::get<2>(p);                              // conjx
+        name += std::get<3>(p);                              // conjy
+        name += "_" + std::to_string(std::get<4>(p));        // n
+        name += "_a";                                        // alpha, truncated to an integer
+        name += ( alpha > 0 ) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        name += "_" + stride_str(std::get<6>(p));            // incx
+        name += "_" + stride_str(std::get<7>(p));            // incy
+        name += "_" + std::to_string(std::get<8>(p));        // ld_inc
+        name += '_';
+        name += std::get<9>(p);                              // datatype
+        return name;
+    }
+};
+
+// Black box testing.
+#ifdef TEST_BLAS   // Selected outside the macro call: a directive inside macro arguments is undefined behavior.
+static const auto dsyr2StorageFormats = ::testing::Values('c');      // BLAS interface: column-major only
+#else
+static const auto dsyr2StorageFormats = ::testing::Values('c','r');  // column- and row-major
+#endif
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox, dsyr2Test,
+        ::testing::Combine(
+            dsyr2StorageFormats,                                             // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n'),                                          // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values(1.0, -2.0),                                    // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::dsyr2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp
new file mode 100644
index 0000000000..ced6dfdd89
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr2/ssyr2_generic.cpp
@@ -0,0 +1,146 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr2.h"
+
+// Fixture for the ssyr2 parameterized tests. The parameter tuple carries, in order:
+//   storage format   (char)
+//   uploa            (char)
+//   conjx, conjy     (char, char)
+//   n                (gtint_t)
+//   alpha            (float)
+//   incx, incy       (gtint_t, gtint_t)
+//   lda increment    (gtint_t)
+//   datatype         (char)
+class ssyr2Test : public ::testing::TestWithParam<
+        std::tuple<char,char,char,char,gtint_t,float,gtint_t,gtint_t,gtint_t,char>> {};
+
+TEST_P(ssyr2Test, RandomData) {
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test
+    // suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // matrix storage format (row major, column major)
+    const char storage    = std::get<0>(params);
+    // denotes whether matrix a is u,l
+    const char uploa      = std::get<1>(params);
+    // denotes whether vector x is n,c
+    const char conjx      = std::get<2>(params);
+    // denotes whether vector y is n,c
+    const char conjy      = std::get<3>(params);
+    // matrix size n
+    const gtint_t n       = std::get<4>(params);
+    // scalar alpha
+    const T alpha         = std::get<5>(params);
+    // stride size for x
+    const gtint_t incx    = std::get<6>(params);
+    // stride size for y
+    const gtint_t incy    = std::get<7>(params);
+    // lda increment: zero makes the array size match the matrix size,
+    // a positive value makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<8>(params);
+    // datatype used by the random generators
+    const char datatype   = std::get<9>(params);
+
+    // Error threshold handed to the test body, scaled by n.
+    const double thresh = 3*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the shared test body with these parameters
+    //----------------------------------------------------------
+    test_syr2<T>(storage, uploa, conjx, conjy, n, alpha, incx, incy, lda_inc, thresh, datatype);
+}
+
+// Builds the gtest name of each ssyr2 test case from the tested interface and the parameter values.
+class ssyr2TestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,float,gtint_t,gtint_t,gtint_t,char>> str) const {
+        const auto& p = str.param;
+        // Strides that are not positive are written as "m" followed by their magnitude.
+        auto stride_str = [](gtint_t inc) {
+            return ( inc > 0 ) ? std::to_string(inc) : "m" + std::to_string(std::abs(inc));
+        };
+        const float alpha = std::get<5>(p);
+#ifdef TEST_BLAS
+        std::string name = "ssyr2_";
+#elif TEST_CBLAS
+        std::string name = "cblas_ssyr2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_ssyr2";
+#endif
+        name += '_';
+        name += std::get<0>(p);                              // storage format
+        name += '_';
+        name += std::get<1>(p);                              // uploa
+        name += std::get<2>(p);                              // conjx
+        name += std::get<3>(p);                              // conjy
+        name += "_" + std::to_string(std::get<4>(p));        // n
+        name += "_a";                                        // alpha, truncated to an integer
+        name += ( alpha > 0 ) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        name += "_" + stride_str(std::get<6>(p));            // incx
+        name += "_" + stride_str(std::get<7>(p));            // incy
+        name += "_" + std::to_string(std::get<8>(p));        // ld_inc
+        name += '_';
+        name += std::get<9>(p);                              // datatype
+        return name;
+    }
+};
+
+// Black box testing.
+#ifdef TEST_BLAS   // Selected outside the macro call: a directive inside macro arguments is undefined behavior.
+static const auto ssyr2StorageFormats = ::testing::Values('c');      // BLAS interface: column-major only
+#else
+static const auto ssyr2StorageFormats = ::testing::Values('c','r');  // column- and row-major
+#endif
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox, ssyr2Test,
+        ::testing::Combine(
+            ssyr2StorageFormats,                                             // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n'),                                          // conjx
+            ::testing::Values('n'),                                          // conjy
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values(1.0, -2.0),                                    // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(1)),                                   // stride size for y
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ssyr2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/syr2/syr2.h b/gtestsuite/testsuite/level2/syr2/syr2.h
new file mode 100644
index 0000000000..dd51b5497b
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr2/syr2.h
@@ -0,0 +1,131 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *              A := alpha*x*y**T + alpha*y*x**T + A
+ * @param[in]     storage specifies the storage format of matrix A in memory
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of A is referenced
+ * @param[in]     n      specifies the order (number of rows and columns) of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     xp     specifies pointer which points to the first element of x
+ * @param[in]     incx   specifies storage spacing between elements of x.
+ * @param[in]     yp     specifies pointer which points to the first element of y
+ * @param[in]     incy   specifies storage spacing between elements of y.
+ * @param[in,out] ap     specifies pointer which points to the first element of A
+ * @param[in]     lda    specifies leading dimension of the matrix A.
+ */
+
+// BLAS (Fortran-style) wrapper: forwards to ssyr2_/dsyr2_ according to T; every argument is passed by address.
+template<typename T>
+static void syr2_( char uploa, gtint_t n, T* alpha, T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    if constexpr (std::is_same_v<T, double>)
+        dsyr2_( &uploa, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    else if constexpr (std::is_same_v<T, float>)
+        ssyr2_( &uploa, &n, alpha, xp, &incx, yp, &incy, ap, &lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/syr2.h: Invalid typename in syr2_().");
+}
+
+// CBLAS wrapper: obtains the order/uplo enums via testinghelpers, then calls cblas_ssyr2 or cblas_dsyr2 by T.
+template<typename T>
+static void cblas_syr2( char storage, char uploa, gtint_t n, T* alpha,
+       T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+    enum CBLAS_UPLO  uplo_cblas;
+    enum CBLAS_ORDER order_cblas;
+    testinghelpers::char_to_cblas_order( storage, &order_cblas );
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo_cblas );
+
+    if constexpr (std::is_same_v<T, double>)
+        cblas_dsyr2( order_cblas, uplo_cblas, n, *alpha, xp, incx, yp, incy, ap, lda );
+    else if constexpr (std::is_same_v<T, float>)
+        cblas_ssyr2( order_cblas, uplo_cblas, n, *alpha, xp, incx, yp, incy, ap, lda );
+    else
+        throw std::runtime_error("Error in testsuite/level2/syr2.h: Invalid typename in cblas_syr2().");
+}
+
+// BLIS typed-API wrapper: derives the row/column strides of A from storage and lda, then calls bli_ssyr2/bli_dsyr2.
+template<typename T>
+static void typed_syr2( char storage, char uplo, char conj_x, char conj_y,
+    gtint_t n, T* alpha, T* x, gtint_t incx, T* y, gtint_t incy,
+    T* a, gtint_t lda )
+{
+    uplo_t uploa;
+    conj_t conjx;
+    conj_t conjy;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_uplo ( uplo, &uploa );
+    testinghelpers::char_to_blis_conj ( conj_x, &conjx );
+    testinghelpers::char_to_blis_conj ( conj_y, &conjy );
+    // a = n x n: one stride stays 1, the other is lda. Strides are inc_t in the typed API.
+    inc_t rsa = 1, csa = 1;
+    if( (storage == 'c') || (storage == 'C') )
+        csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') )
+        rsa = lda ;
+    else  // Neither row- nor column-major: reject rather than call BLIS with rs = cs = 1.
+        throw std::runtime_error("Error in testsuite/level2/syr2.h: Invalid storage in typed_syr2().");
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssyr2( uploa, conjx, conjy, n, alpha, x, incx, y, incy, a, rsa, csa );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsyr2( uploa, conjx, conjy, n, alpha, x, incx, y, incy, a, rsa, csa );
+    else
+        throw std::runtime_error("Error in testsuite/level2/syr2.h: Invalid typename in typed_syr2().");
+}
+
+// Entry point used by the tests: routes to the interface selected at build time (TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED).
+template<typename T>
+static void syr2( char storage, char uploa, char conj_x, char conj_y, gtint_t n,
+      T* alpha, T* xp, gtint_t incx, T* yp, gtint_t incy, T* ap, gtint_t lda )
+{
+#if defined(TEST_BLAS)
+    if( storage != 'c' && storage != 'C' )
+        throw std::runtime_error("Error in testsuite/level2/syr2.h: BLAS interface cannot be tested for row-major order.");
+    syr2_<T>( uploa, n, alpha, xp, incx, yp, incy, ap, lda );
+#elif TEST_CBLAS
+    cblas_syr2<T>( storage, uploa, n, alpha, xp, incx, yp, incy, ap, lda );
+#elif TEST_BLIS_TYPED
+    typed_syr2<T>( storage, uploa, conj_x, conj_y, n, alpha, xp, incx, yp, incy, ap, lda );
+#else
+    throw std::runtime_error("Error in testsuite/level2/syr2.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/syr2/test_syr2.h b/gtestsuite/testsuite/level2/syr2/test_syr2.h
new file mode 100644
index 0000000000..92b8b64baa
--- /dev/null
+++ b/gtestsuite/testsuite/level2/syr2/test_syr2.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "syr2.h"
+#include "level2/ref_syr2.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_syr2( char storage, char uploa, char conjx, char conjy, gtint_t n,
+    T alpha, gtint_t incx, gtint_t incy, gtint_t lda_inc, double thresh,
+    char datatype ) {
+
+    // Leading dimension of the n x n matrix, obtained from n and lda_inc.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', n, n, lda_inc);
+
+    //----------------------------------------------------------
+    //   Generate the operands: matrix A, then vectors x and y.
+    //----------------------------------------------------------
+    std::vector<T> a_test = testinghelpers::get_random_matrix<T>(-2, 5, storage, 'n', n, n, lda, datatype);
+    std::vector<T> x_vec  = testinghelpers::get_random_vector<T>(-3, 3, n, incx, datatype);
+    std::vector<T> y_vec  = testinghelpers::get_random_vector<T>(-3, 3, n, incy, datatype);
+
+    mksymm<T>( storage, uploa, n, a_test.data(), lda );
+    mktrim<T>( storage, uploa, n, a_test.data(), lda );
+
+    // Snapshot A before the update; the reference routine works on this copy.
+    std::vector<T> a_expected( a_test );
+    //----------------------------------------------------------
+    //        Run the interface under test on a_test.
+    //----------------------------------------------------------
+    syr2<T>( storage, uploa, conjx, conjy, n, &alpha, x_vec.data(), incx,
+                                              y_vec.data(), incy, a_test.data(), lda );
+
+    //----------------------------------------------------------
+    //        Run the reference implementation on the copy.
+    //----------------------------------------------------------
+    testinghelpers::ref_syr2<T>( storage, uploa, conjx, conjy, n, alpha,
+                           x_vec.data(), incx, y_vec.data(), incy, a_expected.data(), lda );
+
+    //----------------------------------------------------------
+    //   Compare both results element by element against thresh.
+    //----------------------------------------------------------
+    computediff<T>( storage, n, n, a_test.data(), a_expected.data(), lda, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp
new file mode 100644
index 0000000000..61f048c70d
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trmv/ctrmv_generic.cpp
@@ -0,0 +1,145 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmv.h"
+
+// Value-parameterized fixture for the ctrmv tests. Tuple fields, in order:
+//   <0> storage   (char)      <1> uploa    (char)
+//   <2> transa    (char)      <3> diaga    (char)
+//   <4> n         (gtint_t)   <5> alpha    (scomplex)
+//   <6> incx      (gtint_t)   <7> lda_inc  (gtint_t)
+//   <8> datatype  (char)
+class ctrmvTest
+    : public ::testing::TestWithParam<
+          std::tuple<char, char, char, char, gtint_t, scomplex, gtint_t, gtint_t, char>>
+{};
+
+TEST_P(ctrmvTest, RandomData) {
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // matrix storage format (row major, column major)
+    const char storage    = std::get<0>(params);
+    // whether matrix a is u,l
+    const char uploa      = std::get<1>(params);
+    // whether matrix a is n,c,t,h
+    const char transa     = std::get<2>(params);
+    // whether the diagonal of a is u,n
+    const char diaga      = std::get<3>(params);
+    // matrix size n
+    const gtint_t n       = std::get<4>(params);
+    // scalar alpha
+    const T alpha         = std::get<5>(params);
+    // stride size for x
+    const gtint_t incx    = std::get<6>(params);
+    // lda increment: zero means the array size matches the matrix size,
+    // a positive value makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<7>(params);
+    // datatype used by the random generators
+    const char datatype   = std::get<8>(params);
+
+    // Error threshold, scaled by the problem size.
+    const double thresh = 10*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_trmv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+// Generates the name suffix gtest prints for each ctrmv parameter combination:
+//   <api>_<storage>_<uplo><trans>_d<diag>_<n>_a<re>pi<im>_<incx>_<lda_inc>_<datatype>
+// Negative numbers are written with an "m" prefix instead of a minus sign.
+// alpha components are truncated to int, so fractional values are not distinguished.
+class ctrmvTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,scomplex,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        scomplex alpha = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ctrmv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ctrmv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ctrmv";
+#endif
+        str_name    = str_name + "_" + sfm + "_" + uploa + transa + "_d" + diaga + "_" + std::to_string(n);
+        // Zero is treated as non-negative so that it is printed as "0", not "m0".
+        std::string alpha_str = ( alpha.real >= 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag >= 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name    = str_name + "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name    = str_name + "_" + incx_str + "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: ctrmvTest is instantiated over every combination of the values below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ctrmvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format: c=column major; r=row major is left out when testing BLAS
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga: n=NONUNIT_DIAG, u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{1.0, 0.0}
+#ifdef TEST_BLIS_TYPED
+            , scomplex{1.0, -2.0}
+#endif
+            ),                                                               // alpha: a non-unit value is added only for the BLIS typed interface
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(9)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used by the random generators (i : integer, f : float)
+        ),
+        ::ctrmvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp
new file mode 100644
index 0000000000..869cc69744
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trmv/dtrmv_generic.cpp
@@ -0,0 +1,144 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmv.h"
+
+// Value-parameterized fixture for the dtrmv tests. Tuple fields, in order:
+//   <0> storage   (char)      <1> uploa    (char)
+//   <2> transa    (char)      <3> diaga    (char)
+//   <4> n         (gtint_t)   <5> alpha    (double)
+//   <6> incx      (gtint_t)   <7> lda_inc  (gtint_t)
+//   <8> datatype  (char)
+class dtrmvTest
+    : public ::testing::TestWithParam<
+          std::tuple<char, char, char, char, gtint_t, double, gtint_t, gtint_t, char>>
+{};
+
+TEST_P(dtrmvTest, RandomData) {
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // matrix storage format (row major, column major)
+    const char storage    = std::get<0>(params);
+    // whether matrix a is u,l
+    const char uploa      = std::get<1>(params);
+    // whether matrix a is n,c,t,h
+    const char transa     = std::get<2>(params);
+    // whether the diagonal of a is u,n
+    const char diaga      = std::get<3>(params);
+    // matrix size n
+    const gtint_t n       = std::get<4>(params);
+    // scalar alpha
+    const T alpha         = std::get<5>(params);
+    // stride size for x
+    const gtint_t incx    = std::get<6>(params);
+    // lda increment: zero means the array size matches the matrix size,
+    // a positive value makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<7>(params);
+    // datatype used by the random generators
+    const char datatype   = std::get<8>(params);
+
+    // Error threshold, scaled by the problem size.
+    const double thresh = 20*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_trmv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+// Generates the name suffix gtest prints for each dtrmv parameter combination:
+//   <api>_<storage>_<uplo><trans>_d<diag>_<n>_a<alpha>_<incx>_<lda_inc>_<datatype>
+// Negative numbers are written with an "m" prefix instead of a minus sign.
+// alpha is truncated to int, so fractional values are not distinguished.
+class dtrmvTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,double,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        double alpha   = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dtrmv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dtrmv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dtrmv";
+#endif
+        str_name    = str_name + "_" + sfm + "_" + uploa + transa + "_d" + diaga + "_" + std::to_string(n);
+        // Zero is treated as non-negative so that it is printed as "0", not "m0".
+        std::string alpha_str = ( alpha >= 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        str_name    = str_name + "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name    = str_name + "_" + incx_str + "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: dtrmvTest is instantiated over every combination of the values below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dtrmvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format: c=column major; r=row major is left out when testing BLAS
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga: n=NONUNIT_DIAG, u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0
+#ifdef TEST_BLIS_TYPED
+            , -2.0
+#endif
+            ),                                                               // alpha: a non-unit value is added only for the BLIS typed interface
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used by the random generators (i : integer, f : float)
+        ),
+        ::dtrmvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp
new file mode 100644
index 0000000000..18bbd93b77
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trmv/strmv_generic.cpp
@@ -0,0 +1,144 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmv.h"
+
+// Value-parameterized fixture for the strmv tests. Tuple fields, in order:
+//   <0> storage   (char)      <1> uploa    (char)
+//   <2> transa    (char)      <3> diaga    (char)
+//   <4> n         (gtint_t)   <5> alpha    (float)
+//   <6> incx      (gtint_t)   <7> lda_inc  (gtint_t)
+//   <8> datatype  (char)
+class strmvTest
+    : public ::testing::TestWithParam<
+          std::tuple<char, char, char, char, gtint_t, float, gtint_t, gtint_t, char>>
+{};
+
+TEST_P(strmvTest, RandomData) {
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // matrix storage format (row major, column major)
+    const char storage    = std::get<0>(params);
+    // whether matrix a is u,l
+    const char uploa      = std::get<1>(params);
+    // whether matrix a is n,c,t,h
+    const char transa     = std::get<2>(params);
+    // whether the diagonal of a is u,n
+    const char diaga      = std::get<3>(params);
+    // matrix size n
+    const gtint_t n       = std::get<4>(params);
+    // scalar alpha
+    const T alpha         = std::get<5>(params);
+    // stride size for x
+    const gtint_t incx    = std::get<6>(params);
+    // lda increment: zero means the array size matches the matrix size,
+    // a positive value makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<7>(params);
+    // datatype used by the random generators
+    const char datatype   = std::get<8>(params);
+
+    // Error threshold, scaled by the problem size.
+    const double thresh = 10*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_trmv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+// Generates the name suffix gtest prints for each strmv parameter combination:
+//   <api>_<storage>_<uplo><trans>_d<diag>_<n>_a<alpha>_<incx>_<lda_inc>_<datatype>
+// Negative numbers are written with an "m" prefix instead of a minus sign.
+// alpha is truncated to int, so fractional values are not distinguished.
+class strmvTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,float,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        float alpha    = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "strmv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_strmv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_strmv";
+#endif
+        str_name    = str_name + "_" + sfm + "_" + uploa + transa + "_d" + diaga + "_" + std::to_string(n);
+        // Zero is treated as non-negative so that it is printed as "0", not "m0".
+        std::string alpha_str = ( alpha >= 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        str_name    = str_name + "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name    = str_name + "_" + incx_str + "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: strmvTest is instantiated over every combination of the values below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        strmvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format: c=column major; r=row major is left out when testing BLAS
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga: n=NONUNIT_DIAG, u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0
+#ifdef TEST_BLIS_TYPED
+            , -2.0
+#endif
+            ),                                                               // alpha: a non-unit value is added only for the BLIS typed interface
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used by the random generators (i : integer, f : float)
+        ),
+        ::strmvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level2/trmv/test_trmv.h b/gtestsuite/testsuite/level2/trmv/test_trmv.h
new file mode 100644
index 0000000000..82d8b0d6a3
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trmv/test_trmv.h
@@ -0,0 +1,75 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "trmv.h"
+#include "level2/ref_trmv.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_trmv( char storage, char uploa, char transa, char diaga, gtint_t n,
+    T alpha, gtint_t lda_inc, gtint_t incx,  double thresh, char datatype ) {
+
+    // Leading dimension of A, derived from storage, transa, n and lda_inc.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, n, n, lda_inc);
+
+    //----------------------------------------------------------
+    //   Generate random data for the matrix A and the vector x.
+    //----------------------------------------------------------
+    std::vector<T> mat_a = testinghelpers::get_random_matrix<T>(-2, 8, storage, transa, n, n, lda, datatype);
+    std::vector<T> vec_x = testinghelpers::get_random_vector<T>(-10, 10, n, incx, datatype);
+
+    mktrim<T>( storage, uploa, n, mat_a.data(), lda );
+
+    // Copy x before it is overwritten; the copy is the in/out operand of the reference call.
+    std::vector<T> vec_x_ref = vec_x;
+    //----------------------------------------------------------
+    //            Run the implementation under test
+    //----------------------------------------------------------
+    trmv<T>( storage, uploa, transa, diaga, n, &alpha, mat_a.data(), lda, vec_x.data(), incx );
+
+    //----------------------------------------------------------
+    //            Run the reference implementation
+    //----------------------------------------------------------
+    testinghelpers::ref_trmv<T>( storage, uploa, transa, diaga, n, &alpha, mat_a.data(), lda, vec_x_ref.data(), incx );
+
+    //----------------------------------------------------------
+    //            Check the component-wise error
+    //----------------------------------------------------------
+    computediff<T>( n, vec_x.data(), vec_x_ref.data(), incx, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/trmv/trmv.h b/gtestsuite/testsuite/level2/trmv/trmv.h
new file mode 100644
index 0000000000..8ee3750a62
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trmv/trmv.h
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation
+ *    x := alpha * transa(A) * x
+ * @param[in]     storage specifies the storage format (row major or column major) of matrix A
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of the array A is used
+ * @param[in]     transa specifies the form of op( A ) to be used in the matrix-vector multiplication
+ * @param[in]     diaga  specifies whether the matrix A has a unit or non-unit diagonal
+ * @param[in]     n      specifies the number of rows and columns of the square matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of the matrix A
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ * @param[in,out] xp     specifies pointer which points to the first element of the vector x
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ *
+ */
+
+// BLAS interface: forwards to the {s,d,c,z}trmv_ routine matching T (no alpha argument).
+template<typename T>
+static void trmv_( char uploa, char transa, char diaga, gtint_t n, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    if constexpr (std::is_same_v<T, float>)
+        strmv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else if constexpr (std::is_same_v<T, double>)
+        dtrmv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        ctrmv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        ztrmv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else
+        throw std::runtime_error("Error in testsuite/level2/trmv.h: Invalid typename in trmv_().");
+}
+
+// CBLAS interface: converts the character options to CBLAS enums and forwards to
+// the cblas_{s,d,c,z}trmv routine matching T (no alpha argument).
+template<typename T>
+static void cblas_trmv( char storage, char uploa, char transa, char diaga, gtint_t n, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    enum CBLAS_ORDER order;
+    enum CBLAS_UPLO uplo;
+    enum CBLAS_TRANSPOSE trans;
+    enum CBLAS_DIAG diag;
+
+    testinghelpers::char_to_cblas_order( storage, &order );
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo );
+    testinghelpers::char_to_cblas_trans( transa, &trans );
+    testinghelpers::char_to_cblas_diag( diaga, &diag );
+
+    if constexpr (std::is_same_v<T, float>)
+        cblas_strmv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else if constexpr (std::is_same_v<T, double>)
+        cblas_dtrmv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cblas_ctrmv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        cblas_ztrmv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else
+        throw std::runtime_error("Error in testsuite/level2/trmv.h: Invalid typename in cblas_trmv().");
+}
+
+// BLIS typed interface: computes x := alpha * transa(A) * x through bli_{s,d,c,z}trmv.
+template<typename T>
+static void typed_trmv( char storage, char uplo, char trans, char diag,
+            gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    uplo_t  uploa;
+    trans_t transa;
+    diag_t  diaga;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_uplo ( uplo, &uploa );
+    testinghelpers::char_to_blis_trans( trans, &transa );
+    testinghelpers::char_to_blis_diag ( diag, &diaga );
+
+    // Row/column strides of A (inc_t in the BLIS typed API); lda replaces one of them below.
+    inc_t rsa = 1, csa = 1;
+    /* a = n x n   */
+    if( (storage == 'c') || (storage == 'C') )
+        csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') )
+        rsa = lda ;
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_strmv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dtrmv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_ctrmv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_ztrmv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else
+        throw std::runtime_error("Error in testsuite/level2/trmv.h: Invalid typename in typed_trmv().");
+}
+
+// Dispatches to the interface selected at compile time (TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED).
+template<typename T>
+static void trmv( char storage, char uploa, char transa, char diaga,
+    gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+#if (defined TEST_BLAS || defined  TEST_CBLAS)
+    // The BLAS and CBLAS wrappers take no alpha argument, so only alpha == one can be tested.
+    T one;
+    testinghelpers::initone(one);
+#endif
+
+#ifdef TEST_BLAS
+    // Storage is checked first, then alpha; anything other than column-major is rejected.
+    if( !( storage == 'c' || storage == 'C' ) )
+        throw std::runtime_error("Error in testsuite/level2/trmv.h: BLAS interface cannot be tested for row-major order.");
+    if( !( *alpha == one ) )
+        throw std::runtime_error("Error in testsuite/level2/trmv.h: BLAS interface cannot be tested for alpha != one.");
+    trmv_<T>( uploa, transa, diaga, n, ap, lda, xp, incx );
+#elif TEST_CBLAS
+    if( !( *alpha == one ) )
+        throw std::runtime_error("Error in testsuite/level2/trmv.h: CBLAS interface cannot be tested for alpha != one.");
+    cblas_trmv<T>( storage, uploa, transa, diaga, n, ap, lda, xp, incx );
+#elif TEST_BLIS_TYPED
+    typed_trmv<T>( storage, uploa, transa, diaga, n, alpha, ap, lda, xp, incx );
+#else
+    throw std::runtime_error("Error in testsuite/level2/trmv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp
new file mode 100644
index 0000000000..759202433d
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trmv/ztrmv_generic.cpp
@@ -0,0 +1,145 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmv.h"
+
+class ztrmvTest :
+        public ::testing::TestWithParam<std::tuple<char,       // <0> storage
+                                                   char,       // <1> uploa
+                                                   char,       // <2> transa
+                                                   char,       // <3> diaga
+                                                   gtint_t,    // <4> n
+                                                   dcomplex,   // <5> alpha
+                                                   gtint_t,    // <6> incx
+                                                   gtint_t,    // <7> lda_inc
+                                                   char>> {};  // <8> datatype
+
+TEST_P(ztrmvTest, RandomData) {
+    using T = dcomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), slot by slot.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage format of the matrix (row major or column major)
+    char storage = std::get<0>(params);
+    // whether matrix a is u or l
+    char uploa = std::get<1>(params);
+    // whether matrix a is n, c, t or h
+    char transa = std::get<2>(params);
+    // whether the diagonal of a is u or n
+    char diaga = std::get<3>(params);
+    // matrix size n
+    gtint_t n  = std::get<4>(params);
+    // value of alpha
+    T alpha = std::get<5>(params);
+    // stride of x
+    gtint_t incx = std::get<6>(params);
+    // Increment added to the leading dimension of a: zero makes the array size
+    // match the matrix size, a positive value makes the array bigger than that.
+    gtint_t lda_inc = std::get<7>(params);
+    // datatype selector for the random generators
+    char datatype   = std::get<8>(params);
+
+    // Error threshold: 10 * n times the value returned by getEpsilon<T>().
+    double thresh = 10*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the shared test body with these parameters
+    //----------------------------------------------------------
+    test_trmv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+class ztrmvTestPrint {
+public:
+    // Builds the name gtest reports for one parameter tuple: interface prefix, then each parameter.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,dcomplex,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        dcomplex alpha = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ztrmv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ztrmv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ztrmv";
+#endif
+        str_name   += std::string("_") + sfm + "_" + uploa + transa;
+        str_name   += std::string("_d") + diaga;
+        str_name   += "_" + std::to_string(n);
+        // "m" stands in for a minus sign; zero is non-negative, so it prints as "0" rather than "m0".
+        std::string alpha_str = ( alpha.real >= 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag >= 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name   += "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name   += "_" + incx_str;
+        str_name   += "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+// Directives inside a macro's argument list are undefined behaviour in C++, so the
+// build-dependent value lists are chosen here, before INSTANTIATE_TEST_SUITE_P is invoked.
+#ifdef TEST_BLAS
+static const auto ztrmv_storage = ::testing::Values('c');
+#else
+static const auto ztrmv_storage = ::testing::Values('c','r');
+#endif
+#ifdef TEST_BLIS_TYPED
+static const auto ztrmv_alpha = ::testing::Values(dcomplex{1.0, 0.0}, dcomplex{1.0, -2.0});
+#else
+static const auto ztrmv_alpha = ::testing::Values(dcomplex{1.0, 0.0});
+#endif
+INSTANTIATE_TEST_SUITE_P(Blackbox, ztrmvTest,
+        ::testing::Combine(
+            ztrmv_storage,                                                   // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga , n=NONUNIT_DIAG u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ztrmv_alpha,                                                     // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ), ::ztrmvTestPrint());
diff --git a/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp
new file mode 100644
index 0000000000..45421b8f97
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trsv/ctrsv_generic.cpp
@@ -0,0 +1,145 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsv.h"
+
+class ctrsvTest :
+        public ::testing::TestWithParam<std::tuple<char,       // <0> storage
+                                                   char,       // <1> uploa
+                                                   char,       // <2> transa
+                                                   char,       // <3> diaga
+                                                   gtint_t,    // <4> n
+                                                   scomplex,   // <5> alpha
+                                                   gtint_t,    // <6> incx
+                                                   gtint_t,    // <7> lda_inc
+                                                   char>> {};  // <8> datatype
+
+TEST_P(ctrsvTest, RandomData) {
+    using T = scomplex;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), slot by slot.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage format of the matrix (row major or column major)
+    char storage = std::get<0>(params);
+    // whether matrix a is u or l
+    char uploa = std::get<1>(params);
+    // whether matrix a is n, c, t or h
+    char transa = std::get<2>(params);
+    // whether the diagonal of a is u or n
+    char diaga = std::get<3>(params);
+    // matrix size n
+    gtint_t n  = std::get<4>(params);
+    // value of alpha
+    T alpha = std::get<5>(params);
+    // stride of x
+    gtint_t incx = std::get<6>(params);
+    // Increment added to the leading dimension of a: zero makes the array size
+    // match the matrix size, a positive value makes the array bigger than that.
+    gtint_t lda_inc = std::get<7>(params);
+    // datatype selector for the random generators
+    char datatype   = std::get<8>(params);
+
+    // Error threshold: 5 * n times the value returned by getEpsilon<T>().
+    double thresh = 5*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the shared test body with these parameters
+    //----------------------------------------------------------
+    test_trsv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+class ctrsvTestPrint {
+public:
+    // Builds the name gtest reports for one parameter tuple: interface prefix, then each parameter.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,scomplex,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        scomplex alpha = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ctrsv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ctrsv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ctrsv";
+#endif
+        str_name   += std::string("_") + sfm + "_" + uploa + transa;
+        str_name   += std::string("_d") + diaga;
+        str_name   += "_" + std::to_string(n);
+        // "m" stands in for a minus sign; zero is non-negative, so it prints as "0" rather than "m0".
+        std::string alpha_str = ( alpha.real >= 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag >= 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name   += "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name   += "_" + incx_str;
+        str_name   += "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+// Directives inside a macro's argument list are undefined behaviour in C++, so the
+// build-dependent value lists are chosen here, before INSTANTIATE_TEST_SUITE_P is invoked.
+#ifdef TEST_BLAS
+static const auto ctrsv_storage = ::testing::Values('c');
+#else
+static const auto ctrsv_storage = ::testing::Values('c','r');
+#endif
+#ifdef TEST_BLIS_TYPED
+static const auto ctrsv_alpha = ::testing::Values(scomplex{1.0, 0.0}, scomplex{1.0, -2.0});
+#else
+static const auto ctrsv_alpha = ::testing::Values(scomplex{1.0, 0.0});
+#endif
+INSTANTIATE_TEST_SUITE_P(Blackbox, ctrsvTest,
+        ::testing::Combine(
+            ctrsv_storage,                                                   // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga , n=NONUNIT_DIAG u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ctrsv_alpha,                                                     // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ), ::ctrsvTestPrint());
diff --git a/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp
new file mode 100644
index 0000000000..2a4e1c6cac
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trsv/dtrsv_generic.cpp
@@ -0,0 +1,144 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsv.h"
+
+class dtrsvTest :
+        public ::testing::TestWithParam<std::tuple<char,       // <0> storage
+                                                   char,       // <1> uploa
+                                                   char,       // <2> transa
+                                                   char,       // <3> diaga
+                                                   gtint_t,    // <4> n
+                                                   double,     // <5> alpha
+                                                   gtint_t,    // <6> incx
+                                                   gtint_t,    // <7> lda_inc
+                                                   char>> {};  // <8> datatype
+
+TEST_P(dtrsvTest, RandomData) {
+    using T = double;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), slot by slot.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage format of the matrix (row major or column major)
+    char storage = std::get<0>(params);
+    // whether matrix a is u or l
+    char uploa = std::get<1>(params);
+    // whether matrix a is n, c, t or h
+    char transa = std::get<2>(params);
+    // whether the diagonal of a is u or n
+    char diaga = std::get<3>(params);
+    // matrix size n
+    gtint_t n  = std::get<4>(params);
+    // value of alpha
+    T alpha = std::get<5>(params);
+    // stride of x
+    gtint_t incx = std::get<6>(params);
+    // Increment added to the leading dimension of a: zero makes the array size
+    // match the matrix size, a positive value makes the array bigger than that.
+    gtint_t lda_inc = std::get<7>(params);
+    // datatype selector for the random generators
+    char datatype   = std::get<8>(params);
+
+    // Error threshold: 100 * n times the value returned by getEpsilon<T>().
+    double thresh = 100*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the shared test body with these parameters
+    //----------------------------------------------------------
+    test_trsv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+class dtrsvTestPrint {
+public:
+    // Builds the name gtest reports for one parameter tuple: interface prefix, then each parameter.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,double,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        double alpha   = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dtrsv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dtrsv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dtrsv";
+#endif
+        str_name   += std::string("_") + sfm + "_" + uploa + transa;
+        str_name   += std::string("_d") + diaga;
+        str_name   += "_" + std::to_string(n);
+        // "m" stands in for a minus sign; zero is non-negative, so it prints as "0" rather than "m0".
+        std::string alpha_str = ( alpha >= 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        str_name   += "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name   += "_" + incx_str;
+        str_name   += "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+// Directives inside a macro's argument list are undefined behaviour in C++, so the
+// build-dependent value lists are chosen here, before INSTANTIATE_TEST_SUITE_P is invoked.
+#ifdef TEST_BLAS
+static const auto dtrsv_storage = ::testing::Values('c');
+#else
+static const auto dtrsv_storage = ::testing::Values('c','r');
+#endif
+#ifdef TEST_BLIS_TYPED
+static const auto dtrsv_alpha = ::testing::Values( 1.0, -2.0 );
+#else
+static const auto dtrsv_alpha = ::testing::Values( 1.0 );
+#endif
+INSTANTIATE_TEST_SUITE_P(Blackbox, dtrsvTest,
+        ::testing::Combine(
+            dtrsv_storage,                                                   // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga , n=NONUNIT_DIAG u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            dtrsv_alpha,                                                     // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ), ::dtrsvTestPrint());
diff --git a/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp
new file mode 100644
index 0000000000..edd0197070
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trsv/strsv_generic.cpp
@@ -0,0 +1,144 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsv.h"
+
+class strsvTest :
+        public ::testing::TestWithParam<std::tuple<char,       // <0> storage
+                                                   char,       // <1> uploa
+                                                   char,       // <2> transa
+                                                   char,       // <3> diaga
+                                                   gtint_t,    // <4> n
+                                                   float,      // <5> alpha
+                                                   gtint_t,    // <6> incx
+                                                   gtint_t,    // <7> lda_inc
+                                                   char>> {};  // <8> datatype
+
+TEST_P(strsvTest, RandomData) {
+    using T = float;
+
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P), slot by slot.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage format of the matrix (row major or column major)
+    char storage = std::get<0>(params);
+    // whether matrix a is u or l
+    char uploa = std::get<1>(params);
+    // whether matrix a is n, c, t or h
+    char transa = std::get<2>(params);
+    // whether the diagonal of a is u or n
+    char diaga = std::get<3>(params);
+    // matrix size n
+    gtint_t n  = std::get<4>(params);
+    // value of alpha
+    T alpha = std::get<5>(params);
+    // stride of x
+    gtint_t incx = std::get<6>(params);
+    // Increment added to the leading dimension of a: zero makes the array size
+    // match the matrix size, a positive value makes the array bigger than that.
+    gtint_t lda_inc = std::get<7>(params);
+    // datatype selector for the random generators
+    char datatype   = std::get<8>(params);
+
+    // Error threshold: 20 * n times the value returned by getEpsilon<T>().
+    double thresh = 20*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the shared test body with these parameters
+    //----------------------------------------------------------
+    test_trsv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+class strsvTestPrint {
+public:
+    // Builds the name gtest reports for one parameter tuple: interface prefix, then each parameter.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,float,gtint_t,gtint_t,char>> str) const {
+        char sfm       = std::get<0>(str.param);
+        char uploa     = std::get<1>(str.param);
+        char transa    = std::get<2>(str.param);
+        char diaga     = std::get<3>(str.param);
+        gtint_t n      = std::get<4>(str.param);
+        float alpha    = std::get<5>(str.param);
+        gtint_t incx   = std::get<6>(str.param);
+        gtint_t ld_inc = std::get<7>(str.param);
+        char datatype  = std::get<8>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "strsv_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_strsv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_strsv";
+#endif
+        str_name   += std::string("_") + sfm + "_" + uploa + transa;
+        str_name   += std::string("_d") + diaga;
+        str_name   += "_" + std::to_string(n);
+        // "m" stands in for a minus sign; zero is non-negative, so it prints as "0" rather than "m0".
+        std::string alpha_str = ( alpha >= 0) ? std::to_string(int(alpha)) : ("m" + std::to_string(int(std::abs(alpha))));
+        str_name   += "_a" + alpha_str;
+        std::string incx_str = ( incx >= 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name   += "_" + incx_str;
+        str_name   += "_" + std::to_string(ld_inc) + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+// Directives inside a macro's argument list are undefined behaviour in C++, so the
+// build-dependent value lists are chosen here, before INSTANTIATE_TEST_SUITE_P is invoked.
+#ifdef TEST_BLAS
+static const auto strsv_storage = ::testing::Values('c');
+#else
+static const auto strsv_storage = ::testing::Values('c','r');
+#endif
+#ifdef TEST_BLIS_TYPED
+static const auto strsv_alpha = ::testing::Values( 1.0, -2.0 );
+#else
+static const auto strsv_alpha = ::testing::Values( 1.0 );
+#endif
+INSTANTIATE_TEST_SUITE_P(Blackbox, strsvTest,
+        ::testing::Combine(
+            strsv_storage,                                                   // storage format
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga , n=NONUNIT_DIAG u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            strsv_alpha,                                                     // alpha
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(7)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated data (i : integer, f : float)
+        ), ::strsvTestPrint());
diff --git a/gtestsuite/testsuite/level2/trsv/test_trsv.h b/gtestsuite/testsuite/level2/trsv/test_trsv.h
new file mode 100644
index 0000000000..320459c862
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trsv/test_trsv.h
@@ -0,0 +1,75 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "trsv.h"
+#include "level2/ref_trsv.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_trsv( char storage, char uploa, char transa, char diaga, gtint_t n,
+    T alpha, gtint_t lda_inc, gtint_t incx,  double thresh, char datatype ) {
+
+    // Leading dimension of the n x n matrix, including the requested increment.
+    gtint_t ld_a = testinghelpers::get_leading_dimension(storage, transa, n, n, lda_inc);
+
+    //----------------------------------------------------------
+    //   Fill the matrix and the vector with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> a_mat = testinghelpers::get_random_matrix<T>(1, 5, storage, transa, n, n, ld_a, datatype);
+    std::vector<T> x_out = testinghelpers::get_random_vector<T>(1, 3, n, incx, datatype);
+
+    mktrim<T>( storage, uploa, n, a_mat.data(), ld_a );
+
+    // Copy x before it is overwritten, so the reference starts from the same input.
+    std::vector<T> x_expected(x_out);
+    //----------------------------------------------------------
+    //       Run the interface under test (updates x_out).
+    //----------------------------------------------------------
+    trsv<T>( storage, uploa, transa, diaga, n, &alpha, a_mat.data(), ld_a, x_out.data(), incx );
+
+    //----------------------------------------------------------
+    //    Run the reference implementation (updates x_expected).
+    //----------------------------------------------------------
+    testinghelpers::ref_trsv<T>( storage, uploa, transa, diaga, n, &alpha, a_mat.data(), ld_a, x_expected.data(), incx );
+
+    //----------------------------------------------------------
+    //     Compare both results component-wise against thresh.
+    //----------------------------------------------------------
+    computediff<T>( n, x_out.data(), x_expected.data(), incx, thresh );
+}
diff --git a/gtestsuite/testsuite/level2/trsv/trsv.h b/gtestsuite/testsuite/level2/trsv/trsv.h
new file mode 100644
index 0000000000..65ca33112a
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trsv/trsv.h
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *    x := alpha * inv(transa(A)) * x_orig
+ * @param[in]     storage specifies the storage format (row major or column major) of the matrix A
+ * @param[in]     uploa  specifies whether the upper or lower triangular part of the array A is referenced
+ * @param[in]     transa specifies the form of op( A ) to be used in the triangular solve
+ * @param[in]     diaga  specifies whether the matrix A is unit triangular (u) or non-unit triangular (n)
+ * @param[in]     n      specifies the number of rows of the matrix A
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of the matrix A
+ * @param[in]     lda    specifies leading dimension of the matrix.
+ * @param[in,out] xp     specifies pointer which points to the first element of the vector x
+ * @param[in]     incx   specifies storage spacing between elements of xp.
+ *
+ */
+
+// BLAS (Fortran-style) wrapper: forwards to the ?trsv_ routine matching T; scalar arguments go by address.
+template<typename T>
+static void trsv_( char uploa, char transa, char diaga, gtint_t n, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    if constexpr (std::is_same_v<T, float>)
+        strsv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else if constexpr (std::is_same_v<T, double>)
+        dtrsv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        ctrsv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        ztrsv_( &uploa, &transa, &diaga, &n, ap, &lda, xp, &incx );
+    else
+        throw std::runtime_error("Error in testsuite/level2/trsv.h: Invalid typename in trsv_().");
+}
+
+// CBLAS wrapper: converts the character flags to CBLAS enums and forwards to the cblas_?trsv routine matching T.
+template<typename T>
+static void cblas_trsv( char storage, char uploa, char transa, char diaga,
+                      gtint_t n, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    // Translate each character flag into its CBLAS enumerator.
+    enum CBLAS_ORDER order;
+    testinghelpers::char_to_cblas_order( storage, &order );
+    enum CBLAS_UPLO uplo;
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo );
+    enum CBLAS_TRANSPOSE trans;
+    testinghelpers::char_to_cblas_trans( transa, &trans );
+    enum CBLAS_DIAG diag;
+    testinghelpers::char_to_cblas_diag( diaga, &diag );
+
+    if constexpr (std::is_same_v<T, float>)
+        cblas_strsv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else if constexpr (std::is_same_v<T, double>)
+        cblas_dtrsv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cblas_ctrsv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        cblas_ztrsv( order, uplo, trans, diag, n, ap, lda, xp, incx );
+    else
+        throw std::runtime_error("Error in testsuite/level2/trsv.h: Invalid typename in cblas_trsv().");
+}
+
+// BLIS typed-API wrapper: converts the character flags to BLIS enums, derives the row/column
+// strides of A from (storage, lda) and forwards to the bli_?trsv routine matching T.
+template<typename T>
+static void typed_trsv( char storage, char uplo, char trans, char diag,
+            gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+    uplo_t  uploa;
+    trans_t transa;
+    diag_t  diaga;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_uplo ( uplo, &uploa );
+    testinghelpers::char_to_blis_trans( trans, &transa );
+    testinghelpers::char_to_blis_diag ( diag, &diaga );
+
+    // Strides of the n x n matrix A (inc_t, the stride type of bli_?trsv); both stay 1 for any other storage char.
+    inc_t rsa = 1, csa = 1;
+    if( (storage == 'c') || (storage == 'C') )
+        csa = lda ;
+    else if( (storage == 'r') || (storage == 'R') )
+        rsa = lda ;
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_strsv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dtrsv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_ctrsv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_ztrsv( uploa, transa, diaga, n, alpha, ap, rsa, csa, xp, incx );
+    else
+        throw std::runtime_error("Error in testsuite/level2/trsv.h: Invalid typename in typed_trsv().");
+}
+
+// Entry point used by the tests: forwards to the interface selected at compile time
+// (TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED) and throws std::runtime_error for an untestable request.
+template<typename T>
+static void trsv( char storage, char uploa, char transa, char diaga,
+    gtint_t n, T *alpha, T *ap, gtint_t lda, T *xp, gtint_t incx )
+{
+#if defined(TEST_BLAS) || defined(TEST_CBLAS)
+    // The BLAS and CBLAS wrappers take no alpha argument, so only alpha == 1 can be tested through them.
+    T one;
+    testinghelpers::initone(one);
+#endif
+
+#if defined(TEST_BLAS)
+    if( storage != 'c' && storage != 'C' )
+        throw std::runtime_error("Error in testsuite/level2/trsv.h: BLAS interface cannot be tested for row-major order.");
+    if( !( *alpha == one ) )
+        throw std::runtime_error("Error in testsuite/level2/trsv.h: BLAS interface cannot be tested for alpha != one.");
+    trsv_<T>( uploa, transa, diaga, n, ap, lda, xp, incx );
+#elif defined(TEST_CBLAS)
+    if( !( *alpha == one ) )
+      throw std::runtime_error("Error in testsuite/level2/trsv.h: CBLAS interface cannot be tested for alpha != one.");
+    cblas_trsv<T>( storage, uploa, transa, diaga, n, ap, lda, xp, incx );
+#elif defined(TEST_BLIS_TYPED)
+    typed_trsv<T>( storage, uploa, transa, diaga, n, alpha, ap, lda, xp, incx );
+#else
+    throw std::runtime_error("Error in testsuite/level2/trsv.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp
new file mode 100644
index 0000000000..e3232f0229
--- /dev/null
+++ b/gtestsuite/testsuite/level2/trsv/ztrsv_generic.cpp
@@ -0,0 +1,145 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsv.h"
+
+class ztrsvTest :                                                           // fixture for the value-parameterized dcomplex trsv tests
+        public ::testing::TestWithParam<std::tuple<char,                   // storage
+                                                   char,                   // uploa
+                                                   char,                   // transa
+                                                   char,                   // diaga
+                                                   gtint_t,                // n
+                                                   dcomplex,               // alpha
+                                                   gtint_t,                // incx
+                                                   gtint_t,                // lda_inc
+                                                   char>> {};              // datatype
+
+TEST_P(ztrsvTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage scheme of A: row major or column major
+    const char storage = std::get<0>(params);
+    // triangle of A that is used: u or l
+    const char uploa = std::get<1>(params);
+    // operation applied to A: n, c, t or h
+    const char transa = std::get<2>(params);
+    // kind of diagonal of A: u or n
+    const char diaga = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // scalar alpha
+    const T alpha = std::get<5>(params);
+    // stride between the elements of x
+    const gtint_t incx = std::get<6>(params);
+    // Increment applied to the leading dimension of A:
+    // zero means the array size matches the matrix size,
+    // a positive value means the array is bigger than the matrix.
+    const gtint_t lda_inc = std::get<7>(params);
+    // datatype requested from the random generators
+    const char datatype = std::get<8>(params);
+
+    // Error threshold, scaled with the matrix size.
+    const double thresh = 10*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_trsv<T>(storage, uploa, transa, diaga, n, alpha, lda_inc, incx, thresh, datatype);
+}
+
+// Name generator for ztrsvTest: encodes the interface under test and every
+// parameter of the instance into the test-case name.
+class ztrsvTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,dcomplex,gtint_t,gtint_t,char>> str) const {
+        const auto& p = str.param;
+        const dcomplex alpha = std::get<5>(p);
+        const gtint_t  incx  = std::get<6>(p);
+        // A value that is not positive is written as 'm' followed by its
+        // magnitude, so zero comes out as "m0".
+        auto part = [](auto v) {
+            return ( v > 0) ? std::to_string(int(v)) : ("m" + std::to_string(int(std::abs(v))));
+        };
+
+#ifdef TEST_BLAS
+        std::string name = "ztrsv_";
+#elif TEST_CBLAS
+        std::string name = "cblas_ztrsv";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_ztrsv";
+#endif
+        name += std::string("_") + std::get<0>(p);                     // storage
+        name += std::string("_") + std::get<1>(p) + std::get<2>(p);    // uploa, transa
+        name += std::string("_d") + std::get<3>(p);                    // diaga
+        name += "_" + std::to_string(std::get<4>(p));                  // n
+        name += "_a" + part(alpha.real) + "pi" + part(alpha.imag);
+        // incx is formatted without the int() cast used for alpha.
+        name += "_" + (( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx)));
+        name += "_" + std::to_string(std::get<7>(p));                  // lda increment
+        name += std::string("_") + std::get<8>(p);                     // datatype
+        return name;
+    }
+};
+
+// Black box testing: every combination of the values listed below is instantiated.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ztrsvTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // uploa
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga , n=NONUNIT_DIAG u=UNIT_DIAG
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(dcomplex{1.0, 0.0}
+#ifdef TEST_BLIS_TYPED
+            ,dcomplex{1.0, -2.0}
+#endif
+            ),                                                               // alpha (the non-unit value only when TEST_BLIS_TYPED is defined)
+            ::testing::Values(gtint_t(1)),                                   // stride size for x
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of a
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the random data: i = integer, f = float
+        ),
+        ::ztrsvTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp
new file mode 100644
index 0000000000..fa6b10006a
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemm/cgemm_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemm.h"
+
+class CGemmTest :                                                           // fixture for the value-parameterized scomplex gemm tests
+        public ::testing::TestWithParam<std::tuple<char,                   // storage
+                                                   char,                   // transa
+                                                   char,                   // transb
+                                                   gtint_t,                // m
+                                                   gtint_t,                // n
+                                                   gtint_t,                // k
+                                                   scomplex,               // alpha
+                                                   scomplex,               // beta
+                                                   gtint_t,                // lda_inc
+                                                   gtint_t,                // ldb_inc
+                                                   gtint_t,                // ldc_inc
+                                                   char>> {};              // datatype
+
+TEST_P(CGemmTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through INSTANTIATE_TEST_SUITE_P.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage scheme of the matrices: row major or column major
+    const char storage = std::get<0>(params);
+    // operation applied to A: n, c, t or h
+    const char transa = std::get<1>(params);
+    // operation applied to B: n, c, t or h
+    const char transb = std::get<2>(params);
+    // matrix size m
+    const gtint_t m = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // matrix size k
+    const gtint_t k = std::get<5>(params);
+    // scalar alpha
+    const T alpha = std::get<6>(params);
+    // scalar beta
+    const T beta = std::get<7>(params);
+    // Increments applied to lda, ldb and ldc: zero means the array size matches
+    // the matrix size, a positive value means the array is bigger than the matrix.
+    const gtint_t lda_inc = std::get<8>(params);
+    const gtint_t ldb_inc = std::get<9>(params);
+    const gtint_t ldc_inc = std::get<10>(params);
+    // datatype requested from the random generators
+    const char datatype = std::get<11>(params);
+
+    // Error threshold, scaled with m and n.
+    // NOTE(review): the dgemm test in this suite also scales by k; confirm which is intended.
+    const double thresh = 10*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemm<T>(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Name generator for CGemmTest: encodes the interface under test and every
+// parameter of the instance into the test-case name.
+class CGemmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,gtint_t,gtint_t,gtint_t,scomplex,scomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        const auto& p = str.param;
+        const char     sfm   = std::get<0>(p);
+        const char     tsa   = std::get<1>(p);
+        const char     tsb   = std::get<2>(p);
+        const scomplex alpha = std::get<6>(p);
+        const scomplex beta  = std::get<7>(p);
+        // A value that is not positive is written as 'm' followed by its
+        // magnitude, so zero comes out as "m0".
+        auto part = [](auto v) {
+            return ( v > 0) ? std::to_string(int(v)) : ("m" + std::to_string(int(std::abs(v))));
+        };
+
+#ifdef TEST_BLAS
+        std::string name = "cgemm_";
+#elif TEST_CBLAS
+        std::string name = "cblas_cgemm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "bli_cgemm";
+#endif
+        name += std::string("_") + sfm + sfm + sfm;
+        name += std::string("_") + tsa + tsb;
+        // Matrix sizes m, n and k.
+        name += "_" + std::to_string(std::get<3>(p));
+        name += "_" + std::to_string(std::get<4>(p));
+        name += "_" + std::to_string(std::get<5>(p));
+        name += "_a" + part(alpha.real) + "pi" + part(alpha.imag);
+        name += "_b" + part(beta.real) + "pi" + part(beta.imag);
+        // Leading-dimension increments of a, b and c.
+        name += "_" + std::to_string(std::get<8>(p));
+        name += "_" + std::to_string(std::get<9>(p));
+        name += "_" + std::to_string(std::get<10>(p));
+        name += std::string("_") + std::get<11>(p);                    // datatype
+        return name;
+    }
+};
+
+// Black box testing: every combination of the values listed below is instantiated.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        CGemmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','c','t'),                                  // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k: 10, 20, 30
+            ::testing::Values(scomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(scomplex{1.0,2.0}),                            // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the random data: i = integer, f = float
+        ),
+        ::CGemmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp
new file mode 100644
index 0000000000..5a7bcbd910
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemm/dgemm_generic.cpp
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemm.h"
+
+class DGemmTest :                                                           // fixture for the value-parameterized double gemm tests
+        public ::testing::TestWithParam<std::tuple<char,                   // storage
+                                                   char,                   // transa
+                                                   char,                   // transb
+                                                   gtint_t,                // m
+                                                   gtint_t,                // n
+                                                   gtint_t,                // k
+                                                   double,                 // alpha
+                                                   double,                 // beta
+                                                   gtint_t,                // lda_inc
+                                                   gtint_t,                // ldb_inc
+                                                   gtint_t,                // ldc_inc
+                                                   char>> {};              // datatype
+
+TEST_P(DGemmTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied through INSTANTIATE_TEST_SUITE_P.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage scheme of the matrices: row major or column major
+    const char storage = std::get<0>(params);
+    // operation applied to A: n, c, t or h
+    const char transa = std::get<1>(params);
+    // operation applied to B: n, c, t or h
+    const char transb = std::get<2>(params);
+    // matrix size m
+    const gtint_t m = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // matrix size k
+    const gtint_t k = std::get<5>(params);
+    // scalar alpha
+    const T alpha = std::get<6>(params);
+    // scalar beta
+    const T beta = std::get<7>(params);
+    // Increments applied to lda, ldb and ldc:
+    // zero means the array size matches the matrix size,
+    // a positive value means the array is bigger than the matrix.
+    const gtint_t lda_inc = std::get<8>(params);
+    const gtint_t ldb_inc = std::get<9>(params);
+    const gtint_t ldc_inc = std::get<10>(params);
+    // datatype requested from the random generators
+    const char datatype = std::get<11>(params);
+
+    // Error threshold, scaled with m, n and k.
+    const double thresh = 10*m*n*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemm<T>(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Name generator for DGemmTest: encodes the interface under test and every
+// parameter of the instance into the test-case name.
+class DGemmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, gtint_t, double, double, gtint_t, gtint_t, gtint_t, char>> str) const {
+        const auto& p = str.param;
+        const char   sfm   = std::get<0>(p);
+        const double alpha = std::get<6>(p);
+        const double beta  = std::get<7>(p);
+        // A value that is not positive is written as 'm' followed by its
+        // magnitude, so zero comes out as "m0".
+        auto part = [](auto v) {
+            return ( v > 0) ? std::to_string(int(v)) : "m" + std::to_string(int(std::abs(v)));
+        };
+
+#ifdef TEST_BLAS
+        std::string name = "dgemm_";
+#elif TEST_CBLAS
+        std::string name = "cblas_dgemm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_dgemm";
+#endif
+        name += std::string("_") + sfm + sfm + sfm;
+        name += std::string("_") + std::get<1>(p) + std::get<2>(p);    // transa, transb
+        // Matrix sizes m, n and k.
+        name += "_" + std::to_string(std::get<3>(p));
+        name += "_" + std::to_string(std::get<4>(p));
+        name += "_" + std::to_string(std::get<5>(p));
+        name += "_a" + part(alpha);
+        name += "_b" + part(beta);
+        // Leading-dimension increments of a, b and c.
+        name += "_" + std::to_string(std::get<8>(p));
+        name += "_" + std::to_string(std::get<9>(p));
+        name += "_" + std::to_string(std::get<10>(p));
+        name += std::string("_") + std::get<11>(p);                    // datatype
+        return name;
+    }
+};
+
+// Black box testing: every combination of the values listed below is instantiated.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        DGemmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','t'),                                      // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k: 10, 20, 30
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(7)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the random data: i = integer, f = float
+        ),
+        ::DGemmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/gemm/gemm.h b/gtestsuite/testsuite/level3/gemm/gemm.h
new file mode 100644
index 0000000000..907f078848
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemm/gemm.h
@@ -0,0 +1,167 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        C := alpha*op( A )*op( B ) + beta*C,
+ * where  op( A ) is one of
+ *        op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H,
+ * and op( B ) is defined in the same way.
+ *
+ * The wrappers in this header take one leading dimension per matrix. The
+ * BLIS typed wrapper converts each of them into a row increment and a column
+ * increment according to the storage format before calling the library.
+ *
+ * @param[in]     transa specifies the form of op( A ) to be used in
+ *                       the matrix multiplication
+ * @param[in]     transb specifies the form of op( B ) to be used in
+ *                       the matrix multiplication
+ * @param[in]     m      specifies the number of rows of the matrix
+ *                       op( A ) and of the matrix C
+ * @param[in]     n      specifies the number of columns of the matrix
+ *                       op( B ) and the number of columns of the matrix C
+ * @param[in]     k      specifies the number of columns of the matrix
+ *                       op( A ) and the number of rows of the matrix op( B ).
+ * @param[in]     alpha  specifies pointer to the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of A
+ * @param[in]     lda    specifies the leading dimension of A.
+ * @param[in]     bp     specifies pointer which points to the first element of B
+ * @param[in]     ldb    specifies the leading dimension of B.
+ * @param[in]     beta   specifies pointer to the scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of C
+ * @param[in]     ldc    specifies the leading dimension of C.
+ */
+
+/*
+ * Fortran-style BLAS wrapper: dispatches to sgemm_, dgemm_, cgemm_ or zgemm_
+ * according to T. Characters, dimensions and leading dimensions are handed
+ * over by address; alpha, beta and the matrix pointers are forwarded as is.
+ * Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+ */
+template<typename T>
+static void gemm_(char transa, char transb, gtint_t m, gtint_t n, gtint_t k, T* alpha,
+                    T* ap, gtint_t lda,  T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{
+    if constexpr (std::is_same_v<T, float>) {
+        sgemm_( &transa, &transb, &m, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    }
+    else if constexpr (std::is_same_v<T, double>) {
+        dgemm_( &transa, &transb, &m, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    }
+    else if constexpr (std::is_same_v<T, scomplex>) {
+        cgemm_( &transa, &transb, &m, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    }
+    else if constexpr (std::is_same_v<T, dcomplex>) {
+        zgemm_( &transa, &transb, &m, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    }
+    else {
+        throw std::runtime_error("Error in testsuite/level3/gemm.h: Invalid typename in gemm_().");
+    }
+}
+
+/*
+ * CBLAS wrapper: translates the storage and transpose characters into the
+ * CBLAS enumerations and dispatches to cblas_?gemm according to T.
+ * The real variants receive alpha and beta by value, the complex variants
+ * receive them as pointers.
+ * Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+ */
+template<typename T>
+static void cblas_gemm(char storage, char transa, char transb,
+    gtint_t m, gtint_t n, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    // Character -> CBLAS enum conversion for the layout and both transposes.
+    CBLAS_ORDER cblas_order;
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );
+
+    CBLAS_TRANSPOSE cblas_transa;
+    testinghelpers::char_to_cblas_trans( transa, &cblas_transa );
+
+    CBLAS_TRANSPOSE cblas_transb;
+    testinghelpers::char_to_cblas_trans( transb, &cblas_transb );
+
+    if constexpr (std::is_same_v<T, float>) {
+        cblas_sgemm( cblas_order, cblas_transa, cblas_transb, m, n, k, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    }
+    else if constexpr (std::is_same_v<T, double>) {
+        cblas_dgemm( cblas_order, cblas_transa, cblas_transb, m, n, k, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    }
+    else if constexpr (std::is_same_v<T, scomplex>) {
+        cblas_cgemm( cblas_order, cblas_transa, cblas_transb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    }
+    else if constexpr (std::is_same_v<T, dcomplex>) {
+        cblas_zgemm( cblas_order, cblas_transa, cblas_transb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    }
+    else {
+        throw std::runtime_error("Error in testsuite/level3/gemm.h: Invalid typename in cblas_gemm().");
+    }
+}
+
+/*
+ * BLIS typed-API wrapper: converts the transpose characters into trans_t,
+ * turns each leading dimension into a (row increment, column increment) pair
+ * according to the storage format and dispatches to bli_?gemm according to T.
+ *
+ * storage 'c'/'C': row increment 1, column increment = leading dimension.
+ * storage 'r'/'R': row increment = leading dimension, column increment 1.
+ *
+ * Throws std::runtime_error when the storage character is not one of the
+ * above or when T is not float, double, scomplex or dcomplex.
+ */
+template<typename T>
+static void typed_gemm(char storage, char trnsa, char trnsb,
+    gtint_t m, gtint_t n, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    trans_t transa, transb;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+
+    // Row and column increments of a, b and c; the non-unit one is set below.
+    inc_t rsa = 1, csa = 1;
+    inc_t rsb = 1, csb = 1;
+    inc_t rsc = 1, csc = 1;
+
+    /* a = m x k       b = k x n       c = m x n    */
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csb = ldb ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsb = ldb ;
+        rsc = ldc ;
+    }
+    else {
+        // Previously an unknown storage character left every increment at 1 and
+        // the library was still called; report the bad argument instead.
+        throw std::runtime_error("Error in testsuite/level3/gemm.h: Invalid storage format in typed_gemm().");
+    }
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_sgemm( transa, transb, m, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dgemm( transa, transb, m, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cgemm( transa, transb, m, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zgemm( transa, transb, m, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/gemm.h: Invalid typename in typed_gemm().");
+}
+
+/*
+ * Entry point used by the tests: forwards to the interface selected at
+ * compile time.
+ *   TEST_BLAS       -> gemm_       (column-major only; row-major throws)
+ *   TEST_CBLAS      -> cblas_gemm
+ *   TEST_BLIS_TYPED -> typed_gemm
+ * When none of the macros is defined, std::runtime_error is thrown.
+ *
+ * The interface is chosen by whether a macro is defined, for all three
+ * macros alike, so that a macro defined without a value is handled too.
+ */
+template<typename T>
+static void gemm( char storage, char transa, char transb, gtint_t m, gtint_t n, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{
+#if defined(TEST_BLAS)
+    if( storage == 'c' || storage == 'C' )
+        gemm_<T>( transa, transb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/gemm.h: BLAS interface cannot be tested for row-major order.");
+
+#elif defined(TEST_CBLAS)
+    cblas_gemm<T>( storage, transa, transb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#elif defined(TEST_BLIS_TYPED)
+    typed_gemm<T>( storage, transa, transb, m, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/gemm.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp
new file mode 100644
index 0000000000..f1f7bec8cf
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemm/sgemm_generic.cpp
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemm.h"
+
+// Parameterized fixture for the single-precision gemm tests.
+// The tuple elements are read by position in the test body and name printer.
+class SGemmTest :
+        public ::testing::TestWithParam<std::tuple<char, char, char,             // storage, transa, transb
+                                                   gtint_t, gtint_t, gtint_t,    // m, n, k
+                                                   float, float,                 // alpha, beta
+                                                   gtint_t, gtint_t, gtint_t,    // lda_inc, ldb_inc, ldc_inc
+                                                   char>> {};                    // datatype
+
+TEST_P(SGemmTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by INSTANTIATE_TEST_SUITE_P:
+    //   storage            matrix storage format (row major, column major)
+    //   transa, transb     transpose setting of a and b (n, c, t, h)
+    //   m, n, k            problem sizes
+    //   alpha, beta        scalars
+    //   lda_inc, ldb_inc,  increments added to the leading dimensions; zero
+    //   ldc_inc            means the array size matches the matrix size, a
+    //                      positive value makes the array bigger than that
+    //   datatype           datatype for the random generators
+    //----------------------------------------------------------
+    const auto& [storage, transa, transb,
+                 m, n, k,
+                 alpha, beta,
+                 lda_inc, ldb_inc, ldc_inc,
+                 datatype] = GetParam();
+
+    // Error threshold used for the comparison against the reference.
+    double thresh = 10*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemm<T>(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Builds the test-case name for one SGemmTest parameter tuple:
+//   <api>_<storage x3>_<transa><transb>_<m>_<n>_<k>_a<alpha>_b<beta>_<lda_inc>_<ldb_inc>_<ldc_inc>_<datatype>
+// where <api> depends on the interface macro that is defined.
+class SGemmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, gtint_t, float, float, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char tsa        = std::get<1>(str.param);
+        char tsb        = std::get<2>(str.param);
+        gtint_t m       = std::get<3>(str.param);
+        gtint_t n       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        float alpha     = std::get<6>(str.param);
+        float beta      = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+
+        // Scalars are printed as their integer part. Negative values get an "m"
+        // (minus) prefix; zero counts as non-negative and is printed as "0".
+        auto scalar_tag = [](float v) {
+            return ( v >= 0 ) ? std::to_string(int(v)) : "m" + std::to_string(int(std::abs(v)));
+        };
+
+        // The interface is selected by whether the macro is defined.
+#if defined(TEST_BLAS)
+        std::string str_name = "sgemm_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_sgemm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_sgemm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        str_name = str_name + "_" + std::to_string(k);
+        str_name = str_name + "_a" + scalar_tag(alpha);
+        str_name = str_name + "_b" + scalar_tag(beta);
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Storage formats covered by the black-box suite: column-major always, plus
+// row-major unless TEST_BLAS is defined. The list is built here, outside of
+// INSTANTIATE_TEST_SUITE_P, because a preprocessor directive inside the
+// argument list of a function-like macro has undefined behaviour.
+static const char sgemm_storage_formats[] = { 'c'
+#ifndef TEST_BLAS
+                                            , 'r'
+#endif
+                                            };
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        SGemmTest,
+        ::testing::Combine(
+            ::testing::ValuesIn(sgemm_storage_formats),                      // storage format
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','t'),                                      // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m (10, 20, 30)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n (10, 20, 30)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k (10, 20, 30)
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(7)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated values (i : integer, f : float)
+        ),
+        ::SGemmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/gemm/test_gemm.h b/gtestsuite/testsuite/level3/gemm/test_gemm.h
new file mode 100644
index 0000000000..3396ba2ce6
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemm/test_gemm.h
@@ -0,0 +1,80 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "gemm.h"
+#include "level3/ref_gemm.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+
+/*
+ * Test body shared by the gemm tests: generates random a, b and c, runs the
+ * interface under test and the reference implementation on identical inputs
+ * and compares the two results for c component-wise against thresh.
+ */
+template<typename T>
+void test_gemm( char storage, char trnsa, char trnsb, gtint_t m, gtint_t n,
+    gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc,
+    T alpha, T beta, double thresh, char datatype ) {
+
+    // Leading dimensions of a (m x k), b (k x n) and c (m x n).
+    const gtint_t ld_a = testinghelpers::get_leading_dimension(storage, trnsa, m, k, lda_inc);
+    const gtint_t ld_b = testinghelpers::get_leading_dimension(storage, trnsb, k, n, ldb_inc);
+    const gtint_t ld_c = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc);
+
+    // Random input matrices, generated in the order a, b, c.
+    std::vector<T> mat_a = testinghelpers::get_random_matrix<T>(-2, 8, storage, trnsa, m, k, ld_a, datatype);
+    std::vector<T> mat_b = testinghelpers::get_random_matrix<T>(-5, 2, storage, trnsb, k, n, ld_b, datatype);
+    std::vector<T> mat_c = testinghelpers::get_random_matrix<T>(-3, 5, storage, 'n', m, n, ld_c, datatype);
+
+    // The reference run works on its own copy of c, taken before c is updated.
+    std::vector<T> mat_c_ref = mat_c;
+
+    // Interface under test.
+    gemm<T>( storage, trnsa, trnsb, m, n, k, &alpha, mat_a.data(), ld_a,
+             mat_b.data(), ld_b, &beta, mat_c.data(), ld_c );
+
+    // Reference implementation.
+    testinghelpers::ref_gemm( storage, trnsa, trnsb, m, n, k, alpha,
+                              mat_a.data(), ld_a, mat_b.data(), ld_b, beta, mat_c_ref.data(), ld_c );
+
+    // Component-wise comparison of the two results.
+    computediff<T>( storage, m, n, mat_c.data(), mat_c_ref.data(), ld_c, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp
new file mode 100644
index 0000000000..0f4bb4783d
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemm/zgemm_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemm.h"
+
+// Parameterized fixture for the double-precision complex gemm tests.
+// The tuple elements are read by position in the test body and name printer.
+class ZGemmTest :
+        public ::testing::TestWithParam<std::tuple<char, char, char,             // storage, transa, transb
+                                                   gtint_t, gtint_t, gtint_t,    // m, n, k
+                                                   dcomplex, dcomplex,           // alpha, beta
+                                                   gtint_t, gtint_t, gtint_t,    // lda_inc, ldb_inc, ldc_inc
+                                                   char>> {};                    // datatype
+
+TEST_P(ZGemmTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by INSTANTIATE_TEST_SUITE_P:
+    //   storage            matrix storage format (row major, column major)
+    //   transa, transb     transpose setting of a and b (n, c, t, h)
+    //   m, n, k            problem sizes
+    //   alpha, beta        scalars
+    //   lda_inc, ldb_inc,  increments added to the leading dimensions; zero
+    //   ldc_inc            means the array size matches the matrix size, a
+    //                      positive value makes the array bigger than that
+    //   datatype           datatype for the random generators
+    //----------------------------------------------------------
+    const auto& [storage, transa, transb,
+                 m, n, k,
+                 alpha, beta,
+                 lda_inc, ldb_inc, ldc_inc,
+                 datatype] = GetParam();
+
+    // Error threshold used for the comparison against the reference.
+    double thresh = 10*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemm<T>(storage, transa, transb, m, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Builds the test-case name for one ZGemmTest parameter tuple:
+//   <api>_<storage x3>_<transa><transb>_<m>_<n>_<k>_a<alpha>_b<beta>_<lda_inc>_<ldb_inc>_<ldc_inc>_<datatype>
+// where <api> depends on the interface macro that is defined and a complex
+// scalar is printed as <real>pi<imag>.
+class ZGemmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, gtint_t, dcomplex, dcomplex, gtint_t, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char tsa        = std::get<1>(str.param);
+        char tsb        = std::get<2>(str.param);
+        gtint_t m       = std::get<3>(str.param);
+        gtint_t n       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        dcomplex alpha  = std::get<6>(str.param);
+        dcomplex beta   = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+
+        // Each component is printed as its integer part. Negative values get an
+        // "m" (minus) prefix; zero counts as non-negative and is printed as "0".
+        auto part_tag = [](double v) {
+            return ( v >= 0 ) ? std::to_string(int(v)) : ("m" + std::to_string(int(std::abs(v))));
+        };
+
+        // The interface is selected by whether the macro is defined.
+#if defined(TEST_BLAS)
+        std::string str_name = "zgemm_";
+#elif defined(TEST_CBLAS)
+        std::string str_name = "cblas_zgemm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zgemm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = part_tag(alpha.real) + "pi" + part_tag(alpha.imag);
+        std::string beta_str  = part_tag(beta.real)  + "pi" + part_tag(beta.imag);
+        str_name = str_name + "_a" + alpha_str;
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Storage formats covered by the black-box suite: column-major always, plus
+// row-major unless TEST_BLAS is defined. The list is built here, outside of
+// INSTANTIATE_TEST_SUITE_P, because a preprocessor directive inside the
+// argument list of a function-like macro has undefined behaviour.
+static const char zgemm_storage_formats[] = { 'c'
+#ifndef TEST_BLAS
+                                            , 'r'
+#endif
+                                            };
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ZGemmTest,
+        ::testing::Combine(
+            ::testing::ValuesIn(zgemm_storage_formats),                      // storage format
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','c','t'),                                  // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m (10, 20, 30)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n (10, 20, 30)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k (10, 20, 30)
+            ::testing::Values(dcomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(dcomplex{1.0,2.0}),                            // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated values (i : integer, f : float)
+        ),
+        ::ZGemmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp
new file mode 100644
index 0000000000..f15fc50619
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemmt/cgemmt_generic.cpp
@@ -0,0 +1,161 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemmt.h"
+
+// Parameter tuple consumed by the cgemmt tests, in this order:
+//   storage, uplo, transa, transb, n, k, alpha, beta,
+//   lda_inc, ldb_inc, ldc_inc, datatype.
+using cgemmtTestParams = std::tuple<char, char, char, char,    // storage, uplo, transa, transb
+                                    gtint_t, gtint_t,          // n, k
+                                    scomplex, scomplex,        // alpha, beta
+                                    gtint_t, gtint_t, gtint_t, // lda_inc, ldb_inc, ldc_inc
+                                    char>;                     // datatype
+
+// Value-parameterized fixture; it adds no state of its own.
+class cgemmtTest : public ::testing::TestWithParam<cgemmtTestParams> {};
+
+// Keep googletest from reporting an error when the suite has no
+// instantiation (the one in this file is compiled out for TEST_BLIS_TYPED).
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(cgemmtTest);
+
+TEST_P(cgemmtTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage scheme of the matrices (row major or column major)
+    const char storage = std::get<0>(params);
+    // which triangular part of C is used (upper or lower)
+    const char uplo    = std::get<1>(params);
+    // transpose option applied to matrix a
+    const char transa  = std::get<2>(params);
+    // transpose option applied to matrix b
+    const char transb  = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // matrix size k
+    const gtint_t k = std::get<5>(params);
+    // scalar alpha
+    T alpha = std::get<6>(params);
+    // scalar beta
+    T beta  = std::get<7>(params);
+    // Leading-dimension increments for A, B and C: zero means the array
+    // size matches the matrix size, a positive value means it is bigger.
+    const gtint_t lda_inc = std::get<8>(params);
+    const gtint_t ldb_inc = std::get<9>(params);
+    const gtint_t ldc_inc = std::get<10>(params);
+    // datatype selector handed on for the random generators
+    char datatype = std::get<11>(params);
+
+    // Error threshold, scaled by n, k and the epsilon of T.
+    double thresh = 10*n*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemmt<T>(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Name generator passed to INSTANTIATE_TEST_SUITE_P: turns one cgemmt
+// parameter tuple into the test-case name reported by googletest.
+// Layout: <api>_<sss>_<uplo>_<transa><transb>_<n>_<k>_a<alpha>_b<beta>_<lda_inc>_<ldb_inc>_<ldc_inc>_<datatype>
+class cgemmtTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,gtint_t,scomplex,scomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        const auto& p = str.param;
+        // Integer part of a scalar component; values that are not positive get an "m" prefix.
+        auto tag = [](auto v) {
+            return (v > 0) ? std::to_string(int(v)) : ("m" + std::to_string(int(std::abs(v))));
+        };
+        // The prefix names the interface under test.
+#ifdef TEST_BLAS
+        std::string name = "cgemmt_";
+#elif TEST_CBLAS
+        std::string name = "cblas_cgemmt";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "bli_cgemmt";
+#endif
+        // storage (repeated three times), uplo, transa followed by transb
+        name += "_" + std::string(3, std::get<0>(p));
+        name += std::string("_") + std::get<1>(p);
+        name += std::string("_") + std::get<2>(p) + std::get<3>(p);
+        // n and k
+        name += "_" + std::to_string(std::get<4>(p));
+        name += "_" + std::to_string(std::get<5>(p));
+        // alpha and beta, each written as <real>pi<imag>
+        const scomplex alpha = std::get<6>(p);
+        const scomplex beta  = std::get<7>(p);
+        name += "_a" + tag(alpha.real) + "pi" + tag(alpha.imag);
+        name += "_b" + tag(beta.real) + "pi" + tag(beta.imag);
+        // leading-dimension increments of a, b and c
+        name += "_" + std::to_string(std::get<8>(p));
+        name += "_" + std::to_string(std::get<9>(p));
+        name += "_" + std::to_string(std::get<10>(p));
+        // datatype selector
+        name += std::string("_") + std::get<11>(p);
+        return name;
+    }
+};
+// The BLIS-typed case is excluded here due to compiler errors.
+#ifndef TEST_BLIS_TYPED
+// Black-box instantiation: every combination of the parameter sets below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cgemmtTest,
+        ::testing::Combine(
+#ifdef TEST_BLAS
+            ::testing::Values('c'),                                          // storage: column major only when TEST_BLAS is defined
+#else
+            ::testing::Values('c','r'),                                      // storage: column major and row major
+#endif
+            ::testing::Values('u','l'),                                      // uplo: u = upper, l = lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','c','t'),                                  // transb
+            ::testing::Values(gtint_t(10), gtint_t(20), gtint_t(30)),        // n
+            ::testing::Values(gtint_t(10), gtint_t(20), gtint_t(30)),        // k
+            ::testing::Values(scomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(scomplex{1.0,2.0}),                            // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment added to the leading dimension of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment added to the leading dimension of b
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment added to the leading dimension of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements (i: integer, f: float)
+        ),
+        ::cgemmtTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp
new file mode 100644
index 0000000000..b27b6c66b9
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemmt/dgemmt_generic.cpp
@@ -0,0 +1,159 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemmt.h"
+
+// Parameter tuple consumed by the dgemmt tests, in this order:
+//   storage, uplo, transa, transb, n, k, alpha, beta,
+//   lda_inc, ldb_inc, ldc_inc, datatype.
+using dgemmtTestParams = std::tuple<char, char, char, char,    // storage, uplo, transa, transb
+                                    gtint_t, gtint_t,          // n, k
+                                    double, double,            // alpha, beta
+                                    gtint_t, gtint_t, gtint_t, // lda_inc, ldb_inc, ldc_inc
+                                    char>;                     // datatype
+
+// Value-parameterized fixture; it adds no state of its own.
+class dgemmtTest : public ::testing::TestWithParam<dgemmtTestParams> {};
+
+// Keep googletest from reporting an error when the suite has no
+// instantiation (the one in this file is compiled out for TEST_BLIS_TYPED).
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dgemmtTest);
+
+TEST_P(dgemmtTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage scheme of the matrices (row major or column major)
+    const char storage = std::get<0>(params);
+    // which triangular part of C is used (upper or lower)
+    const char uplo    = std::get<1>(params);
+    // transpose option applied to matrix a
+    const char transa  = std::get<2>(params);
+    // transpose option applied to matrix b
+    const char transb  = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // matrix size k
+    const gtint_t k = std::get<5>(params);
+    // scalar alpha
+    T alpha = std::get<6>(params);
+    // scalar beta
+    T beta  = std::get<7>(params);
+    // Leading-dimension increments for A, B and C: zero means the array
+    // size matches the matrix size, a positive value means it is bigger.
+    const gtint_t lda_inc = std::get<8>(params);
+    const gtint_t ldb_inc = std::get<9>(params);
+    const gtint_t ldc_inc = std::get<10>(params);
+    // datatype selector handed on for the random generators
+    char datatype = std::get<11>(params);
+
+    // Error threshold, scaled by n, k and the epsilon of T.
+    double thresh = 10*n*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemmt<T>(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Name generator for INSTANTIATE_TEST_SUITE_P: builds the googletest test-case name from one dgemmt parameter tuple.
+class dgemmtTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,gtint_t,double,double,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Tuple order matches the fixture and TEST_P: storage, uplo, transa, transb, n, k, ...
+        char sfm        = std::get<0>(str.param);
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);
+        char tsb        = std::get<3>(str.param);
+        gtint_t n       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        double alpha    = std::get<6>(str.param);
+        double beta     = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dgemmt_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dgemmt";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_dgemmt";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(n) + "_" + std::to_string(k);
+        // Scalars that are not positive are written as "m" followed by the integer part of their magnitude.
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc) + "_" + std::to_string(ldb_inc) + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+// The BLIS-typed case is excluded here due to compiler errors.
+#ifndef TEST_BLIS_TYPED
+// Black-box instantiation: every combination of the parameter sets below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dgemmtTest,
+        ::testing::Combine(
+#ifdef TEST_BLAS
+            ::testing::Values('c'),                                          // storage: column major only when TEST_BLAS is defined
+#else
+            ::testing::Values('c','r'),                                      // storage: column major and row major
+#endif
+            ::testing::Values('u','l'),                                      // uplo: u = upper, l = lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','c','t'),                                  // transb
+            ::testing::Values(gtint_t(10), gtint_t(20), gtint_t(30)),        // n
+            ::testing::Values(gtint_t(10), gtint_t(20), gtint_t(30)),        // k
+            ::testing::Values(2.0),                                          // alpha
+            ::testing::Values(3.0),                                          // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment added to the leading dimension of a
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment added to the leading dimension of b
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment added to the leading dimension of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements (i: integer, f: float)
+        ),
+        ::dgemmtTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/gemmt/gemmt.h b/gtestsuite/testsuite/level3/gemmt/gemmt.h
new file mode 100644
index 0000000000..217cd5bcd0
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemmt/gemmt.h
@@ -0,0 +1,175 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        C := alpha*op( A )*op( B ) + beta*C,
+ * where  op( X ) is one of
+ *        op( X ) = X   or   op( X ) = X**T   or   op( X ) = X**H.
+ * Only accesses and updates the upper or the lower triangular part of C.
+ * @param[in]     storage specifies storage format used for the matrices
+                         (not taken by the BLAS wrapper gemmt_()).
+ * @param[in]     uplo   specifies whether the upper or the lower triangular
+                         part of C is accessed and updated
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     transb specifies the form of op( B ) to be used in
+                         the matrix multiplication
+ * @param[in]     n      specifies the number of rows of the matrix op( A ),
+                         the number of columns of the matrix op( B ) and
+                         the order of the matrix C
+ * @param[in]     k      specifies  the number of columns of the matrix
+                         op( A ) and the number of rows of the matrix op( B ).
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies the leading dimension of ap; typed_gemmt() turns
+                         it into a row or column increment according to storage.
+ * @param[in]     bp     specifies pointer which points to the first element of bp
+ * @param[in]     ldb    specifies the leading dimension of bp.
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of cp
+ * @param[in]     ldc    specifies the leading dimension of cp.
+ */
+
+// BLAS interface: calls the ?gemmt_ routine that matches T, passing options and dimensions by address.
+template<typename T>
+static void gemmt_(char uplo, char transa, char transb, gtint_t n, gtint_t k, T* alpha,
+                    T* ap, gtint_t lda,  T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{
+    if constexpr (std::is_same_v<T, float>)
+        sgemmt_( &uplo, &transa, &transb, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, double>)
+        dgemmt_( &uplo, &transa, &transb, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cgemmt_( &uplo, &transa, &transb, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        zgemmt_( &uplo, &transa, &transb, &n, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else throw std::runtime_error("Error in testsuite/level3/gemmt.h: Invalid typename in gemmt_().");
+}
+
+// CBLAS interface: translates the option characters to CBLAS enums and calls the cblas_?gemmt routine that matches T.
+template<typename T>
+static void cblas_gemmt(char storage, char uplo, char transa, char transb,
+    gtint_t n, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    CBLAS_ORDER order;
+    testinghelpers::char_to_cblas_order( storage, &order );
+    CBLAS_TRANSPOSE op_a;
+    testinghelpers::char_to_cblas_trans( transa, &op_a );
+    CBLAS_TRANSPOSE op_b;
+    testinghelpers::char_to_cblas_trans( transb, &op_b );
+    CBLAS_UPLO triangle;
+    testinghelpers::char_to_cblas_uplo( uplo, &triangle );
+
+    // The real variants receive alpha and beta by value, the complex variants by pointer.
+    if constexpr (std::is_same_v<T, float>)
+        cblas_sgemmt( order, triangle, op_a, op_b, n, k, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, double>)
+        cblas_dgemmt( order, triangle, op_a, op_b, n, k, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cblas_cgemmt( order, triangle, op_a, op_b, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        cblas_zgemmt( order, triangle, op_a, op_b, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else throw std::runtime_error("Error in testsuite/level3/gemmt.h: Invalid typename in cblas_gemmt().");
+}
+
+#ifdef TEST_BLIS_TYPED
+// BLIS typed interface: maps the option characters to BLIS constants,
+// converts each leading dimension into a row/column stride pair and
+// calls the bli_?gemmt routine that matches T.
+template<typename T>
+static void typed_gemmt(char storage, char uplo, char trnsa, char trnsb,
+    gtint_t n, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    // Map parameter characters to BLIS constants.
+    trans_t transa;
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    trans_t transb;
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+    uplo_t blis_uplo;
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+
+    // Column-major storage puts the leading dimension in the column stride,
+    // row-major storage puts it in the row stride; every other stride is 1
+    // (all strides are 1 when storage is neither 'c'/'C' nor 'r'/'R').
+    const bool col_major = (storage == 'c') || (storage == 'C');
+    const bool row_major = (storage == 'r') || (storage == 'R');
+
+    // (row stride, column stride) pairs for a, b and c.
+    dim_t rsa = row_major ? lda : 1;
+    dim_t csa = col_major ? lda : 1;
+    dim_t rsb = row_major ? ldb : 1;
+    dim_t csb = col_major ? ldb : 1;
+    dim_t rsc = row_major ? ldc : 1;
+    dim_t csc = col_major ? ldc : 1;
+
+    // Dispatch on T.
+    if constexpr (std::is_same_v<T, float>)
+        bli_sgemmt( blis_uplo, transa, transb, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same_v<T, double>)
+        bli_dgemmt( blis_uplo, transa, transb, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        bli_cgemmt( blis_uplo, transa, transb, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        bli_zgemmt( blis_uplo, transa, transb, n, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/gemmt.h: Invalid typename in typed_gemmt().");
+}
+#endif
+// Entry point used by the tests: forwards to the interface selected at compile time (TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED) and throws std::runtime_error otherwise.
+template<typename T>
+static void gemmt( char storage, char uplo, char transa, char transb, gtint_t n, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{
+#ifdef TEST_BLAS
+    // gemmt_() takes no storage argument, so only column-major requests are forwarded.
+    if( storage == 'c' || storage == 'C' )
+        gemmt_<T>( uplo, transa, transb, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/gemmt.h: BLAS interface cannot be tested for row-major order.");
+#elif TEST_CBLAS
+    cblas_gemmt<T>( storage, uplo, transa, transb, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#elif TEST_BLIS_TYPED
+    //typed_gemmt<T>( storage, uplo, transa, transb, n, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    throw std::runtime_error("Error in testsuite/level3/gemmt.h: BLIS-typed interface cannot be tested.");
+#else
+    throw std::runtime_error("Error in testsuite/level3/gemmt.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp
new file mode 100644
index 0000000000..c9686e84bb
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemmt/sgemmt_generic.cpp
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemmt.h"
+
+// Parameter tuple consumed by the sgemmt tests, in this order:
+//   storage, uplo, transa, transb, n, k, alpha, beta,
+//   lda_inc, ldb_inc, ldc_inc, datatype.
+using sgemmtTestParams = std::tuple<char, char, char, char,    // storage, uplo, transa, transb
+                                    gtint_t, gtint_t,          // n, k
+                                    float, float,              // alpha, beta
+                                    gtint_t, gtint_t, gtint_t, // lda_inc, ldb_inc, ldc_inc
+                                    char>;                     // datatype
+
+// Value-parameterized fixture; it adds no state of its own.
+class sgemmtTest : public ::testing::TestWithParam<sgemmtTestParams> {};
+
+// Keep googletest from reporting an error when the suite has no
+// instantiation (the one in this file is compiled out for TEST_BLIS_TYPED).
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(sgemmtTest);
+
+TEST_P(sgemmtTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage scheme of the matrices (row major or column major)
+    const char storage = std::get<0>(params);
+    // which triangular part of C is used (upper or lower)
+    const char uplo    = std::get<1>(params);
+    // transpose option applied to matrix a
+    const char transa  = std::get<2>(params);
+    // transpose option applied to matrix b
+    const char transb  = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // matrix size k
+    const gtint_t k = std::get<5>(params);
+    // scalar alpha
+    T alpha = std::get<6>(params);
+    // scalar beta
+    T beta  = std::get<7>(params);
+    // Leading-dimension increments for A, B and C: zero means the array
+    // size matches the matrix size, a positive value means it is bigger.
+    const gtint_t lda_inc = std::get<8>(params);
+    const gtint_t ldb_inc = std::get<9>(params);
+    const gtint_t ldc_inc = std::get<10>(params);
+    // datatype selector handed on for the random generators
+    char datatype = std::get<11>(params);
+
+    // Error threshold, scaled by n, k and the epsilon of T.
+    double thresh = 10*n*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_gemmt<T>(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Name generator for INSTANTIATE_TEST_SUITE_P: builds the googletest test-case name from one sgemmt parameter tuple.
+class sgemmtTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,gtint_t,float,float,gtint_t,gtint_t,gtint_t,char>> str) const {
+        // Tuple order matches the fixture and TEST_P: storage, uplo, transa, transb, n, k, ...
+        char sfm        = std::get<0>(str.param);
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);
+        char tsb        = std::get<3>(str.param);
+        gtint_t n       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        float alpha     = std::get<6>(str.param);
+        float beta      = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "sgemmt_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_sgemmt";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_sgemmt";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(n) + "_" + std::to_string(k);
+        // Scalars that are not positive are written as "m" followed by the integer part of their magnitude.
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc) + "_" + std::to_string(ldb_inc) + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// The BLIS-typed case is excluded here due to compiler errors.
+#ifndef TEST_BLIS_TYPED
+// Black-box instantiation: every combination of the parameter sets below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        sgemmtTest,
+        ::testing::Combine(
+#ifdef TEST_BLAS
+            ::testing::Values('c'),                                          // storage: column major only when TEST_BLAS is defined
+#else
+            ::testing::Values('c','r'),                                      // storage: column major and row major
+#endif
+            ::testing::Values('u','l'),                                      // uplo: u = upper, l = lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','c','t'),                                  // transb
+            ::testing::Values(gtint_t(10), gtint_t(20), gtint_t(30)),        // n
+            ::testing::Values(gtint_t(10), gtint_t(20), gtint_t(30)),        // k
+            ::testing::Values(2.0),                                          // alpha
+            ::testing::Values(3.0),                                          // beta
+            ::testing::Values(gtint_t(0), gtint_t(7)),                       // increment added to the leading dimension of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment added to the leading dimension of b
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment added to the leading dimension of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated elements (i: integer, f: float)
+        ),
+        ::sgemmtTestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/gemmt/test_gemmt.h b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h
new file mode 100644
index 0000000000..9087c9fa81
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemmt/test_gemmt.h
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "gemmt.h"
+#include "level3/ref_gemmt.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_gemmt( char storage, char uplo, char trnsa, char trnsb, gtint_t n,
+    gtint_t k, gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc,
+    T alpha, T beta, double thresh, char datatype ) {
+
+    // Leading dimensions of the three operands, each widened by its increment.
+    gtint_t ld_a = testinghelpers::get_leading_dimension(storage, trnsa, n, k, lda_inc);
+    gtint_t ld_b = testinghelpers::get_leading_dimension(storage, trnsb, k, n, ldb_inc);
+    gtint_t ld_c = testinghelpers::get_leading_dimension(storage, 'n', n, n, ldc_inc);
+
+    //----------------------------------------------------------
+    //         Fill the operands with random numbers
+    //----------------------------------------------------------
+    std::vector<T> mat_a = testinghelpers::get_random_matrix<T>(-2, 8, storage, trnsa, n, k, ld_a, datatype);
+    std::vector<T> mat_b = testinghelpers::get_random_matrix<T>(-5, 2, storage, trnsb, k, n, ld_b, datatype);
+    std::vector<T> mat_c = testinghelpers::get_random_matrix<T>(-3, 5, storage, 'n', n, n, ld_c, datatype);
+
+    // The reference implementation works on its own copy of c.
+    std::vector<T> mat_c_ref = mat_c;
+
+    //----------------------------------------------------------
+    //         Function under test (updates mat_c)
+    //----------------------------------------------------------
+    gemmt<T>( storage, uplo, trnsa, trnsb, n, k, &alpha, mat_a.data(), ld_a,
+                                mat_b.data(), ld_b, &beta, mat_c.data(), ld_c );
+
+    //----------------------------------------------------------
+    //         Reference implementation (updates mat_c_ref)
+    //----------------------------------------------------------
+    testinghelpers::ref_gemmt( storage, uplo, trnsa, trnsb, n, k, alpha,
+               mat_a.data(), ld_a, mat_b.data(), ld_b, beta, mat_c_ref.data(), ld_c );
+
+    //----------------------------------------------------------
+    //         Compare both results component-wise against thresh
+    //----------------------------------------------------------
+    computediff<T>( storage, n, n, mat_c.data(), mat_c_ref.data(), ld_c, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp
new file mode 100644
index 0000000000..d5ddd84276
--- /dev/null
+++ b/gtestsuite/testsuite/level3/gemmt/zgemmt_generic.cpp
@@ -0,0 +1,162 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_gemmt.h"
+
+class zgemmtTest :                                                         // fixture for the parameterized zgemmt tests
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // uplo
+                                                   char,       // transa
+                                                   char,       // transb
+                                                   gtint_t,    // n
+                                                   gtint_t,    // k
+                                                   dcomplex,   // alpha
+                                                   dcomplex,   // beta
+                                                   gtint_t,    // lda increment
+                                                   gtint_t,    // ldb increment
+                                                   gtint_t,    // ldc increment
+                                                   char>> {};  // datatype for the random generators
+// The suite is only instantiated when TEST_BLIS_TYPED is not defined, so it may stay uninstantiated.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(zgemmtTest);
+
+TEST_P(zgemmtTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Fetch the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P) once, then unpack it.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage format of the matrices (row major, column major)
+    const char storage = std::get<0>(params);
+    // whether the upper or lower triangular part of C is used
+    const char uplo = std::get<1>(params);
+    // form of matrix a: n,c,t,h
+    const char transa = std::get<2>(params);
+    // form of matrix b: n,c,t,h
+    const char transb = std::get<3>(params);
+    // matrix size n
+    const gtint_t n = std::get<4>(params);
+    // matrix size k
+    const gtint_t k = std::get<5>(params);
+    // scalar alpha
+    const T alpha = std::get<6>(params);
+    // scalar beta
+    const T beta = std::get<7>(params);
+    // Increments for lda, ldb and ldc. A zero increment makes the array size match
+    // the matrix size; a positive increment makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<8>(params);
+    const gtint_t ldb_inc = std::get<9>(params);
+    const gtint_t ldc_inc = std::get<10>(params);
+    // datatype used by the random generators
+    const char datatype = std::get<11>(params);
+
+    // Error threshold, proportional to the larger of n and k.
+    const double thresh = std::max(n,k)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Hand the parameters to the shared test body
+    //----------------------------------------------------------
+    test_gemmt<T>(storage, uplo, transa, transb, n, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class zgemmtTestPrint {
+public:
+    // Returns the name of one test instance, assembled from its parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char,char,char,char,gtint_t,gtint_t,dcomplex,dcomplex,gtint_t,gtint_t,gtint_t,char>> str) const {
+        const auto& prm = str.param;
+        // Integer part of one scalar component; a non-positive value gets the prefix "m".
+        auto component = [](double val) {
+            return (val > 0) ? std::to_string(int(val)) : ("m" + std::to_string(int(std::abs(val))));
+        };
+        // A complex scalar is written as <real>pi<imag>.
+        auto complex_str = [&component](const dcomplex& z) {
+            return component(z.real) + "pi" + component(z.imag);
+        };
+        // The prefix names the interface selected at compile time.
+#ifdef TEST_BLAS
+        std::string name = "zgemmt_";
+#elif TEST_CBLAS
+        std::string name = "cblas_zgemmt";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "bli_zgemmt";
+#endif
+        // storage format (written three times), then uplo
+        name.append("_").append(3, std::get<0>(prm));
+        name.append("_").append(1, std::get<1>(prm));
+        // transa followed by transb
+        name.append("_").append(1, std::get<2>(prm)).append(1, std::get<3>(prm));
+        // n and k
+        name += "_" + std::to_string(std::get<4>(prm));
+        name += "_" + std::to_string(std::get<5>(prm));
+        // alpha and beta
+        name += "_a" + complex_str(std::get<6>(prm));
+        name += "_b" + complex_str(std::get<7>(prm));
+        // leading dimension increments of a, b and c, then the datatype
+        name += "_" + std::to_string(std::get<8>(prm));
+        name += "_" + std::to_string(std::get<9>(prm));
+        name += "_" + std::to_string(std::get<10>(prm));
+        name.append("_").append(1, std::get<11>(prm));
+        return name;
+    }
+};
+
+// Disable tests for BLIS_TYPED case due to compiler errors.
+#ifndef TEST_BLIS_TYPED
+// Black box testing: one test instance per combination of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zgemmtTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format; 'r' is left out when TEST_BLAS is defined
+            ::testing::Values('u','l'),                                      // uplo u:upper, l:lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','c','t'),                                  // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k: 10, 20, 30
+            ::testing::Values(dcomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(dcomplex{1.0,2.0}),                            // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(9)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::zgemmtTestPrint()                                                  // builds the name of each test instance
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp
new file mode 100644
index 0000000000..4a1221c4b4
--- /dev/null
+++ b/gtestsuite/testsuite/level3/hemm/chemm_generic.cpp
@@ -0,0 +1,162 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_hemm.h"
+
+class chemmTest :                                                          // fixture for the parameterized chemm tests
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uplo
+                                                   char,       // conja
+                                                   char,       // transb
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   scomplex,   // alpha
+                                                   scomplex,   // beta
+                                                   gtint_t,    // lda increment
+                                                   gtint_t,    // ldb increment
+                                                   gtint_t,    // ldc increment
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(chemmTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Fetch the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P) once, then unpack it.
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    // storage format of the matrices (row major, column major)
+    const char storage = std::get<0>(params);
+    // whether the Hermitian matrix A appears left or right in the multiplication
+    const char side = std::get<1>(params);
+    // whether the upper or lower triangular part of A is used
+    const char uplo = std::get<2>(params);
+    // form of matrix a: n,c
+    const char conja = std::get<3>(params);
+    // form of matrix b: n,c,t,h
+    const char transb = std::get<4>(params);
+    // matrix size m
+    const gtint_t m = std::get<5>(params);
+    // matrix size n
+    const gtint_t n = std::get<6>(params);
+    // scalar alpha
+    const T alpha = std::get<7>(params);
+    // scalar beta
+    const T beta = std::get<8>(params);
+    // Increments for lda, ldb and ldc.
+    // A zero increment makes the array size match the matrix size;
+    // a positive increment makes the array bigger than the matrix.
+    const gtint_t lda_inc = std::get<9>(params);
+    const gtint_t ldb_inc = std::get<10>(params);
+    const gtint_t ldc_inc = std::get<11>(params);
+    // datatype used by the random generators
+    const char datatype = std::get<12>(params);
+
+    // Error threshold, proportional to the number of elements of C.
+    const double thresh = 10*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Hand the parameters to the shared test body
+    //----------------------------------------------------------
+    test_hemm<T>(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Generates the name of one chemm test instance from its parameter tuple.
+class chemmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, scomplex, scomplex, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uplo       = std::get<2>(str.param);
+        char conja      = std::get<3>(str.param);
+        char tsb        = std::get<4>(str.param);
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        scomplex alpha  = std::get<7>(str.param);
+        scomplex beta   = std::get<8>(str.param);
+        gtint_t lda_inc = std::get<9>(str.param);
+        gtint_t ldb_inc = std::get<10>(str.param);
+        gtint_t ldc_inc = std::get<11>(str.param);
+        char datatype   = std::get<12>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "chemm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_chemm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_chemm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + side + uplo;
+        str_name = str_name + "_" + conja + tsb;
+        str_name = str_name + "_" + std::to_string(m) + "_" + std::to_string(n);
+        // A scalar is written as <real>pi<imag>; a non-positive part gets the prefix "m".
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        // Both scalars go into the name, so instances that differ only in beta stay distinct.
+        str_name = str_name + "_a" + alpha_str + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc) + "_" + std::to_string(ldb_inc) + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: one test instance per combination of the value lists below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        chemmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format; 'r' is left out when TEST_BLAS is defined
+            ::testing::Values('l','r'),                                      // side l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo u:upper, l:lower
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{2.0, -1.0}),                          // alpha
+            ::testing::Values(scomplex{-3.0, 2.0}),                          // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer, f : float)
+        ),
+        ::chemmTestPrint()                                                   // builds the name of each test instance
+    );
diff --git a/gtestsuite/testsuite/level3/hemm/hemm.h b/gtestsuite/testsuite/level3/hemm/hemm.h
new file mode 100644
index 0000000000..1cc0ca1473
--- /dev/null
+++ b/gtestsuite/testsuite/level3/hemm/hemm.h
@@ -0,0 +1,167 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ * For BLIS-typed API:
+ *        C := alpha*conj( A )*trans( B ) + beta*C, if side is left
+ *     or C := alpha*trans( B )*conj( A ) + beta*C, if side is right
+ * For BLAS/CBLAS API:
+ *        C := alpha*A*B + beta*C, if side is left
+ *     or C := alpha*B*A + beta*C, if side is right
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     side   specifies if the Hermitian matrix A appears left or right in
+                         the matrix multiplication
+ * @param[in]     uplo   specifies if the upper or lower triangular part of A is used
+ * @param[in]     conja specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     transb specifies the form of op( B ) to be used in
+                         the matrix multiplication
+ * @param[in]     m      specifies the number of rows and cols of the  matrix
+                         op( A ) and rows of the matrix C and B
+ * @param[in]     n      specifies the number of columns of the matrix
+                         op( B ) and the number of columns of the matrix C
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     rsa    specifies row increment of ap.
+ * @param[in]     csa    specifies column increment of ap.
+ * @param[in]     bp     specifies pointer which points to the first element of bp
+ * @param[in]     rsb    specifies row increment of bp.
+ * @param[in]     csb    specifies column increment of bp.
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of cp
+ * @param[in]     rsc    specifies row increment of cp.
+ * @param[in]     csc    specifies column increment of cp.
+ */
+
+// BLAS interface wrapper: the options and sizes are handed over by address.
+template<typename T>
+static void hemm_(char side, char uplo, gtint_t m, gtint_t n, T* alpha,
+                    T* ap, gtint_t lda,  T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{
+    if constexpr (std::is_same_v<T, dcomplex>)
+        zhemm_( &side, &uplo, &m, &n, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        chemm_( &side, &uplo, &m, &n, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else throw std::runtime_error("Error in testsuite/level3/hemm.h: Invalid typename in hemm_().");
+}
+
+template<typename T>
+static void cblas_hemm(char storage, char side, char uplo,
+    gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    // Translate the character options into their CBLAS enumerators.
+    enum CBLAS_ORDER order_e;
+    testinghelpers::char_to_cblas_order( storage, &order_e );
+    enum CBLAS_SIDE side_e;
+    testinghelpers::char_to_cblas_side( side, &side_e );
+    enum CBLAS_UPLO uplo_e;
+    testinghelpers::char_to_cblas_uplo( uplo, &uplo_e );
+
+    if constexpr (std::is_same_v<T, dcomplex>)
+        cblas_zhemm( order_e, side_e, uplo_e, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cblas_chemm( order_e, side_e, uplo_e, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/hemm.h: Invalid typename in cblas_hemm().");
+}
+
+// BLIS typed interface wrapper: converts the options and leading dimensions, then calls bli_?hemm.
+template<typename T>
+static void typed_hemm(char storage, char side, char uplo, char conj_a, char trnsb,
+    gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    conj_t conja;
+    trans_t transb;
+    side_t blis_side;
+    uplo_t blis_uplo;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conj_a, &conja );
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+    testinghelpers::char_to_blis_side( side, &blis_side );
+    // The leading dimension is the column stride for column-major storage and the row stride otherwise.
+    inc_t rsa = 1, csa = 1;
+    inc_t rsb = 1, csb = 1;
+    inc_t rsc = 1, csc = 1;
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csb = ldb ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsb = ldb ;
+        rsc = ldc ;
+    }
+    else // unit strides in both directions would not describe a matrix
+        throw std::runtime_error("Error in testsuite/level3/hemm.h: Invalid storage format in typed_hemm().");
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_shemm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dhemm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_chemm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zhemm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/hemm.h: Invalid typename in typed_hemm().");
+}
+
+// Forwards to the interface selected at compile time: TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED.
+template<typename T>
+static void hemm( char storage, char side, char uplo, char conja, char transb, gtint_t m, gtint_t n,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{
+#ifdef TEST_BLAS
+    // hemm_ takes no storage argument, so anything but column-major is rejected first.
+    if( storage != 'c' && storage != 'C' )
+        throw std::runtime_error("Error in testsuite/level3/hemm.h: BLAS interface cannot be tested for row-major order.");
+    hemm_<T>( side, uplo, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#elif TEST_CBLAS
+    cblas_hemm<T>( storage, side, uplo, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#elif TEST_BLIS_TYPED
+    typed_hemm<T>( storage, side, uplo, conja, transb, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/hemm.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/hemm/test_hemm.h b/gtestsuite/testsuite/level3/hemm/test_hemm.h
new file mode 100644
index 0000000000..bae4756f6b
--- /dev/null
+++ b/gtestsuite/testsuite/level3/hemm/test_hemm.h
@@ -0,0 +1,85 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "hemm.h"
+#include "level3/ref_hemm.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+// Runs hemm on random data and checks the result against the reference implementation.
+template<typename T>
+void test_hemm( char storage, char side, char uplo, char conja, char transb,
+    gtint_t m, gtint_t n,
+    gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc,
+    T alpha, T beta,
+    double thresh, char datatype
+) {
+    // Set the dimension for row/col of A, depending on the value of side.
+    gtint_t k = ((side == 'l')||(side == 'L'))? m : n;
+    // Compute the leading dimensions of a, b, and c. The one of b uses the same trans
+    // value as the generation of b below, so that both agree on the shape of b.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, 'n', k, k, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, n, ldb_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrices with random numbers.
+    //----------------------------------------------------------
+    // a holds the Hermitian matrix A; uplo tells the generator which triangle is used (presumably the rest is zeroed - verify).
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-5, 2, storage, uplo, k, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(-5, 2, storage, transb, m, n, ldb, datatype);
+    std::vector<T> c = testinghelpers::get_random_matrix<T>(-3, 5, storage, 'n', m, n, ldc, datatype);
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    hemm<T>( storage, side, uplo, conja, transb, m, n, &alpha, a.data(), lda,
+                                b.data(), ldb, &beta, c.data(), ldc );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_hemm( storage, side, uplo, conja, transb, m, n, alpha,
+               a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc );
+
+    //----------------------------------------------------------
+    //              check component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp
new file mode 100644
index 0000000000..4ebc75ef2c
--- /dev/null
+++ b/gtestsuite/testsuite/level3/hemm/zhemm_generic.cpp
@@ -0,0 +1,162 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_hemm.h"
+
+class zhemmTest :  // Fixture; tuple order: storage, side, uplo, conja, transb, m, n, alpha, beta, lda_inc, ldb_inc, ldc_inc, datatype.
+        public ::testing::TestWithParam<std::tuple<char,
+                                                   char,
+                                                   char,
+                                                   char,
+                                                   char,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   dcomplex,
+                                                   dcomplex,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   char>> {};
+
+TEST_P(zhemmTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P) and run test_hemm.
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the Hermitian matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies if the upper or lower triangular part of A is used
+    char uplo = std::get<2>(GetParam());
+    // denotes whether matrix a is used as is (n) or conjugated (c)
+    char conja = std::get<3>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // specifies beta value
+    T beta = std::get<8>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    gtint_t ldb_inc = std::get<10>(GetParam());
+    gtint_t ldc_inc = std::get<11>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<12>(GetParam());
+
+    // Set the threshold for the errors (10*m*n times the epsilon reported for T):
+    double thresh = 10*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_hemm<T>(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Builds the gtest name for one parameter tuple; non-positive scalar parts are encoded as "m<abs value>".
+class zhemmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, dcomplex, dcomplex, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uplo       = std::get<2>(str.param);
+        char conja      = std::get<3>(str.param);
+        char tsb        = std::get<4>(str.param);
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        dcomplex alpha  = std::get<7>(str.param);
+        dcomplex beta   = std::get<8>(str.param);
+        gtint_t lda_inc = std::get<9>(str.param);
+        gtint_t ldb_inc = std::get<10>(str.param);
+        gtint_t ldc_inc = std::get<11>(str.param);
+        char datatype   = std::get<12>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zhemm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zhemm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zhemm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + side + uplo;
+        str_name = str_name + "_" + conja + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;  // beta_str was built but never added to the name
+        str_name = str_name + "_" + std::to_string(lda_inc) + "_" + std::to_string(ldb_inc) + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination of the values listed below is run.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zhemmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is skipped when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // l:left, r:right
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values(dcomplex{-2.0, 3.0}),                          // alpha
+            ::testing::Values(dcomplex{4.0, -1.0}),                          // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(6)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype tested
+        ),
+        ::zhemmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp
new file mode 100644
index 0000000000..b33db3a187
--- /dev/null
+++ b/gtestsuite/testsuite/level3/her2k/cher2k_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_her2k.h"
+
+class cher2kTest :  // Fixture; tuple order: storage, uplo, transa, transb, m, k, alpha, beta, lda_inc, ldb_inc, ldc_inc, datatype.
+        public ::testing::TestWithParam<std::tuple<char,
+                                                   char,
+                                                   char,
+                                                   char,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   scomplex,
+                                                   float,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   gtint_t,
+                                                   char>> {};
+
+TEST_P(cher2kTest, RandomData) {
+    using T = scomplex;
+    using RT = typename testinghelpers::type_info<T>::real_type;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P) and run test_her2k.
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char transa = std::get<2>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<3>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<4>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<5>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<6>(GetParam());
+    // specifies beta value (real type RT, not T)
+    RT beta = std::get<7>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    gtint_t ldc_inc = std::get<10>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<11>(GetParam());
+
+    // Set the threshold for the errors (2*m*k times the epsilon reported for T):
+    double thresh = 2*m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_her2k<T>(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class cher2kTestPrint {  // Builds the gtest name for one cher2kTest parameter tuple.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, gtint_t, gtint_t, scomplex, float, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);
+        char tsb        = std::get<3>(str.param);
+        gtint_t m       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        scomplex alpha  = std::get<6>(str.param);
+        float beta      = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "cher2k_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_cher2k";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_cher2k";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character appended three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination of the values listed below is run.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cher2kTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is skipped when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // transa
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(-3.0, 2.0),                                    // beta (real-valued)
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype tested
+        ),
+        ::cher2kTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/her2k/her2k.h b/gtestsuite/testsuite/level3/her2k/her2k.h
new file mode 100644
index 0000000000..76ea95f3b4
--- /dev/null
+++ b/gtestsuite/testsuite/level3/her2k/her2k.h
@@ -0,0 +1,158 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the Hermitian rank-2k update:
+ *        C := alpha*A*B**H + conj(alpha)*B*A**H + beta*C
+ *     or C := alpha*A**H*B + conj(alpha)*B**H*A + beta*C
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     uplo   specifies if the upper or lower triangular part of C is used
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     transb specifies the form of op( B ) to be used in
+                         the matrix multiplication (typed BLIS interface only)
+ * @param[in]     m      specifies the number of rows and columns of the
+                         Hermitian matrix C
+ * @param[in]     k      specifies the inner dimension of the products, i.e.
+                         the number of columns of op( A ) and of op( B )
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies the leading dimension of ap; the typed BLIS
+                         wrapper converts it to a row or column stride.
+ * @param[in]     bp     specifies pointer which points to the first element of bp
+ * @param[in]     ldb    specifies the leading dimension of bp; the typed BLIS
+                         wrapper converts it to a row or column stride.
+ * @param[in]     beta   specifies the real scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of cp
+ * @param[in]     ldc    specifies the leading dimension of cp; the typed BLIS
+                         wrapper converts it to a row or column stride.
+ */
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>  // BLAS wrapper: calls cher2k_ or zher2k_ depending on T.
+static void her2k_(char uplo, char transa, gtint_t m, gtint_t k, T* alpha,
+                    T* ap, gtint_t lda, T* bp, gtint_t ldb, RT* beta, T* cp, gtint_t ldc )
+{
+    if constexpr (std::is_same<T, scomplex>::value)
+        cher2k_( &uplo, &transa, &m, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zher2k_( &uplo, &transa, &m, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else  // only the complex types are supported here
+        throw std::runtime_error("Error in testsuite/level3/her2k.h: Invalid typename in her2k_().");
+}
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>  // CBLAS wrapper: calls cblas_cher2k or cblas_zher2k depending on T.
+static void cblas_her2k(char storage, char uplo, char transa,
+    gtint_t m, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, RT* beta, T* cp, gtint_t ldc)
+{
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_UPLO cblas_uplo;
+    enum CBLAS_TRANSPOSE cblas_transa;
+
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );  // map the option characters to CBLAS enums
+    testinghelpers::char_to_cblas_uplo( uplo, &cblas_uplo );
+    testinghelpers::char_to_cblas_trans( transa, &cblas_transa );
+
+    if constexpr (std::is_same<T, scomplex>::value)  // alpha is passed by pointer, beta by value
+        cblas_cher2k( cblas_order, cblas_uplo, cblas_transa, m, k, alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zher2k( cblas_order, cblas_uplo, cblas_transa, m, k, alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/her2k.h: Invalid typename in cblas_her2k().");
+}
+
+// Calls the BLIS typed API bli_?her2k. The leading dimensions are converted to
+// row/column strides according to the storage character ('c'/'C' or 'r'/'R').
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+static void typed_her2k(char storage, char uplo, char trnsa, char trnsb,
+    gtint_t m, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, RT* beta, T* cp, gtint_t ldc)
+{
+    trans_t transa, transb;
+    uplo_t blis_uplo;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+
+    // Strides start at 1; the stride that crosses the leading dimension is set below.
+    dim_t rsa = 1, csa = 1;
+    dim_t rsb = 1, csb = 1;
+    dim_t rsc = 1, csc = 1;
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csb = ldb ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsb = ldb ;
+        rsc = ldc ;
+    }
+    else  // an unknown storage character used to fall through with all strides equal to 1
+        throw std::runtime_error("Error in testsuite/level3/her2k.h: Invalid storage in typed_her2k().");
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_sher2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dher2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cher2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zher2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/her2k.h: Invalid typename in typed_her2k().");
+}
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>  // Dispatches to the interface selected by TEST_BLAS / TEST_CBLAS / TEST_BLIS_TYPED.
+static void her2k( char storage, char uplo, char transa, char transb, gtint_t m, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, RT* beta, T* cp, gtint_t ldc )
+{
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )  // the BLAS wrapper takes no storage argument
+        her2k_<T>( uplo, transa, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/her2k.h: BLAS interface cannot be tested for row-major order.");
+
+#elif TEST_CBLAS
+    cblas_her2k<T>( storage, uplo, transa, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );  // transb is not forwarded
+#elif TEST_BLIS_TYPED
+    typed_her2k<T>( storage, uplo, transa, transb, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/her2k.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/her2k/test_her2k.h b/gtestsuite/testsuite/level3/her2k/test_her2k.h
new file mode 100644
index 0000000000..60c1f1c2f0
--- /dev/null
+++ b/gtestsuite/testsuite/level3/her2k/test_her2k.h
@@ -0,0 +1,84 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "her2k.h"
+#include "level3/ref_her2k.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>  // Runs her2k and the reference on identical inputs and compares C.
+void test_her2k( char storage, char uplo, char transa, char transb,
+    gtint_t m, gtint_t k,
+    gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc,
+    T alpha, RT beta,
+    double thresh, char datatype
+) {
+    // Compute the leading dimensions of a, b, and c (c is m x m).
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, k, ldb_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc);
+
+    //----------------------------------------------------------
+    //         Initialize matrices with random numbers
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 8, storage, transa, m, k, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(-5, 2, storage, transb, m, k, ldb, datatype);
+    // Matrix C, stored in c, is Hermitian: only the upper or lower part (per uplo)
+    // is used in the computation of her2k, and the rest is zeroed out to ensure
+    // that the code operates as expected.
+    std::vector<T> c = testinghelpers::get_random_matrix<T>(-3, 5, storage, uplo, m, ldc, datatype);
+
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    her2k<T>( storage, uplo, transa, transb, m, k, &alpha, a.data(), lda,
+                                b.data(), ldb, &beta, c.data(), ldc );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_her2k( storage, uplo, transa, transb, m, k, &alpha,
+               a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc );
+
+    //----------------------------------------------------------
+    //              check component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, m, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp
new file mode 100644
index 0000000000..95301a291b
--- /dev/null
+++ b/gtestsuite/testsuite/level3/her2k/zher2k_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_her2k.h"
+
+class zher2kTest :   // Parameterized fixture for zher2k; tuple fields are unpacked in TEST_P(zher2kTest, RandomData).
+        public ::testing::TestWithParam<std::tuple<char,       // <0>  storage format
+                                                   char,       // <1>  uplo
+                                                   char,       // <2>  transa
+                                                   char,       // <3>  transb
+                                                   gtint_t,    // <4>  m
+                                                   gtint_t,    // <5>  k
+                                                   dcomplex,   // <6>  alpha
+                                                   double,     // <7>  beta
+                                                   gtint_t,    // <8>  increment added to lda
+                                                   gtint_t,    // <9>  increment added to ldb
+                                                   gtint_t,    // <10> increment added to ldc
+                                                   char>> {};  // <11> datatype passed to the random generators
+
+TEST_P(zher2kTest, RandomData) {
+    using T = dcomplex;
+    using RT = typename testinghelpers::type_info<T>::real_type;   // real type paired with T; used for beta
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char transa = std::get<2>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<3>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<4>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<5>(GetParam());
+    // specifies alpha value (complex)
+    T alpha = std::get<6>(GetParam());
+    // specifies beta value (real)
+    RT beta = std::get<7>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    gtint_t ldc_inc = std::get<10>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<11>(GetParam());
+
+    // Set the threshold for the errors: 2*m*k times the epsilon reported for T.
+    double thresh = 2*m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_her2k<T>(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class zher2kTestPrint {
+public:
+    // Produces the name of one generated test from its parameter tuple: interface
+    // prefix, storage, uplo, transa+transb, m, k, alpha, beta, lda/ldb/ldc increments
+    // and the element datatype, joined by underscores.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, gtint_t, gtint_t, dcomplex, double, gtint_t, gtint_t, gtint_t,char>> str) const {
+        const auto& p = str.param;
+        // A value > 0 is printed as its integer part; any other value is printed as
+        // "m" followed by the integer part of its magnitude.
+        auto tag = [](double v) {
+            return ( v > 0 ) ? std::to_string(int(v)) : "m" + std::to_string(int(std::abs(v)));
+        };
+#ifdef TEST_BLAS
+        std::string name = "zher2k_";
+#elif TEST_CBLAS
+        std::string name = "cblas_zher2k";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_zher2k";
+#endif
+        // storage format, repeated three times
+        name += "_" + std::string(3, std::get<0>(p));
+        // uplo
+        name += std::string("_") + std::get<1>(p);
+        // transa immediately followed by transb
+        name += std::string("_") + std::get<2>(p) + std::get<3>(p);
+        // m and k
+        name += "_" + std::to_string(std::get<4>(p));
+        name += "_" + std::to_string(std::get<5>(p));
+        // alpha as "<real>pi<imag>", beta as a single real value
+        const dcomplex alpha = std::get<6>(p);
+        name += "_a" + tag(alpha.real) + "pi" + tag(alpha.imag);
+        name += "_b" + tag(std::get<7>(p));
+        // lda, ldb and ldc increments
+        name += "_" + std::to_string(std::get<8>(p));
+        name += "_" + std::to_string(std::get<9>(p));
+        name += "_" + std::to_string(std::get<10>(p));
+        return name + "_" + std::get<11>(p);
+    }
+};
+
+// Black box testing: one test per combination of the value sets listed below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zher2kTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // transa
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(4.0, -1.0),                                    // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the tested elements (i : integer, f : float)
+        ),
+        ::zher2kTestPrint()                                                  // generates the name of each test
+    );
diff --git a/gtestsuite/testsuite/level3/herk/cherk_generic.cpp b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp
new file mode 100644
index 0000000000..13252de9cd
--- /dev/null
+++ b/gtestsuite/testsuite/level3/herk/cherk_generic.cpp
@@ -0,0 +1,146 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_herk.h"
+
+class cherkTest :   // Parameterized fixture for cherk; tuple fields are unpacked in TEST_P(cherkTest, RandomData).
+        public ::testing::TestWithParam<std::tuple<char,       // <0> storage format
+                                                   char,       // <1> uplo
+                                                   char,       // <2> transa
+                                                   gtint_t,    // <3> m
+                                                   gtint_t,    // <4> k
+                                                   float,      // <5> alpha
+                                                   float,      // <6> beta
+                                                   gtint_t,    // <7> increment added to lda
+                                                   gtint_t,    // <8> increment added to ldc
+                                                   char>> {};  // <9> datatype passed to the random generators
+
+TEST_P(cherkTest, RandomData) {
+    using T = scomplex;
+    using RT = typename testinghelpers::type_info<T>::real_type;   // real type paired with T; used for alpha and beta
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<4>(GetParam());
+    // specifies alpha value (real)
+    RT alpha = std::get<5>(GetParam());
+    // specifies beta value (real)
+    RT beta = std::get<6>(GetParam());
+    // lda, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<7>(GetParam());
+    gtint_t ldc_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors: m*k times the epsilon reported for T.
+    double thresh = m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_herk<T>(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class cherkTestPrint {
+public:
+    // Produces the name of one generated test from its parameter tuple: interface prefix,
+    // storage, uplo, transa, m, k, alpha, beta, lda/ldc increments and element datatype.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, float, float, gtint_t, gtint_t, char>> str) const {
+        const auto& p = str.param;
+        // A value > 0 is printed as its integer part; any other value is printed as
+        // "m" followed by the integer part of its magnitude.
+        auto tag = [](float v) {
+            return ( v > 0 ) ? std::to_string(int(v)) : "m" + std::to_string(int(std::abs(v)));
+        };
+#ifdef TEST_BLAS
+        std::string name = "cherk_";
+#elif TEST_CBLAS
+        std::string name = "cblas_cherk";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_cherk";
+#endif
+        // storage format, repeated three times
+        name += "_" + std::string(3, std::get<0>(p));
+        // uplo, then transa
+        name += std::string("_") + std::get<1>(p);
+        name += std::string("_") + std::get<2>(p);
+        // m and k
+        name += "_" + std::to_string(std::get<3>(p));
+        name += "_" + std::to_string(std::get<4>(p));
+        // alpha and beta
+        name += "_a" + tag(std::get<5>(p));
+        name += "_b" + tag(std::get<6>(p));
+        // lda and ldc increments
+        name += "_" + std::to_string(std::get<7>(p));
+        name += "_" + std::to_string(std::get<8>(p));
+        return name + "_" + std::get<9>(p);
+    }
+};
+
+// Black box testing: one test per combination of the value sets listed below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        cherkTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n','c'),                                      // transa
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values(-2.0, 3.0),                                    // alpha
+            ::testing::Values(4.0, -1.0),                                    // beta
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the tested elements (i : integer, f : float)
+        ),
+        ::cherkTestPrint()                                                   // generates the name of each test
+    );
diff --git a/gtestsuite/testsuite/level3/herk/herk.h b/gtestsuite/testsuite/level3/herk/herk.h
new file mode 100644
index 0000000000..6aab4355dc
--- /dev/null
+++ b/gtestsuite/testsuite/level3/herk/herk.h
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        C := alpha*A*A**H + beta*C
+ *     or C := alpha*A**H*A + beta*C
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     uplo   specifies if the upper or lower triangular part of C is used
+ * @param[in]     transa specifies the form of op( A ) to be used in
+ *                       the matrix multiplication
+ * @param[in]     m      specifies the number of rows and cols of C
+ * @param[in]     k      specifies the number of rows of A, in case of transa = 'C',
+ *                       and the columns of A otherwise.
+ * @param[in]     alpha  specifies the (real) scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies the leading dimension of ap.
+ * @param[in]     beta   specifies the (real) scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of cp
+ * @param[in]     ldc    specifies the leading dimension of cp.
+ *
+ * Note: herk_() takes no storage argument; typed_herk() maps lda/ldc to BLIS strides.
+ */
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+static void herk_(char uplo, char transa, gtint_t m, gtint_t k, RT* alpha,
+                    T* ap, gtint_t lda,  RT* beta, T* cp, gtint_t ldc )
+{   // BLAS wrapper: forwards to cherk_/zherk_, passing the by-value arguments by address.
+    if constexpr (std::is_same<T, scomplex>::value)
+        cherk_( &uplo, &transa, &m, &k, alpha, ap, &lda, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zherk_( &uplo, &transa, &m, &k, alpha, ap, &lda, beta, cp, &ldc );
+    else   // any other T has no herk entry point in this wrapper
+        throw std::runtime_error("Error in testsuite/level3/herk.h: Invalid typename in herk_().");
+}
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+static void cblas_herk(char storage, char uplo, char trnsa,
+    gtint_t m, gtint_t k, RT* alpha, T* ap, gtint_t lda,
+    RT* beta, T* cp, gtint_t ldc)
+{
+    // Translate each character option into its CBLAS enumerator.
+    enum CBLAS_ORDER layout;
+    testinghelpers::char_to_cblas_order( storage, &layout );
+    enum CBLAS_UPLO triangle;
+    testinghelpers::char_to_cblas_uplo( uplo, &triangle );
+    enum CBLAS_TRANSPOSE op_a;
+    testinghelpers::char_to_cblas_trans( trnsa, &op_a );
+    // CBLAS receives the real scalars alpha and beta by value, hence the dereferences.
+    if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zherk( layout, triangle, op_a, m, k, *alpha, ap, lda, *beta, cp, ldc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        cblas_cherk( layout, triangle, op_a, m, k, *alpha, ap, lda, *beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/herk.h: Invalid typename in cblas_herk().");
+}
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+static void typed_herk(char storage, char uplo, char trnsa,
+    gtint_t m, gtint_t k, RT* alpha, T* ap, gtint_t lda,
+    RT* beta, T* cp, gtint_t ldc)
+{
+    trans_t transa;
+    uplo_t blis_uplo;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+    // Strides start at 1; the leading dimension replaces the column stride for
+    // column-major storage and the row stride for row-major storage.
+    inc_t rsa = 1, csa = 1, rsc = 1, csc = 1;
+    /* a = m x k   c = m x m    */
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsc = ldc ;
+    }
+    else   // refuse unknown storage instead of calling BLIS with all-unit strides
+        throw std::runtime_error("Error in testsuite/level3/herk.h: Invalid storage format in typed_herk().");
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_sherk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dherk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cherk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zherk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/herk.h: Invalid typename in typed_herk().");
+}
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+static void herk( char storage, char uplo, char transa, gtint_t m, gtint_t k,
+    RT* alpha, T* ap, gtint_t lda, RT* beta, T* cp, gtint_t ldc )
+{
+#ifdef TEST_BLAS
+    // The BLAS wrapper is only called for column-major storage; reject anything else first.
+    if( !( storage == 'c' || storage == 'C' ) )
+        throw std::runtime_error("Error in testsuite/level3/herk.h: BLAS interface cannot be tested for row-major order.");
+    herk_<T>( uplo, transa, m, k, alpha, ap, lda, beta, cp, ldc );
+#elif TEST_CBLAS
+    cblas_herk<T>( storage, uplo, transa, m, k, alpha, ap, lda, beta, cp, ldc );
+#elif TEST_BLIS_TYPED
+    typed_herk<T>( storage, uplo, transa, m, k, alpha, ap, lda, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/herk.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/herk/test_herk.h b/gtestsuite/testsuite/level3/herk/test_herk.h
new file mode 100644
index 0000000000..355b514ec4
--- /dev/null
+++ b/gtestsuite/testsuite/level3/herk/test_herk.h
@@ -0,0 +1,82 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "herk.h"
+#include "level3/ref_herk.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T, typename RT = typename testinghelpers::type_info<T>::real_type>
+void test_herk( char storage, char uplo, char transa,
+    gtint_t m, gtint_t k,
+    gtint_t lda_inc, gtint_t ldc_inc,
+    RT alpha, RT beta,
+    double thresh, char datatype
+) {
+    // Runs herk and ref_herk on identical inputs and compares the two results of C against thresh.
+    // Compute the leading dimensions of a and c.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrices with random numbers.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-5, 2, storage, transa, m, k, lda, datatype);
+    // Since matrix C, stored in c, is Hermitian, we only use the upper or lower
+    // part in the computation of herk and zero-out the rest to ensure
+    // that code operates as expected.
+    std::vector<T> c = testinghelpers::get_random_matrix<T>(-8, 12, storage, uplo, m, ldc, datatype);
+
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    herk<T>( storage, uplo, transa, m, k, &alpha, a.data(), lda,
+                &beta, c.data(), ldc );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_herk<T>( storage, uplo, transa, m, k, alpha,
+               a.data(), lda, beta, c_ref.data(), ldc );
+
+    //----------------------------------------------------------
+    //              check component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, m, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/herk/zherk_generic.cpp b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp
new file mode 100644
index 0000000000..3bbe6cf334
--- /dev/null
+++ b/gtestsuite/testsuite/level3/herk/zherk_generic.cpp
@@ -0,0 +1,146 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_herk.h"
+
+class zherkTest :   // Parameterized fixture for zherk; tuple fields are unpacked in TEST_P(zherkTest, RandomData).
+        public ::testing::TestWithParam<std::tuple<char,       // <0> storage format
+                                                   char,       // <1> uplo
+                                                   char,       // <2> transa
+                                                   gtint_t,    // <3> m
+                                                   gtint_t,    // <4> k
+                                                   double,     // <5> alpha
+                                                   double,     // <6> beta
+                                                   gtint_t,    // <7> increment added to lda
+                                                   gtint_t,    // <8> increment added to ldc
+                                                   char>> {};  // <9> datatype passed to the random generators
+
+TEST_P(zherkTest, RandomData) {
+    using T = dcomplex;
+    using RT = typename testinghelpers::type_info<T>::real_type;   // real type paired with T; used for alpha and beta
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<4>(GetParam());
+    // specifies alpha value (real)
+    RT alpha = std::get<5>(GetParam());
+    // specifies beta value (real)
+    RT beta = std::get<6>(GetParam());
+    // lda, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<7>(GetParam());
+    gtint_t ldc_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors: m*k times the epsilon reported for T.
+    double thresh = m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_herk<T>(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class zherkTestPrint {
+public:
+    // Produces the name of one generated test from its parameter tuple: interface prefix,
+    // storage, uplo, transa, m, k, alpha, beta, lda/ldc increments and element datatype.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, double, double, gtint_t, gtint_t, char>> str) const {
+        const auto& p = str.param;
+        // A value > 0 is printed as its integer part; any other value is printed as
+        // "m" followed by the integer part of its magnitude.
+        auto tag = [](double v) {
+            return ( v > 0 ) ? std::to_string(int(v)) : "m" + std::to_string(int(std::abs(v)));
+        };
+#ifdef TEST_BLAS
+        std::string name = "zherk_";
+#elif TEST_CBLAS
+        std::string name = "cblas_zherk";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_zherk";
+#endif
+        // storage format, repeated three times
+        name += "_" + std::string(3, std::get<0>(p));
+        // uplo, then transa
+        name += std::string("_") + std::get<1>(p);
+        name += std::string("_") + std::get<2>(p);
+        // m and k
+        name += "_" + std::to_string(std::get<3>(p));
+        name += "_" + std::to_string(std::get<4>(p));
+        // alpha and beta
+        name += "_a" + tag(std::get<5>(p));
+        name += "_b" + tag(std::get<6>(p));
+        // lda and ldc increments
+        name += "_" + std::to_string(std::get<7>(p));
+        name += "_" + std::to_string(std::get<8>(p));
+        return name + "_" + std::get<9>(p);
+    }
+};
+
+// Black box testing: one test per combination of the value sets listed below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zherkTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n','c'),                                      // transa
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values(2.0, -1.0),                                    // alpha
+            ::testing::Values(-3.0, 2.0),                                    // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the tested elements (i : integer, f : float)
+        ),
+        ::zherkTestPrint()                                                   // generates the name of each test
+    );
diff --git a/gtestsuite/testsuite/level3/symm/csymm_generic.cpp b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp
new file mode 100644
index 0000000000..96c53c63df
--- /dev/null
+++ b/gtestsuite/testsuite/level3/symm/csymm_generic.cpp
@@ -0,0 +1,163 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_symm.h"
+
+class csymmTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uplo
+                                                   char,       // conja
+                                                   char,       // transb
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   scomplex,   // alpha
+                                                   scomplex,   // beta
+                                                   gtint_t,    // lda increment
+                                                   gtint_t,    // ldb increment
+                                                   gtint_t,    // ldc increment
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(csymmTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major or column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the symmetric matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies if the upper or lower triangular part of A is used
+    char uplo = std::get<2>(GetParam());
+    // conjugation option for matrix A: n or c
+    char conja = std::get<3>(GetParam());
+    // transposition option for matrix B: n, c, t or h
+    char transb = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // specifies beta value
+    T beta = std::get<8>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    gtint_t ldb_inc = std::get<10>(GetParam());
+    gtint_t ldc_inc = std::get<11>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<12>(GetParam());
+
+    // Error threshold: the problem size m*n scaled by testinghelpers::getEpsilon<T>().
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call the shared symm test body with these parameters
+    //----------------------------------------------------------
+    test_symm<T>(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class csymmTestPrint {   // name generator handed to INSTANTIATE_TEST_SUITE_P for csymmTest
+public:
+    std::string operator()(   // builds the test name from the fields of str.param
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, scomplex, scomplex, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);    // storage format
+        char side       = std::get<1>(str.param);
+        char uplo       = std::get<2>(str.param);
+        char conja      = std::get<3>(str.param);
+        char tsb        = std::get<4>(str.param);    // transb
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        scomplex alpha  = std::get<7>(str.param);
+        scomplex beta   = std::get<8>(str.param);
+        gtint_t lda_inc = std::get<9>(str.param);
+        gtint_t ldb_inc = std::get<10>(str.param);
+        gtint_t ldc_inc = std::get<11>(str.param);
+        char datatype   = std::get<12>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "csymm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_csymm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_csymm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;   // storage character written three times
+        str_name = str_name + "_" + side + uplo;
+        str_name = str_name + "_" + conja + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));   // real part cast to int; values <= 0 are written as "m" + |value| (0 prints as m0)
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));   // "pi" separates the real part from the imaginary part
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));   // same encoding as alpha
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        csymmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('l','r'),                                      // l:left, r:right
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}),     // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer; f : presumably non-integer values, TODO confirm)
+        ),
+        ::csymmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp
new file mode 100644
index 0000000000..9217152a22
--- /dev/null
+++ b/gtestsuite/testsuite/level3/symm/dsymm_generic.cpp
@@ -0,0 +1,161 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_symm.h"
+
+class dsymmTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uplo
+                                                   char,       // conja
+                                                   char,       // transb
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   double,     // alpha
+                                                   double,     // beta
+                                                   gtint_t,    // lda increment
+                                                   gtint_t,    // ldb increment
+                                                   gtint_t,    // ldc increment
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(dsymmTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major or column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the symmetric matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies if the upper or lower triangular part of A is used
+    char uplo = std::get<2>(GetParam());
+    // conjugation option for matrix A: n or c
+    char conja = std::get<3>(GetParam());
+    // transposition option for matrix B: n, c, t or h
+    char transb = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // specifies beta value
+    T beta = std::get<8>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    gtint_t ldb_inc = std::get<10>(GetParam());
+    gtint_t ldc_inc = std::get<11>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<12>(GetParam());
+
+    // Error threshold: 30 times the problem size m*n, scaled by testinghelpers::getEpsilon<T>().
+    double thresh = 30*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call the shared symm test body with these parameters
+    //----------------------------------------------------------
+    test_symm<T>(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class dsymmTestPrint {   // name generator handed to INSTANTIATE_TEST_SUITE_P for dsymmTest
+public:
+    std::string operator()(   // builds the test name from the fields of str.param
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, double, double, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);    // storage format
+        char side       = std::get<1>(str.param);
+        char uplo       = std::get<2>(str.param);
+        char conja      = std::get<3>(str.param);
+        char tsb        = std::get<4>(str.param);    // transb
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        double alpha    = std::get<7>(str.param);
+        double beta     = std::get<8>(str.param);
+        gtint_t lda_inc = std::get<9>(str.param);
+        gtint_t ldb_inc = std::get<10>(str.param);
+        gtint_t ldc_inc = std::get<11>(str.param);
+        char datatype   = std::get<12>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dsymm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dsymm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dsymm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;   // storage character written three times
+        str_name = str_name + "_" + side + uplo;
+        str_name = str_name + "_" + conja + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));   // cast to int; values <= 0 are written as "m" + |value| (0 prints as m0)
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));   // same encoding as alpha
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dsymmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('l','r'),                                      // l:left, r:right
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(6)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer; f : presumably non-integer values, TODO confirm)
+        ),
+        ::dsymmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp
new file mode 100644
index 0000000000..1fca984ee7
--- /dev/null
+++ b/gtestsuite/testsuite/level3/symm/ssymm_generic.cpp
@@ -0,0 +1,161 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_symm.h"
+
+class ssymmTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uplo
+                                                   char,       // conja
+                                                   char,       // transb
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   float,      // alpha
+                                                   float,      // beta
+                                                   gtint_t,    // lda increment
+                                                   gtint_t,    // ldb increment
+                                                   gtint_t,    // ldc increment
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(ssymmTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major or column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the symmetric matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies if the upper or lower triangular part of A is used
+    char uplo = std::get<2>(GetParam());
+    // conjugation option for matrix A: n or c
+    char conja = std::get<3>(GetParam());
+    // transposition option for matrix B: n, c, t or h
+    char transb = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // specifies beta value
+    T beta = std::get<8>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    gtint_t ldb_inc = std::get<10>(GetParam());
+    gtint_t ldc_inc = std::get<11>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<12>(GetParam());
+
+    // Error threshold: 8 times the problem size m*n, scaled by testinghelpers::getEpsilon<T>().
+    double thresh = 8*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call the shared symm test body with these parameters
+    //----------------------------------------------------------
+    test_symm<T>(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class ssymmTestPrint {   // name generator handed to INSTANTIATE_TEST_SUITE_P for ssymmTest
+public:
+    std::string operator()(   // builds the test name from the fields of str.param
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, float, float, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);    // storage format
+        char side       = std::get<1>(str.param);
+        char uplo       = std::get<2>(str.param);
+        char conja      = std::get<3>(str.param);
+        char tsb        = std::get<4>(str.param);    // transb
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        float alpha     = std::get<7>(str.param);
+        float beta      = std::get<8>(str.param);
+        gtint_t lda_inc = std::get<9>(str.param);
+        gtint_t ldb_inc = std::get<10>(str.param);
+        gtint_t ldc_inc = std::get<11>(str.param);
+        char datatype   = std::get<12>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ssymm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ssymm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ssymm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;   // storage character written three times
+        str_name = str_name + "_" + side + uplo;
+        str_name = str_name + "_" + conja + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));   // cast to int; values <= 0 are written as "m" + |value| (0 prints as m0)
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));   // same encoding as alpha
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ssymmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('l','r'),                                      // l:left, r:right
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(9)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer; f : presumably non-integer values, TODO confirm)
+        ),
+        ::ssymmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/symm/symm.h b/gtestsuite/testsuite/level3/symm/symm.h
new file mode 100644
index 0000000000..cc97c9304f
--- /dev/null
+++ b/gtestsuite/testsuite/level3/symm/symm.h
@@ -0,0 +1,175 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ * For BLIS-typed API:
+ *        C := alpha*conj( A )*trans( B ) + beta*C, if side is left
+ *     or C := alpha*trans( B )*conj( A ) + beta*C, if side is right
+ * For BLAS/CBLAS API:
+ *        C := alpha*A*B + beta*C, if side is left
+ *     or C := alpha*B*A + beta*C, if side is right
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     side   specifies if the symmetric matrix A appears left or right in
+                         the matrix multiplication
+ * @param[in]     uplo   specifies if the upper or lower triangular part of A is used
+ * @param[in]     conja  specifies the form of op( A ) to be used in
+                         the matrix multiplication (BLIS-typed API only)
+ * @param[in]     transb specifies the form of op( B ) to be used in
+                         the matrix multiplication (BLIS-typed API only)
+ * @param[in]     m      specifies the number of rows and cols of the matrix
+                         op( A ) and rows of the matrix C and B
+ * @param[in]     n      specifies the number of columns of the matrix
+                         op( B ) and the number of columns of the matrix C
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies the leading dimension of ap; the BLIS-typed wrapper uses
+                         it as the column stride (column major) or row stride (row major).
+ * @param[in]     bp     specifies pointer which points to the first element of bp
+ * @param[in]     ldb    specifies the leading dimension of bp; the BLIS-typed wrapper uses
+                         it as the column stride (column major) or row stride (row major).
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of cp
+ * @param[in]     ldc    specifies the leading dimension of cp; the BLIS-typed wrapper uses
+                         it as the column stride (column major) or row stride (row major).
+ */
+
+template<typename T>   // T is expected to be float, double, scomplex or dcomplex
+static void symm_(char side, char uplo, gtint_t m, gtint_t n, T* alpha,
+                    T* ap, gtint_t lda,  T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{   // Forwards to the ?symm_ routine matching T; flags, sizes and leading dimensions are passed by address.
+    if constexpr (std::is_same<T, float>::value)
+        ssymm_( &side, &uplo, &m, &n, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, double>::value)
+        dsymm_( &side, &uplo, &m, &n, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        csymm_( &side, &uplo, &m, &n, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zsymm_( &side, &uplo, &m, &n, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else   // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level3/symm.h: Invalid typename in symm_().");
+}
+
+template<typename T>   // T is expected to be float, double, scomplex or dcomplex
+static void cblas_symm(char storage, char side, char uplo,
+    gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{   // Forwards to the cblas_?symm routine matching T after converting the character flags.
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_SIDE cblas_side;
+    enum CBLAS_UPLO cblas_uplo;
+
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );   // character flags -> CBLAS enums
+    testinghelpers::char_to_cblas_side( side, &cblas_side );
+    testinghelpers::char_to_cblas_uplo( uplo, &cblas_uplo );
+
+    if constexpr (std::is_same<T, float>::value)   // real types: alpha and beta are passed by value
+        cblas_ssymm( cblas_order, cblas_side, cblas_uplo, m, n, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_dsymm( cblas_order, cblas_side, cblas_uplo, m, n, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same<T, scomplex>::value)   // complex types: alpha and beta are passed by pointer
+        cblas_csymm( cblas_order, cblas_side, cblas_uplo, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zsymm( cblas_order, cblas_side, cblas_uplo, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else   // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level3/symm.h: Invalid typename in cblas_symm().");
+}
+
+template<typename T>   // T is expected to be float, double, scomplex or dcomplex
+static void typed_symm(char storage, char side, char uplo, char conj_a, char trnsb,
+    gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{   // Forwards to the bli_?symm routine matching T, turning leading dimensions into row/column strides.
+    conj_t conja;
+    trans_t transb;
+    side_t blis_side;
+    uplo_t blis_uplo;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_conj( conj_a, &conja );
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+    testinghelpers::char_to_blis_side( side, &blis_side );
+    dim_t rsa,csa;
+    dim_t rsb,csb;
+    dim_t rsc,csc;
+
+    rsa=rsb=rsc=1;   // start from unit strides; one stride per matrix is replaced below
+    csa=csb=csc=1;   // NOTE(review): an unrecognised storage character leaves every stride at 1
+    /* a = symmetric (square) matrix       b = m x n       c = m x n    */
+    if( (storage == 'c') || (storage == 'C') ) {   // column major: the leading dimension is the column stride
+        csa = lda ;
+        csb = ldb ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {   // row major: the leading dimension is the row stride
+        rsa = lda ;
+        rsb = ldb ;
+        rsc = ldc ;
+    }
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssymm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsymm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_csymm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zsymm( blis_side, blis_uplo, conja, transb, m, n, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else   // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level3/symm.h: Invalid typename in typed_symm().");
+}
+
+template<typename T>   // entry point used by the tests; the interface is chosen at compile time
+static void symm( char storage, char side, char uplo, char conja, char transb, gtint_t m, gtint_t n,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{   // Exactly one of the TEST_BLAS / TEST_CBLAS / TEST_BLIS_TYPED branches below is compiled in.
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )   // the BLAS wrapper is only called for column-major storage
+        symm_<T>( side, uplo, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/symm.h: BLAS interface cannot be tested for row-major order.");
+
+#elif TEST_CBLAS
+    cblas_symm<T>( storage, side, uplo, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );   // conja and transb are not forwarded
+#elif TEST_BLIS_TYPED
+    typed_symm<T>( storage, side, uplo, conja, transb, m, n, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/symm.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/symm/test_symm.h b/gtestsuite/testsuite/level3/symm/test_symm.h
new file mode 100644
index 0000000000..4274067b72
--- /dev/null
+++ b/gtestsuite/testsuite/level3/symm/test_symm.h
@@ -0,0 +1,87 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "symm.h"
+#include "level3/ref_symm.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_symm( char storage, char side, char uplo, char conja, char transb,
+    gtint_t m, gtint_t n,
+    gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc,
+    T alpha, T beta,
+    double thresh, char datatype
+) {
+    // Test body: run symm on random inputs and compare the resulting C against the reference result.
+    // A is k-by-k, where k = m when side is 'l'/'L' and k = n otherwise.
+    gtint_t k = ((side == 'l')||(side == 'L'))? m : n;
+    // Compute the leading dimensions of a, b, and c.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, conja, k, k, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, n, ldb_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrices with random numbers.
+    //----------------------------------------------------------
+    // Matrix A, stored in a, is symmetric, so only its upper or lower part
+    // (selected by uplo) is used in the computation of symm; the rest is
+    // zeroed out to ensure that the code operates as expected.
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-5, 2, storage, uplo, k, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(-5, 2, storage, transb, m, n, ldb, datatype);
+    std::vector<T> c = testinghelpers::get_random_matrix<T>(-3, 5, storage, 'n', m, n, ldc, datatype);
+
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    symm<T>( storage, side, uplo, conja, transb, m, n, &alpha, a.data(), lda,
+                                b.data(), ldb, &beta, c.data(), ldc );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_symm( storage, side, uplo, conja, transb, m, n, alpha,
+               a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc );
+
+    //----------------------------------------------------------
+    //   Check the component-wise error of c against c_ref.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp
new file mode 100644
index 0000000000..9585a8915b
--- /dev/null
+++ b/gtestsuite/testsuite/level3/symm/zsymm_generic.cpp
@@ -0,0 +1,163 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_symm.h"
+
+class zsymmTest :   // Parameterized fixture; the tuple elements are read by index in the test body.
+        public ::testing::TestWithParam<std::tuple<char,       // 0: storage
+                                                   char,       // 1: side
+                                                   char,       // 2: uplo
+                                                   char,       // 3: conja
+                                                   char,       // 4: transb
+                                                   gtint_t,    // 5: m
+                                                   gtint_t,    // 6: n
+                                                   dcomplex,   // 7: alpha
+                                                   dcomplex,   // 8: beta
+                                                   gtint_t,    // 9: lda_inc
+                                                   gtint_t,    // 10: ldb_inc
+                                                   gtint_t,    // 11: ldc_inc
+                                                   char>> {};  // 12: datatype
+
+TEST_P(zsymmTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the symmetric matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies if the upper or lower triangular part of A is used
+    char uplo = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c
+    char conja = std::get<3>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // specifies beta value
+    T beta = std::get<8>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<9>(GetParam());
+    gtint_t ldb_inc = std::get<10>(GetParam());
+    gtint_t ldc_inc = std::get<11>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<12>(GetParam());
+
+    // Set the threshold for the errors: epsilon of T scaled by m*n.
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_symm<T>(storage, side, uplo, conja, transb, m, n, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class zsymmTestPrint {   // Functor that turns one parameter tuple into the test-name string it returns.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, dcomplex, dcomplex, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);   // storage format
+        char side       = std::get<1>(str.param);
+        char uplo       = std::get<2>(str.param);
+        char conja      = std::get<3>(str.param);
+        char tsb        = std::get<4>(str.param);   // transb
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        dcomplex alpha  = std::get<7>(str.param);
+        dcomplex beta   = std::get<8>(str.param);
+        gtint_t lda_inc = std::get<9>(str.param);
+        gtint_t ldb_inc = std::get<10>(str.param);
+        gtint_t ldc_inc = std::get<11>(str.param);
+        char datatype   = std::get<12>(str.param);
+#ifdef TEST_BLAS    // the name prefix identifies the interface under test
+        std::string str_name = "zsymm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zsymm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zsymm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;   // storage character repeated three times
+        str_name = str_name + "_" + side + uplo;
+        str_name = str_name + "_" + conja + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;   // e.g. alpha = {2.0, -1.0} gives "_a2pim1": parts are cast to int, non-positive parts print as "m<abs>"
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;    // same encoding as alpha, tagged "_b"
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination of the value lists below becomes one test.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zsymmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'                                                             // row-major is left out when TEST_BLAS is defined
+#endif
+            ),                                                               // storage format
+            ::testing::Values('l','r'),                                      // l:left, r:right
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // conja
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(dcomplex{-3.0, 2.0}, dcomplex{4.0, -1.0}),     // beta
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : dcomplex  datatype tested
+        ),
+        ::zsymmTestPrint()                                                   // builds the name of each generated test
+    );
diff --git a/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp
new file mode 100644
index 0000000000..6b359496a3
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syr2k/csyr2k_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr2k.h"
+
+class csyr2kTest :   // Parameterized fixture; the tuple elements are read by index in the test body.
+        public ::testing::TestWithParam<std::tuple<char,       // 0: storage
+                                                   char,       // 1: uplo
+                                                   char,       // 2: transa
+                                                   char,       // 3: transb
+                                                   gtint_t,    // 4: m
+                                                   gtint_t,    // 5: k
+                                                   scomplex,   // 6: alpha
+                                                   scomplex,   // 7: beta
+                                                   gtint_t,    // 8: lda_inc
+                                                   gtint_t,    // 9: ldb_inc
+                                                   gtint_t,    // 10: ldc_inc
+                                                   char>> {};  // 11: datatype
+
+TEST_P(csyr2kTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char transa = std::get<2>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<3>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<4>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<5>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<6>(GetParam());
+    // specifies beta value
+    T beta = std::get<7>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    gtint_t ldc_inc = std::get<10>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<11>(GetParam());
+
+    // Set the threshold for the errors: epsilon of T scaled by m*k.
+    double thresh = m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syr2k<T>(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class csyr2kTestPrint {   // Functor that turns one parameter tuple into the test-name string it returns.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, gtint_t, gtint_t, scomplex, scomplex, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);   // storage format
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);   // transa
+        char tsb        = std::get<3>(str.param);   // transb
+        gtint_t m       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        scomplex alpha  = std::get<6>(str.param);
+        scomplex beta   = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+#ifdef TEST_BLAS    // the name prefix identifies the interface under test
+        std::string str_name = "csyr2k_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_csyr2k";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_csyr2k";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;   // storage character repeated three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;   // e.g. alpha = {2.0, -1.0} gives "_a2pim1": parts are cast to int, non-positive parts print as "m<abs>"
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;    // beta is tagged "_b" (was "_a", which made it indistinguishable from alpha)
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination of the value lists below becomes one test.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        csyr2kTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'                                                             // row-major is left out when TEST_BLAS is defined
+#endif
+            ),                                                               // storage format
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // transa
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k: 10, 20, 30
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}),     // beta
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : scomplex  datatype tested
+        ),
+        ::csyr2kTestPrint()                                                  // builds the name of each generated test
+    );
diff --git a/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp
new file mode 100644
index 0000000000..39110773f3
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syr2k/dsyr2k_generic.cpp
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr2k.h"
+
+class dsyr2kTest :   // Parameterized fixture; the tuple elements are read by index in the test body.
+        public ::testing::TestWithParam<std::tuple<char,       // 0: storage
+                                                   char,       // 1: uplo
+                                                   char,       // 2: transa
+                                                   char,       // 3: transb
+                                                   gtint_t,    // 4: m
+                                                   gtint_t,    // 5: k
+                                                   double,     // 6: alpha
+                                                   double,     // 7: beta
+                                                   gtint_t,    // 8: lda_inc
+                                                   gtint_t,    // 9: ldb_inc
+                                                   gtint_t,    // 10: ldc_inc
+                                                   char>> {};  // 11: datatype
+
+TEST_P(dsyr2kTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char transa = std::get<2>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<3>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<4>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<5>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<6>(GetParam());
+    // specifies beta value
+    T beta = std::get<7>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    gtint_t ldc_inc = std::get<10>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<11>(GetParam());
+
+    // Set the threshold for the errors: epsilon of T scaled by m*k.
+    double thresh = m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syr2k<T>(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class dsyr2kTestPrint {   // Functor that turns one parameter tuple into the test-name string it returns.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, gtint_t, gtint_t, double, double, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);   // storage format
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);   // transa
+        char tsb        = std::get<3>(str.param);   // transb
+        gtint_t m       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        double alpha    = std::get<6>(str.param);
+        double beta     = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+#ifdef TEST_BLAS    // the name prefix identifies the interface under test
+        std::string str_name = "dsyr2k_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dsyr2k";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dsyr2k";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;   // storage character repeated three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;   // e.g. alpha = -2.0 gives "_am2": value is cast to int, non-positive values print as "m<abs>"
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;    // same encoding as alpha, tagged "_b"
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination of the value lists below becomes one test.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dsyr2kTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'                                                             // row-major is left out when TEST_BLAS is defined
+#endif
+            ),                                                               // storage format
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // transa
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k: 10, 20, 30
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(7)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : double  datatype tested
+        ),
+        ::dsyr2kTestPrint()                                                  // builds the name of each generated test
+    );
diff --git a/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp
new file mode 100644
index 0000000000..ad6f883606
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syr2k/ssyr2k_generic.cpp
@@ -0,0 +1,155 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr2k.h"
+
+class ssyr2kTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // uplo
+                                                   char,       // transa
+                                                   char,       // transb
+                                                   gtint_t,    // m
+                                                   gtint_t,    // k
+                                                   float,      // alpha
+                                                   float,      // beta
+                                                   gtint_t,    // increment to the leading dim of a
+                                                   gtint_t,    // increment to the leading dim of b
+                                                   gtint_t,    // increment to the leading dim of c
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(ssyr2kTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char transa = std::get<2>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<3>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<4>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<5>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<6>(GetParam());
+    // specifies beta value
+    T beta = std::get<7>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    gtint_t ldc_inc = std::get<10>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<11>(GetParam());
+
+    // Set the threshold for the errors (scales with the problem size m*k):
+    double thresh = 10*m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syr2k<T>(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Builds the gtest name of one ssyr2k case by joining its parameters, in tuple order, with '_'.
+class ssyr2kTestPrint {
+public:
+    std::string operator()(testing::TestParamInfo<std::tuple<char, char, char, char, gtint_t, gtint_t, float, float, gtint_t, gtint_t, gtint_t,char>> info) const {
+        const char    storage  = std::get<0>(info.param);
+        const char    uplo     = std::get<1>(info.param);
+        const char    transa   = std::get<2>(info.param);
+        const char    transb   = std::get<3>(info.param);
+        const gtint_t m        = std::get<4>(info.param);
+        const gtint_t k        = std::get<5>(info.param);
+        const float   alpha    = std::get<6>(info.param);
+        const float   beta     = std::get<7>(info.param);
+        const gtint_t lda_inc  = std::get<8>(info.param);
+        const gtint_t ldb_inc  = std::get<9>(info.param);
+        const gtint_t ldc_inc  = std::get<10>(info.param);
+        const char    datatype = std::get<11>(info.param);
+#ifdef TEST_BLAS
+        std::string name = "ssyr2k_";
+#elif TEST_CBLAS
+        std::string name = "cblas_ssyr2k";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_ssyr2k";
+#endif
+        // The storage character is written three times, followed by uplo and the two trans characters.
+        name += "_" + std::string(3, storage);
+        name += "_" + std::string(1, uplo);
+        name += "_" + std::string{transa, transb};
+        name += "_" + std::to_string(m);
+        name += "_" + std::to_string(k);
+        // Scalars are truncated to int; a value that is not > 0 prints as "m" followed by its magnitude.
+        name += "_a" + ((alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha))));
+        name += "_b" + ((beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta))));
+        name += "_" + std::to_string(lda_inc);
+        name += "_" + std::to_string(ldb_inc);
+        name += "_" + std::to_string(ldc_inc);
+        name += "_" + std::string(1, datatype);
+        return name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ssyr2kTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // transa
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(7)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer; f presumably floating point - confirm)
+        ),
+        ::ssyr2kTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/syr2k/syr2k.h b/gtestsuite/testsuite/level3/syr2k/syr2k.h
new file mode 100644
index 0000000000..58b59923e5
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syr2k/syr2k.h
@@ -0,0 +1,166 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        C := alpha*A*B**T + alpha*B*A**T + beta*C
+ *     or C := alpha*A**T*B + alpha*B**T*A + beta*C
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     uplo   specifies if the upper or lower triangular part of C is used
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     transb specifies the form of op( B ) to be used in
+                         the matrix multiplication
+ * @param[in]     m      specifies the number of rows and columns of the matrix C
+                         and the number of rows of the matrices op( A ) and op( B )
+ * @param[in]     k      specifies the number of columns of the matrices
+                         op( A ) and op( B )
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     rsa    specifies row increment of ap.
+ * @param[in]     csa    specifies column increment of ap.
+ * @param[in]     bp     specifies pointer which points to the first element of bp
+ * @param[in]     rsb    specifies row increment of bp.
+ * @param[in]     csb    specifies column increment of bp.
+ * @param[in]     beta   specifies the scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of cp
+ * @param[in]     rsc    specifies row increment of cp.
+ * @param[in]     csc    specifies column increment of cp.
+ */
+
+template<typename T>
+static void syr2k_(char uplo, char transa, gtint_t m, gtint_t k, T* alpha,
+                    T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{   // BLAS interface: scalar arguments are forwarded by address; T selects the s/d/c/z routine.
+    if constexpr (std::is_same_v<T, float>)
+        ssyr2k_( &uplo, &transa, &m, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, double>)
+        dsyr2k_( &uplo, &transa, &m, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        csyr2k_( &uplo, &transa, &m, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        zsyr2k_( &uplo, &transa, &m, &k, alpha, ap, &lda, bp, &ldb, beta, cp, &ldc );
+    else  // any other T is not a supported BLAS element type
+        throw std::runtime_error("Error in testsuite/level3/syr2k.h: Invalid typename in syr2k_().");
+}
+
+// CBLAS interface: converts the char options to CBLAS enums; real alpha/beta are passed by value, complex ones by pointer.
+template<typename T>
+static void cblas_syr2k(char storage, char uplo, char transa, gtint_t m, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    enum CBLAS_ORDER     order;
+    enum CBLAS_UPLO      triangle;
+    enum CBLAS_TRANSPOSE trans;
+
+    testinghelpers::char_to_cblas_order( storage, &order );
+    testinghelpers::char_to_cblas_uplo( uplo, &triangle );
+    testinghelpers::char_to_cblas_trans( transa, &trans );
+
+    if constexpr (std::is_same_v<T, float>)
+        cblas_ssyr2k( order, triangle, trans, m, k, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, double>)
+        cblas_dsyr2k( order, triangle, trans, m, k, *alpha, ap, lda, bp, ldb, *beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, scomplex>)
+        cblas_csyr2k( order, triangle, trans, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else if constexpr (std::is_same_v<T, dcomplex>)
+        cblas_zsyr2k( order, triangle, trans, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/syr2k.h: Invalid typename in cblas_syr2k().");
+}
+
+// BLIS typed interface: maps the char options to BLIS enums and each leading dimension to a (row, column) stride pair.
+template<typename T>
+static void typed_syr2k(char storage, char uplo, char trnsa, char trnsb, gtint_t m, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc)
+{
+    // Map parameter characters to BLIS constants.
+    trans_t transa, transb;
+    uplo_t blis_uplo;
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+    // op(a) and op(b) are m x k, c is m x m. Strides use inc_t, the stride type of bli_?syr2k();
+    // the leading dimension becomes the column stride (column-major) or the row stride (row-major).
+    inc_t rsa = 1, csa = 1;
+    inc_t rsb = 1, csb = 1;
+    inc_t rsc = 1, csc = 1;
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda;
+        csb = ldb;
+        csc = ldc;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda;
+        rsb = ldb;
+        rsc = ldc;
+    }
+    else  // unit row AND column strides would not describe a valid matrix layout
+        throw std::runtime_error("Error in testsuite/level3/syr2k.h: Invalid storage format in typed_syr2k().");
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssyr2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsyr2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_csyr2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zsyr2k( blis_uplo, transa, transb, m, k, alpha, ap, rsa, csa, bp, rsb, csb, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/syr2k.h: Invalid typename in typed_syr2k().");
+}
+
+template<typename T>
+static void syr2k( char storage, char uplo, char transa, char transb, gtint_t m, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb, T* beta, T* cp, gtint_t ldc )
+{   // Dispatches to the interface selected at compile time (TEST_BLAS / TEST_CBLAS / TEST_BLIS_TYPED).
+#ifdef TEST_BLAS
+    // syr2k_() takes no storage argument, so only column-major requests can be served.
+    if( storage != 'c' && storage != 'C' )
+        throw std::runtime_error("Error in testsuite/level3/syr2k.h: BLAS interface cannot be tested for row-major order.");
+    syr2k_<T>( uplo, transa, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+
+#elif TEST_CBLAS
+    cblas_syr2k<T>( storage, uplo, transa, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#elif TEST_BLIS_TYPED
+    typed_syr2k<T>( storage, uplo, transa, transb, m, k, alpha, ap, lda, bp, ldb, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/syr2k.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/syr2k/test_syr2k.h b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h
new file mode 100644
index 0000000000..9a7fb82b6f
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syr2k/test_syr2k.h
@@ -0,0 +1,84 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "syr2k.h"
+#include "level3/ref_syr2k.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_syr2k( char storage, char uplo, char transa, char transb,
+    gtint_t m, gtint_t k,
+    gtint_t lda_inc, gtint_t ldb_inc, gtint_t ldc_inc,
+    T alpha, T beta,
+    double thresh, char datatype
+) {
+    // Compute the leading dimensions of a, b, and c.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, k, ldb_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrices with random integer numbers.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 8, storage, transa, m, k, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(-5, 2, storage, transb, m, k, ldb, datatype);
+    // Matrix C, stored in c, is symmetric: only the upper or lower part
+    // (selected by uplo) is used in the computation of syr2k, and the rest
+    // is zeroed out to ensure that the code operates as expected.
+    std::vector<T> c = testinghelpers::get_random_matrix<T>(-3, 5, storage, uplo, m, ldc, datatype);
+
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    syr2k<T>( storage, uplo, transa, transb, m, k, &alpha, a.data(), lda,
+                                b.data(), ldb, &beta, c.data(), ldc );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_syr2k( storage, uplo, transa, transb, m, k, alpha,
+               a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc );
+
+    //----------------------------------------------------------
+    //    Check the component-wise error of the m x m result.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, m, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp
new file mode 100644
index 0000000000..9b0d018768
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syr2k/zsyr2k_generic.cpp
@@ -0,0 +1,157 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syr2k.h"
+
+class zsyr2kTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // uplo
+                                                   char,       // transa
+                                                   char,       // transb
+                                                   gtint_t,    // m
+                                                   gtint_t,    // k
+                                                   dcomplex,   // alpha
+                                                   dcomplex,   // beta
+                                                   gtint_t,    // increment to the leading dim of a
+                                                   gtint_t,    // increment to the leading dim of b
+                                                   gtint_t,    // increment to the leading dim of c
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(zsyr2kTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c
+    char transa = std::get<2>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<3>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<4>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<5>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<6>(GetParam());
+    // specifies beta value
+    T beta = std::get<7>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    gtint_t ldc_inc = std::get<10>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<11>(GetParam());
+
+    // Set the threshold for the errors (scales with the problem size m*k):
+    double thresh = m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syr2k<T>(storage, uplo, transa, transb, m, k, lda_inc, ldb_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+// Builds the gtest name for one zsyr2k parameter tuple; alpha is tagged "_a" and beta "_b".
+class zsyr2kTestPrint {
+public:
+    std::string operator()(testing::TestParamInfo<std::tuple<char, char, char, char, gtint_t, gtint_t, dcomplex, dcomplex, gtint_t, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);
+        char tsb        = std::get<3>(str.param);
+        gtint_t m       = std::get<4>(str.param);
+        gtint_t k       = std::get<5>(str.param);
+        dcomplex alpha  = std::get<6>(str.param);
+        dcomplex beta   = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        gtint_t ldc_inc = std::get<10>(str.param);
+        char datatype   = std::get<11>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zsyr2k_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zsyr2k";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zsyr2k";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa + tsb;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        // Complex scalars print as <real>pi<imag>; each part is truncated to int and, when not > 0, prefixed with "m".
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;  // beta gets its own "_b" tag (was mislabelled "_a")
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        return str_name + "_" + datatype;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zsyr2kTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n'),                                          // transa
+            ::testing::Values('n'),                                          // transb
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(dcomplex{-3.0, 2.0}, dcomplex{4.0, -1.0}),     // beta
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of b
+            ::testing::Values(gtint_t(0), gtint_t(6)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype for the random generators (i : integer; f presumably floating point - confirm)
+        ),
+        ::zsyr2kTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp
new file mode 100644
index 0000000000..092235019e
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syrk/csyrk_generic.cpp
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syrk.h"
+
+class csyrkTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // uplo
+                                                   char,       // transa
+                                                   gtint_t,    // m
+                                                   gtint_t,    // k
+                                                   scomplex,   // alpha
+                                                   scomplex,   // beta
+                                                   gtint_t,    // increment to the leading dim of a
+                                                   gtint_t,    // increment to the leading dim of c
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(csyrkTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // specifies beta value
+    T beta = std::get<6>(GetParam());
+    // lda, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<7>(GetParam());
+    gtint_t ldc_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors (scaled by the problem size m*k):
+    double thresh =  m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syrk<T>(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class csyrkTestPrint {  // Functor that builds the test name for one csyrkTest parameter tuple.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, scomplex, scomplex, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);  // storage format
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);  // transa
+        gtint_t m       = std::get<3>(str.param);
+        gtint_t k       = std::get<4>(str.param);
+        scomplex alpha  = std::get<5>(str.param);
+        scomplex beta   = std::get<6>(str.param);
+        gtint_t lda_inc = std::get<7>(str.param);
+        gtint_t ldc_inc = std::get<8>(str.param);
+        char datatype   = std::get<9>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "csyrk_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_csyrk";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_csyrk";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character, repeated three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;  // <real>pi<imag>; "m" prefixes a non-positive part
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;   // beta is tagged "_b" so it cannot be mistaken for alpha
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination (::testing::Combine) of the values below is run.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        csyrkTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values(scomplex{2.0, -1.0}, scomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(scomplex{-3.0, 2.0}, scomplex{4.0, -1.0}),     // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : scomplex  datatype type tested
+        ),
+        ::csyrkTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp
new file mode 100644
index 0000000000..af5d263e5c
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syrk/dsyrk_generic.cpp
@@ -0,0 +1,145 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syrk.h"
+
+class dsyrkTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // uplo
+                                                   char,       // transa
+                                                   gtint_t,    // m
+                                                   gtint_t,    // k
+                                                   double,     // alpha
+                                                   double,     // beta
+                                                   gtint_t,    // increment to the leading dim of a
+                                                   gtint_t,    // increment to the leading dim of c
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(dsyrkTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // specifies beta value
+    T beta = std::get<6>(GetParam());
+    // lda, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<7>(GetParam());
+    gtint_t ldc_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors (scaled by the problem size m*k):
+    double thresh =  m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syrk<T>(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class dsyrkTestPrint {  // Functor that builds the test name for one dsyrkTest parameter tuple.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, double, double, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);  // storage format
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);  // transa
+        gtint_t m       = std::get<3>(str.param);
+        gtint_t k       = std::get<4>(str.param);
+        double alpha    = std::get<5>(str.param);
+        double beta     = std::get<6>(str.param);
+        gtint_t lda_inc = std::get<7>(str.param);
+        gtint_t ldc_inc = std::get<8>(str.param);
+        char datatype   = std::get<9>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dsyrk_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dsyrk";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dsyrk";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character, repeated three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;  // integer part only; "m" prefixes a non-positive value
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;   // same encoding as alpha
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination (::testing::Combine) of the values below is run.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dsyrkTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n','t','c'),                                  // transa
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(9)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : double  datatype type tested
+        ),
+        ::dsyrkTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp
new file mode 100644
index 0000000000..a413c6f15c
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syrk/ssyrk_generic.cpp
@@ -0,0 +1,145 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syrk.h"
+
+class ssyrkTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // uplo
+                                                   char,       // transa
+                                                   gtint_t,    // m
+                                                   gtint_t,    // k
+                                                   float,      // alpha
+                                                   float,      // beta
+                                                   gtint_t,    // increment to the leading dim of a
+                                                   gtint_t,    // increment to the leading dim of c
+                                                   char>> {};  // datatype for the random generators
+
+TEST_P(ssyrkTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // specifies beta value
+    T beta = std::get<6>(GetParam());
+    // lda, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<7>(GetParam());
+    gtint_t ldc_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors (scaled by the problem size m*k):
+    double thresh =  m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syrk<T>(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class ssyrkTestPrint {  // Functor that builds the test name for one ssyrkTest parameter tuple.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, float, float, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);  // storage format
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);  // transa
+        gtint_t m       = std::get<3>(str.param);
+        gtint_t k       = std::get<4>(str.param);
+        float alpha     = std::get<5>(str.param);
+        float beta      = std::get<6>(str.param);
+        gtint_t lda_inc = std::get<7>(str.param);
+        gtint_t ldc_inc = std::get<8>(str.param);
+        char datatype   = std::get<9>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ssyrk_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ssyrk";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ssyrk";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character, repeated three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;  // integer part only; "m" prefixes a non-positive value
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;   // same encoding as alpha
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination (::testing::Combine) of the values below is run.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ssyrkTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // u:upper, l:lower
+            ::testing::Values('n','t','c'),                                  // transa
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  1.0),                                   // beta
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ssyrkTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/syrk/syrk.h b/gtestsuite/testsuite/level3/syrk/syrk.h
new file mode 100644
index 0000000000..ecbea4725e
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syrk/syrk.h
@@ -0,0 +1,156 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the symmetric rank-k update:
+ *        C := alpha*A*A**T + beta*C
+ *     or C := alpha*A**T*A + beta*C
+ * through the interface (BLAS, CBLAS or BLIS-typed) selected at compile time.
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     uplo   specifies if the upper or lower triangular part of C is used
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     m      specifies the number of rows and cols of C
+ * @param[in]     k      specifies the number of columns of A when A is not transposed,
+ *                       and the number of rows of A otherwise.
+ * @param[in]     alpha  specifies pointer to the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of A
+ * @param[in]     lda    specifies the leading dimension of A; the BLIS-typed wrapper uses it
+ *                       as the column stride (column-major) or row stride (row-major).
+ * @param[in]     beta   specifies pointer to the scalar beta.
+ * @param[in,out] cp     specifies pointer which points to the first element of C
+ * @param[in]     ldc    specifies the leading dimension of C; the BLIS-typed wrapper uses it
+ *                       as the column stride (column-major) or row stride (row-major).
+ */
+
+template<typename T>  // BLAS wrapper: picks s/d/c/zsyrk_ from T and passes every argument by address.
+static void syrk_(char uplo, char transa, gtint_t m, gtint_t k, T* alpha,
+                    T* ap, gtint_t lda,  T* beta, T* cp, gtint_t ldc )
+{
+    if constexpr (std::is_same<T, float>::value)
+        ssyrk_( &uplo, &transa, &m, &k, alpha, ap, &lda, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, double>::value)
+        dsyrk_( &uplo, &transa, &m, &k, alpha, ap, &lda, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        csyrk_( &uplo, &transa, &m, &k, alpha, ap, &lda, beta, cp, &ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        zsyrk_( &uplo, &transa, &m, &k, alpha, ap, &lda, beta, cp, &ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/syrk.h: Invalid typename in syrk_().");
+}
+
+template<typename T>  // CBLAS wrapper: converts the char options to CBLAS enums, then dispatches on T.
+static void cblas_syrk(char storage, char uplo, char trnsa,
+    gtint_t m, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* beta, T* cp, gtint_t ldc)
+{
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_UPLO cblas_uplo;
+    enum CBLAS_TRANSPOSE cblas_transa;
+
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );
+    testinghelpers::char_to_cblas_uplo( uplo, &cblas_uplo );
+    testinghelpers::char_to_cblas_trans( trnsa, &cblas_transa );
+
+    if constexpr (std::is_same<T, float>::value)  // real types: alpha and beta are passed by value
+        cblas_ssyrk( cblas_order, cblas_uplo, cblas_transa, m, k, *alpha, ap, lda, *beta, cp, ldc );
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_dsyrk( cblas_order, cblas_uplo, cblas_transa, m, k, *alpha, ap, lda, *beta, cp, ldc );
+    else if constexpr (std::is_same<T, scomplex>::value)  // complex types: alpha and beta are passed by pointer
+        cblas_csyrk( cblas_order, cblas_uplo, cblas_transa, m, k, alpha, ap, lda, beta, cp, ldc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_zsyrk( cblas_order, cblas_uplo, cblas_transa, m, k, alpha, ap, lda, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/syrk.h: Invalid typename in cblas_syrk().");
+}
+
+template<typename T>  // BLIS-typed wrapper: converts chars to BLIS enums and leading dims to row/column strides.
+static void typed_syrk(char storage, char uplo, char trnsa,
+    gtint_t m, gtint_t k, T* alpha, T* ap, gtint_t lda,
+    T* beta, T* cp, gtint_t ldc)
+{
+    trans_t transa;
+    uplo_t blis_uplo;
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+    dim_t rsa,csa;
+    dim_t rsc,csc;
+
+    rsa=rsc=1;  // all strides start at 1 and stay 1 if storage is neither 'c' nor 'r'
+    csa=csc=1;
+    /* a = m x k, c = m x m; column-major sets the column strides, row-major the row strides. */
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsc = ldc ;
+    }
+
+    if constexpr (std::is_same<T, float>::value)
+        bli_ssyrk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dsyrk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_csyrk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_zsyrk( blis_uplo, transa, m, k, alpha, ap, rsa, csa, beta, cp, rsc, csc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/syrk.h: Invalid typename in typed_syrk().");
+}
+
+template<typename T>  // Entry point used by the tests: forwards to the interface chosen by TEST_BLAS / TEST_CBLAS / TEST_BLIS_TYPED.
+static void syrk( char storage, char uplo, char transa, gtint_t m, gtint_t k,
+    T* alpha, T* ap, gtint_t lda, T* beta, T* cp, gtint_t ldc )
+{
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )  // the BLAS wrapper is only called for column-major storage
+        syrk_<T>( uplo, transa, m, k, alpha, ap, lda, beta, cp, ldc );
+    else
+        throw std::runtime_error("Error in testsuite/level3/syrk.h: BLAS interface cannot be tested for row-major order.");
+#elif TEST_CBLAS
+    cblas_syrk<T>( storage, uplo, transa, m, k, alpha, ap, lda, beta, cp, ldc );
+#elif TEST_BLIS_TYPED
+    typed_syrk<T>( storage, uplo, transa, m, k, alpha, ap, lda, beta, cp, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/syrk.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/syrk/test_syrk.h b/gtestsuite/testsuite/level3/syrk/test_syrk.h
new file mode 100644
index 0000000000..9c8585e64a
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syrk/test_syrk.h
@@ -0,0 +1,79 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "syrk.h"
+#include "level3/ref_syrk.h"
+#include "inc/check_error.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>  // Runs syrk on generated data and compares the result with the reference implementation.
+void test_syrk( char storage, char uplo, char transa,
+    gtint_t m, gtint_t k,
+    gtint_t lda_inc, gtint_t ldc_inc,
+    T alpha, T beta,
+    double thresh, char datatype
+) {
+    // Compute the leading dimensions of a and c.
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, m, k, lda_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, m, ldc_inc);
+
+    //----------------------------------------------------------
+    //  Initialize matrices with random numbers (kind chosen by datatype).
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>( -2, 8, storage, transa, m, k, lda, datatype );
+    // Since matrix C, stored in c, is symmetric, we only use the upper or lower
+    // part in the computation of syrk and zero-out the rest to ensure
+    // that code operates as expected.
+    std::vector<T> c = testinghelpers::get_random_matrix<T>( -3, 5, storage, uplo, m, ldc, datatype );
+
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+    //----------------------------------------------------------
+    //     Call the syrk wrapper under test (see syrk.h).
+    //----------------------------------------------------------
+    syrk<T>( storage, uplo, transa, m, k, &alpha, a.data(), lda,
+                &beta, c.data(), ldc );
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_syrk<T>( storage, uplo, transa, m, k, alpha,
+               a.data(), lda, beta, c_ref.data(), ldc );
+    //----------------------------------------------------------
+    //              check component-wise error.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, m, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp
new file mode 100644
index 0000000000..7bb7d9cedf
--- /dev/null
+++ b/gtestsuite/testsuite/level3/syrk/zsyrk_generic.cpp
@@ -0,0 +1,147 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_syrk.h"
+
+class zsyrkTest :  // parameterized fixture for the zsyrk tests
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // uplo
+                                                   char,      // transa
+                                                   gtint_t,   // m
+                                                   gtint_t,   // k
+                                                   dcomplex,  // alpha
+                                                   dcomplex,  // beta
+                                                   gtint_t,   // lda increment
+                                                   gtint_t,   // ldc increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(zsyrkTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies if the upper or lower triangular part of C is used
+    char uplo = std::get<1>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<2>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<3>(GetParam());
+    // matrix size k
+    gtint_t k  = std::get<4>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<5>(GetParam());
+    // specifies beta value
+    T beta = std::get<6>(GetParam());
+    // lda, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<7>(GetParam());
+    gtint_t ldc_inc = std::get<8>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<9>(GetParam());
+
+    // Set the threshold for the errors, scaled by m*k:
+    double thresh = m*k*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_syrk<T>(storage, uplo, transa, m, k, lda_inc, ldc_inc, alpha, beta, thresh, datatype);
+}
+
+class zsyrkTestPrint {  // name generator handed to INSTANTIATE_TEST_SUITE_P for zsyrkTest
+public:
+    std::string operator()(  // builds the test-name string from one parameter tuple
+        testing::TestParamInfo<std::tuple<char, char, char, gtint_t, gtint_t, dcomplex, dcomplex, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char uplo       = std::get<1>(str.param);
+        char tsa        = std::get<2>(str.param);
+        gtint_t m       = std::get<3>(str.param);
+        gtint_t k       = std::get<4>(str.param);
+        dcomplex alpha  = std::get<5>(str.param);
+        dcomplex beta   = std::get<6>(str.param);
+        gtint_t lda_inc = std::get<7>(str.param);
+        gtint_t ldc_inc = std::get<8>(str.param);
+        char datatype   = std::get<9>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "zsyrk_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_zsyrk";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_zsyrk";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character repeated three times
+        str_name = str_name + "_" + uplo;
+        str_name = str_name + "_" + tsa;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(k);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;  // alpha as <real>pi<imag>; "m" prefixes a non-positive part
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_b" + beta_str;  // beta gets its own "_b" tag so it is not mistaken for alpha
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        zsyrkTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is omitted when TEST_BLAS is defined)
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // k: 10, 20, 30
+            ::testing::Values(dcomplex{2.0, -1.0}, dcomplex{-2.0, 3.0}),     // alpha
+            ::testing::Values(dcomplex{-3.0, 2.0}, dcomplex{4.0, -1.0}),     // beta
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested, i : integer, f : dcomplex
+        ),
+        ::zsyrkTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp
new file mode 100644
index 0000000000..a875f77282
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm/ctrmm_generic.cpp
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm.h"
+
+class ctrmmTest :  // parameterized fixture for the ctrmm tests
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // side
+                                                   char,      // uploa
+                                                   char,      // transa
+                                                   char,      // diaga
+                                                   gtint_t,   // m
+                                                   gtint_t,   // n
+                                                   scomplex,  // alpha
+                                                   gtint_t,   // lda increment
+                                                   gtint_t,   // ldb increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(ctrmmTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears on the left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors, scaled by m*n:
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trmm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class ctrmmTestPrint {  // name generator handed to INSTANTIATE_TEST_SUITE_P for ctrmmTest
+public:
+    std::string operator()(  // builds the test-name string from one parameter tuple
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, scomplex, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char diaga      = std::get<4>(str.param);
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        scomplex alpha  = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        char datatype   = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ctrmm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ctrmm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ctrmm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character repeated three times
+        str_name = str_name + "_" + side + uploa + transa;
+        str_name = str_name + "_d" + diaga;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;  // alpha as <real>pi<imag>; "m" prefixes a non-positive part
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ctrmmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is omitted when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga  n:nonunit, u:unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested, i : integer, f : float
+        ),
+        ::ctrmmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp
new file mode 100644
index 0000000000..94fb07ba3c
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm/dtrmm_generic.cpp
@@ -0,0 +1,149 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm.h"
+
+class dtrmmTest :  // parameterized fixture for the dtrmm tests
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // side
+                                                   char,      // uploa
+                                                   char,      // transa
+                                                   char,      // diaga
+                                                   gtint_t,   // m
+                                                   gtint_t,   // n
+                                                   double,    // alpha
+                                                   gtint_t,   // lda increment
+                                                   gtint_t,   // ldb increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(dtrmmTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears on the left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors, scaled by m*n:
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trmm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class dtrmmTestPrint {  // name generator handed to INSTANTIATE_TEST_SUITE_P for dtrmmTest
+public:
+    std::string operator()(  // builds the test-name string from one parameter tuple
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, double, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char diaga      = std::get<4>(str.param);
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        double alpha    = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        char datatype   = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dtrmm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_dtrmm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_dtrmm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character repeated three times
+        str_name = str_name + "_" + side + uploa + transa;
+        str_name = str_name + "_d" + diaga;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;  // alpha truncated to int; "m" prefixes a non-positive value
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dtrmmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is omitted when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga  n:nonunit, u:unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested, i : integer, f : float
+        ),
+        ::dtrmmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp
new file mode 100644
index 0000000000..df2287c90a
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm/strmm_generic.cpp
@@ -0,0 +1,149 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm.h"
+
+class strmmTest :  // parameterized fixture for the strmm tests
+        public ::testing::TestWithParam<std::tuple<char,      // storage format
+                                                   char,      // side
+                                                   char,      // uploa
+                                                   char,      // transa
+                                                   char,      // diaga
+                                                   gtint_t,   // m
+                                                   gtint_t,   // n
+                                                   float,     // alpha
+                                                   gtint_t,   // lda increment
+                                                   gtint_t,   // ldb increment
+                                                   char>> {}; // datatype for the random generators
+
+TEST_P(strmmTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format (row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears on the left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for the random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors, scaled by 20*m*n:
+    double thresh = 20*m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trmm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class strmmTestPrint {  // name generator handed to INSTANTIATE_TEST_SUITE_P for strmmTest
+public:
+    std::string operator()(  // builds the test-name string from one parameter tuple
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, float, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char diaga      = std::get<4>(str.param);
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        float alpha     = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        char datatype   = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "strmm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_strmm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_strmm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;  // storage character repeated three times
+        str_name = str_name + "_" + side + uploa + transa;
+        str_name = str_name + "_d" + diaga;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        str_name = str_name + "_a" + alpha_str;  // alpha truncated to int; "m" prefixes a non-positive value
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        strmmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is omitted when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga  n:nonunit, u:unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested, i : integer, f : float
+        ),
+        ::strmmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trmm/test_trmm.h b/gtestsuite/testsuite/level3/trmm/test_trmm.h
new file mode 100644
index 0000000000..1993127bae
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm/test_trmm.h
@@ -0,0 +1,78 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "trmm.h"
+#include "level3/ref_trmm.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_trmm( char storage, char side, char uploa, char transa,
+    char diaga, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc,
+    gtint_t ldb_inc, double thresh, char datatype ) {
+    // Runs trmm under test and the reference trmm on the same inputs, then compares B.
+    gtint_t mn;
+    testinghelpers::set_dim_with_side( side, m, n, &mn );   // mn is used below as both dimensions of A
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, mn, mn, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldb_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrices with random values.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 8, storage, transa, mn, mn, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(-5, 2, storage, 'n', m, n, ldb, datatype);
+
+    // Create a copy of b so that we can check reference results.
+    std::vector<T> b_ref(b);
+    // NOTE(review): mktrim presumably clears the part of A outside the uploa triangle - confirm.
+    mktrim<T>( storage, uploa, mn, a.data(), lda );
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    trmm<T>( storage, side, uploa, transa, diaga, m, n, &alpha, a.data(), lda, b.data(), ldb );
+
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_trmm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), lda, b_ref.data(), ldb );
+
+    //----------------------------------------------------------
+    //     check component-wise error of b against b_ref.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, n, b.data(), b_ref.data(), ldb, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/trmm/trmm.h b/gtestsuite/testsuite/level3/trmm/trmm.h
new file mode 100644
index 0000000000..267aa41e7e
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm/trmm.h
@@ -0,0 +1,170 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        B := alpha*op( A )*B,   or   B := alpha*B*op( A ),
+ * where  op( A ) is one of
+ *        op( A ) = A,   op( A ) = A**T   or   op( A ) = A**H,
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     side   specifies if the triangular matrix A appears left or right in
+                         the matrix multiplication
+ * @param[in]     uplo   specifies if the upper or lower triangular part of A is used
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     diaga  specifies whether the triangular matrix A has a unit or non-unit diagonal
+ * @param[in]     m      specifies the number of rows of the matrix B
+                         (and the order of A when A appears on the left)
+ * @param[in]     n      specifies the number of columns of the matrix B
+                         (and the order of A when A appears on the right)
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies the leading dimension of ap; the typed BLIS
+                         interface uses it as the column or row stride of ap.
+ * @param[in,out] bp     specifies pointer which points to the first element of bp
+ * @param[in]     ldb    specifies the leading dimension of bp; the typed BLIS
+                         interface uses it as the column or row stride of bp.
+ */
+
+template<typename T>
+static void trmm_( char side, char uploa, char transa, char diaga, gtint_t m,
+               gtint_t n, T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb )
+{   // BLAS-style wrapper: options, sizes and leading dimensions are passed by address.
+    if constexpr (std::is_same<T, float>::value)            // pick the routine matching T
+        strmm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    else if constexpr (std::is_same<T, double>::value)
+        dtrmm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        ctrmm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        ztrmm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    else                                                    // any other T is rejected at run time
+        throw std::runtime_error("Error in testsuite/level3/trmm.h: Invalid typename in trmm_().");
+}
+
+template<typename T>
+static void cblas_trmm( char storage, char side, char uploa, char transa,
+    char diaga, gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb )
+{   // CBLAS wrapper: converts the character options to CBLAS enums, then dispatches on T.
+    enum CBLAS_ORDER cblas_order;
+    enum CBLAS_SIDE cblas_side;
+    enum CBLAS_UPLO cblas_uploa;
+    enum CBLAS_TRANSPOSE cblas_transa;
+    enum CBLAS_DIAG cblas_diaga;
+    // Translate each character option into its CBLAS enum counterpart.
+    testinghelpers::char_to_cblas_order( storage, &cblas_order );
+    testinghelpers::char_to_cblas_side( side, &cblas_side );
+    testinghelpers::char_to_cblas_uplo( uploa, &cblas_uploa );
+    testinghelpers::char_to_cblas_trans( transa, &cblas_transa );
+    testinghelpers::char_to_cblas_diag( diaga, &cblas_diaga );
+    // The real variants receive alpha by value, the complex variants receive the pointer.
+    if constexpr (std::is_same<T, float>::value)
+        cblas_strmm( cblas_order, cblas_side, cblas_uploa, cblas_transa, cblas_diaga, m, n, *alpha, ap, lda, bp, ldb );
+    else if constexpr (std::is_same<T, double>::value)
+        cblas_dtrmm( cblas_order, cblas_side, cblas_uploa, cblas_transa, cblas_diaga, m, n, *alpha, ap, lda, bp, ldb );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        cblas_ctrmm( cblas_order, cblas_side, cblas_uploa, cblas_transa, cblas_diaga, m, n, alpha, ap, lda, bp, ldb );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        cblas_ztrmm( cblas_order, cblas_side, cblas_uploa, cblas_transa, cblas_diaga, m, n, alpha, ap, lda, bp, ldb );
+    else
+        throw std::runtime_error("Error in testsuite/level3/trmm.h: Invalid typename in cblas_trmm().");
+}
+
+template<typename T>
+static void typed_trmm( char storage, char side, char uplo, char trans,
+    char diag, gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb )
+{   // BLIS typed-API wrapper: maps options to BLIS constants and lda/ldb to row/column strides.
+    side_t  sidea;
+    uplo_t  uploa;
+    trans_t transa;
+    diag_t  diaga;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_side( side, &sidea );
+    testinghelpers::char_to_blis_uplo( uplo, &uploa );
+    testinghelpers::char_to_blis_trans( trans, &transa );
+    testinghelpers::char_to_blis_diag( diag, &diaga );
+
+    dim_t rsa,csa;
+    dim_t rsb,csb;
+    // Default to unit strides; the leading dimension replaces one of them below.
+    rsa=rsb=1;
+    csa=csb=1;
+    /* column-major: column stride = ld;   row-major: row stride = ld */
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csb = ldb ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsb = ldb ;
+    }
+    // Dispatch on T to the matching BLIS typed routine.
+    if constexpr (std::is_same<T, float>::value)
+        bli_strmm( sidea, uploa, transa, diaga, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dtrmm( sidea, uploa, transa, diaga, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_ctrmm( sidea, uploa, transa, diaga, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_ztrmm( sidea, uploa, transa, diaga, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    else
+        throw std::runtime_error("Error in testsuite/level3/trmm.h: Invalid typename in typed_trmm().");
+}
+
+template<typename T>
+static void trmm( char storage, char side, char uploa, char transa, char diaga,
+    gtint_t m, gtint_t n, T *alpha, T *ap, gtint_t lda, T *bp, gtint_t ldb )
+{   // Forwards to the interface selected at compile time: TEST_BLAS, TEST_CBLAS or TEST_BLIS_TYPED.
+#ifdef TEST_BLAS
+    if( storage == 'c' || storage == 'C' )   // the BLAS wrapper takes no storage argument
+        trmm_<T>( side, uploa, transa, diaga, m, n, alpha, ap, lda, bp, ldb );
+    else
+        throw std::runtime_error("Error in testsuite/level3/trmm.h: BLAS interface cannot be tested for row-major order.");
+
+#elif TEST_CBLAS
+    cblas_trmm<T>( storage, side, uploa, transa, diaga, m, n, alpha, ap, lda, bp, ldb );
+#elif TEST_BLIS_TYPED
+    typed_trmm<T>( storage, side, uploa, transa, diaga, m, n, alpha, ap, lda, bp, ldb );
+#else
+    throw std::runtime_error("Error in testsuite/level3/trmm.h: No interfaces are set to be tested.");   // no interface macro defined
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp
new file mode 100644
index 0000000000..823f9fdcf3
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm/ztrmm_generic.cpp
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm.h"
+
+class ztrmmTest :                                                        // value-parameterized fixture for the dcomplex trmm tests
+        public ::testing::TestWithParam<std::tuple<char,                 // storage
+                                                   char,                 // side
+                                                   char,                 // uploa
+                                                   char,                 // transa
+                                                   char,                 // diaga
+                                                   gtint_t,              // m
+                                                   gtint_t,              // n
+                                                   dcomplex,             // alpha
+                                                   gtint_t,              // lda_inc
+                                                   gtint_t,              // ldb_inc
+                                                   char>> {};            // datatype
+
+TEST_P(ztrmmTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors (epsilon of T scaled by m*n):
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trmm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class ztrmmTestPrint {   // Functor that turns a ztrmmTest parameter tuple into the printed test name.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, dcomplex, gtint_t, gtint_t,char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char diaga      = std::get<4>(str.param);
+        gtint_t m       = std::get<5>(str.param);
+        gtint_t n       = std::get<6>(str.param);
+        dcomplex alpha  = std::get<7>(str.param);
+        gtint_t lda_inc = std::get<8>(str.param);
+        gtint_t ldb_inc = std::get<9>(str.param);
+        char datatype   = std::get<10>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "ztrmm_";
+#elif TEST_CBLAS
+        std::string str_name = "cblas_ztrmm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "blis_ztrmm";
+#endif
+        str_name = str_name + "_" + sfm+sfm+sfm;                 // storage character, repeated three times
+        str_name = str_name + "_" + side + uploa + transa;
+        str_name = str_name + "_d" + diaga;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        str_name = str_name + "_a" + alpha_str;                  // alpha as <re>pi<im>; 'm' prefixes a part that is not > 0, values are truncated to int
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing: every combination of the parameter values listed below.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ztrmmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m = 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n = 10, 20, 30
+            ::testing::Values(dcomplex{1.0,2.0}),                            // alpha
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(1)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated values  i : integer, f : float
+        ),
+        ::ztrmmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp
new file mode 100644
index 0000000000..a10d9866ef
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm3/ctrmm3_generic.cpp
@@ -0,0 +1,162 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm3.h"
+
+class ctrmm3Test :                                                       // value-parameterized fixture for the scomplex trmm3 tests
+        public ::testing::TestWithParam<std::tuple<char,                 // storage
+                                                   char,                 // side
+                                                   char,                 // uploa
+                                                   char,                 // transa
+                                                   char,                 // transb
+                                                   char,                 // diaga
+                                                   gtint_t,              // m
+                                                   gtint_t,              // n
+                                                   scomplex,             // alpha
+                                                   scomplex,             // beta
+                                                   gtint_t,              // lda_inc
+                                                   gtint_t,              // ldb_inc
+                                                   gtint_t,              // ldc_inc
+                                                   char>> {};            // datatype
+// The suite is only instantiated when TEST_BLIS_TYPED is defined, so allow it to stay uninstantiated.
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ctrmm3Test);
+
+TEST_P(ctrmm3Test, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<4>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<5>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<6>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<7>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<8>(GetParam());
+    // specifies beta value
+    T beta = std::get<9>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<10>(GetParam());
+    gtint_t ldb_inc = std::get<11>(GetParam());
+    gtint_t ldc_inc = std::get<12>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype   = std::get<13>(GetParam());
+
+    // Set the threshold for the errors (epsilon of T scaled by m*n):
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trmm3<T>( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype );
+}
+
+class ctrmm3TestPrint {   // Functor that turns a ctrmm3Test parameter tuple into the printed test name.
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, char, gtint_t, gtint_t, scomplex, scomplex, gtint_t, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char transb     = std::get<4>(str.param);
+        char diaga      = std::get<5>(str.param);
+        gtint_t m       = std::get<6>(str.param);
+        gtint_t n       = std::get<7>(str.param);
+        scomplex alpha  = std::get<8>(str.param);
+        scomplex beta   = std::get<9>(str.param);
+        gtint_t lda_inc = std::get<10>(str.param);
+        gtint_t ldb_inc = std::get<11>(str.param);
+        gtint_t ldc_inc = std::get<12>(str.param);
+        char datatype   = std::get<13>(str.param);
+        std::string str_name = "blis_ctrmm3";                    // fixed prefix: no BLAS/CBLAS variant is named here
+        str_name = str_name + "_" + sfm+sfm+sfm;                 // storage character, repeated three times
+        str_name = str_name + "_" + side + uploa + transa + transb;
+        str_name = str_name + "_d" + diaga;
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        std::string alpha_str = ( alpha.real > 0) ? std::to_string(int(alpha.real)) : ("m" + std::to_string(int(std::abs(alpha.real))));
+                    alpha_str = alpha_str + "pi" + (( alpha.imag > 0) ? std::to_string(int(alpha.imag)) : ("m" + std::to_string(int(std::abs(alpha.imag)))));
+        std::string beta_str = ( beta.real > 0) ? std::to_string(int(beta.real)) : ("m" + std::to_string(int(std::abs(beta.real))));
+                    beta_str = beta_str + "pi" + (( beta.imag > 0) ? std::to_string(int(beta.imag)) : ("m" + std::to_string(int(std::abs(beta.imag)))));
+        str_name = str_name + "_a" + alpha_str;                  // scalars as <re>pi<im>; 'm' prefixes a part that is not > 0, values are truncated to int
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; instantiated only when TEST_BLIS_TYPED is defined.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ctrmm3Test,
+        ::testing::Combine(
+            ::testing::Values('c','r'),                                      // storage format
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t','c'),                                  // transa
+            ::testing::Values('n'),                                          // transb /*transb works only for 'n' case*/
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m = 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n = 10, 20, 30
+            ::testing::Values(scomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(scomplex{-1.0,1.0}),                           // beta
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of a
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of b
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype of the generated values  i : integer, f : float
+        ),
+        ::ctrmm3TestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp
new file mode 100644
index 0000000000..222d70604e
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm3/dtrmm3_generic.cpp
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm3.h"
+
+class dtrmm3Test :   // parameterized fixture for the double-precision trmm3 tests
+        public ::testing::TestWithParam<std::tuple<char,       // 0: storage format
+                                                   char,       // 1: side
+                                                   char,       // 2: uplo of a
+                                                   char,       // 3: transa
+                                                   char,       // 4: transb
+                                                   char,       // 5: diaga
+                                                   gtint_t,    // 6: m
+                                                   gtint_t,    // 7: n
+                                                   double,     // 8: alpha
+                                                   double,     // 9: beta
+                                                   gtint_t,    // 10: lda increment
+                                                   gtint_t,    // 11: ldb increment
+                                                   gtint_t,    // 12: ldc increment
+                                                   char>> {};  // 13: datatype for the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(dtrmm3Test);  // the suite is only instantiated under TEST_BLIS_TYPED
+
+TEST_P(dtrmm3Test, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<4>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<5>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<6>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<7>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<8>(GetParam());
+    // specifies beta value
+    T beta = std::get<9>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<10>(GetParam());
+    gtint_t ldb_inc = std::get<11>(GetParam());
+    gtint_t ldc_inc = std::get<12>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype   = std::get<13>(GetParam());
+
+    // Set the threshold for the errors: m*n times getEpsilon<T>().
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //     (diaga is passed ahead of transb).
+    test_trmm3<T>( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype );
+}
+
+class dtrmm3TestPrint {   // functor passed to INSTANTIATE_TEST_SUITE_P to build the name of each test instance
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, char, gtint_t, gtint_t, double, double, gtint_t, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);   // storage format
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char transb     = std::get<4>(str.param);
+        char diaga      = std::get<5>(str.param);
+        gtint_t m       = std::get<6>(str.param);
+        gtint_t n       = std::get<7>(str.param);
+        double alpha    = std::get<8>(str.param);
+        double beta     = std::get<9>(str.param);
+        gtint_t lda_inc = std::get<10>(str.param);
+        gtint_t ldb_inc = std::get<11>(str.param);
+        gtint_t ldc_inc = std::get<12>(str.param);
+        char datatype   = std::get<13>(str.param);
+        std::string str_name = "blis_dtrmm3";
+        str_name = str_name + "_" + sfm+sfm+sfm;                       // storage char repeated three times
+        str_name = str_name + "_" + side + uploa + transa + transb;    // side, uplo, transa, transb as one group
+        str_name = str_name + "_d" + diaga;                            // "d" followed by the diag char
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        // Scalars are printed as their truncated integer value; any value that is not > 0 gets an "m" prefix (so 0 prints as "m0").
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);           // leading-dimension increments: a, b, c
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; instantiated only when TEST_BLIS_TYPED is defined.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dtrmm3Test,
+        ::testing::Combine(
+            ::testing::Values('c','r'),                                      // storage format
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n'),                                          // transb (works only for the 'n' case)
+            ::testing::Values('n','u'),                                      // diaga  n:non-unit, u:unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30 (end of Range is exclusive)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30 (end of Range is exclusive)
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  2.0),                                   // beta
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of a
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of b
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested  i:integer, f:float
+        ),
+        ::dtrmm3TestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp
new file mode 100644
index 0000000000..df6e4e9bee
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm3/strmm3_generic.cpp
@@ -0,0 +1,160 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm3.h"
+
+class strmm3Test :   // parameterized fixture for the single-precision trmm3 tests
+        public ::testing::TestWithParam<std::tuple<char,       // 0: storage format
+                                                   char,       // 1: side
+                                                   char,       // 2: uplo of a
+                                                   char,       // 3: transa
+                                                   char,       // 4: transb
+                                                   char,       // 5: diaga
+                                                   gtint_t,    // 6: m
+                                                   gtint_t,    // 7: n
+                                                   float,      // 8: alpha
+                                                   float,      // 9: beta
+                                                   gtint_t,    // 10: lda increment
+                                                   gtint_t,    // 11: ldb increment
+                                                   gtint_t,    // 12: ldc increment
+                                                   char>> {};  // 13: datatype for the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(strmm3Test);  // the suite is only instantiated under TEST_BLIS_TYPED
+
+TEST_P(strmm3Test, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<4>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<5>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<6>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<7>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<8>(GetParam());
+    // specifies beta value
+    T beta = std::get<9>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<10>(GetParam());
+    gtint_t ldb_inc = std::get<11>(GetParam());
+    gtint_t ldc_inc = std::get<12>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype   = std::get<13>(GetParam());
+
+    // Set the threshold for the errors: m*n times getEpsilon<T>().
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //     (diaga is passed ahead of transb).
+    test_trmm3<T>( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype );
+}
+
+class strmm3TestPrint {   // functor passed to INSTANTIATE_TEST_SUITE_P to build the name of each test instance
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, char, gtint_t, gtint_t, float, float, gtint_t, gtint_t, gtint_t, char>> str) const {
+        char sfm        = std::get<0>(str.param);   // storage format
+        char side       = std::get<1>(str.param);
+        char uploa      = std::get<2>(str.param);
+        char transa     = std::get<3>(str.param);
+        char transb     = std::get<4>(str.param);
+        char diaga      = std::get<5>(str.param);
+        gtint_t m       = std::get<6>(str.param);
+        gtint_t n       = std::get<7>(str.param);
+        float alpha     = std::get<8>(str.param);
+        float beta      = std::get<9>(str.param);
+        gtint_t lda_inc = std::get<10>(str.param);
+        gtint_t ldb_inc = std::get<11>(str.param);
+        gtint_t ldc_inc = std::get<12>(str.param);
+        char datatype   = std::get<13>(str.param);
+        std::string str_name = "blis_strmm3";
+        str_name = str_name + "_" + sfm+sfm+sfm;                       // storage char repeated three times
+        str_name = str_name + "_" + side + uploa + transa + transb;    // side, uplo, transa, transb as one group
+        str_name = str_name + "_d" + diaga;                            // "d" followed by the diag char
+        str_name = str_name + "_" + std::to_string(m);
+        str_name = str_name + "_" + std::to_string(n);
+        // Scalars are printed as their truncated integer value; any value that is not > 0 gets an "m" prefix (so 0 prints as "m0").
+        std::string alpha_str = ( alpha > 0) ? std::to_string(int(alpha)) : "m" + std::to_string(int(std::abs(alpha)));
+        std::string beta_str = ( beta > 0) ? std::to_string(int(beta)) : "m" + std::to_string(int(std::abs(beta)));
+        str_name = str_name + "_b" + beta_str;
+        str_name = str_name + "_" + std::to_string(lda_inc);           // leading-dimension increments: a, b, c
+        str_name = str_name + "_" + std::to_string(ldb_inc);
+        str_name = str_name + "_" + std::to_string(ldc_inc);
+        str_name = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing; instantiated only when TEST_BLIS_TYPED is defined.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        strmm3Test,
+        ::testing::Combine(
+            ::testing::Values('c','r'),                                      // storage format
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n'),                                          // transb (works only for the 'n' case)
+            ::testing::Values('n','u'),                                      // diaga  n:non-unit, u:unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30 (end of Range is exclusive)
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30 (end of Range is exclusive)
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(-1.0,  2.0),                                   // beta
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of a
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of b
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype tested  i:integer, f:float
+        ),
+        ::strmm3TestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/trmm3/test_trmm3.h b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h
new file mode 100644
index 0000000000..779f2fef50
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm3/test_trmm3.h
@@ -0,0 +1,81 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "trmm3.h"
+#include "level3/ref_trmm3.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+template<typename T>
+void test_trmm3( char storage, char side, char uploa, char transa, char diaga,
+  char transb, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc, gtint_t ldb_inc,
+  T beta, gtint_t ldc_inc, double thresh, char datatype ) {
+    // Runs trmm3 on randomly generated A (mn x mn), B and C (m x n) and compares C with the reference result.
+    gtint_t mn;
+    testinghelpers::set_dim_with_side( side, m, n, &mn );
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, mn, mn, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, transb, m, n, ldb_inc);
+    gtint_t ldc = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldc_inc);
+
+    //----------------------------------------------------------
+    //        Initialize matrices with random values.
+    //----------------------------------------------------------
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(-2, 8, storage, transa, mn, mn, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(-5, 2, storage, transb, m, n, ldb, datatype);
+    std::vector<T> c = testinghelpers::get_random_matrix<T>(-3, 5, storage, 'n', m, n, ldc, datatype);
+
+    // Create a copy of c so that we can check reference results.
+    std::vector<T> c_ref(c);
+    // NOTE(review): mktrim presumably clears the part of A not selected by uploa - confirm.
+    mktrim<T>( storage, uploa, mn, a.data(), lda );
+    //----------------------------------------------------------
+    //                  Call BLIS function
+    //----------------------------------------------------------
+    trmm3<T>( storage, side, uploa, transa, diaga, transb, m, n, &alpha,
+                    a.data(), lda, b.data(), ldb, &beta, c.data(), ldc );
+    //----------------------------------------------------------
+    //                  Call reference implementation.
+    //----------------------------------------------------------
+    testinghelpers::ref_trmm3( storage, side, uploa, transa, diaga, transb,
+          m, n, alpha, a.data(), lda, b.data(), ldb, beta, c_ref.data(), ldc );
+
+    //----------------------------------------------------------
+    //  check component-wise error; c and c_ref both use ldc.
+    //----------------------------------------------------------
+    computediff<T>( storage, m, n, c.data(), c_ref.data(), ldc, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/trmm3/trmm3.h b/gtestsuite/testsuite/level3/trmm3/trmm3.h
new file mode 100644
index 0000000000..2bd52db11a
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm3/trmm3.h
@@ -0,0 +1,139 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Performs the operation:
+ *        C := beta * C_orig + alpha * transa(A) * transb(B)
+ * @param[in]     storage specifies storage format used for the matrices
+                         ('c'/'C' column-major, 'r'/'R' row-major)
+ * @param[in]     side   specifies if the triangular matrix A appears left or right in
+                         the matrix multiplication
+ * @param[in]     uplo   specifies if the upper or lower triangular part of A is used
+ * @param[in]     trnsa  specifies the form of op( A ) to be used in
+                         the matrix multiplication
+ * @param[in]     diag   specifies whether A has a unit or non-unit diagonal
+ * @param[in]     trnsb  specifies the form of op( B ) to be used in
+                         the matrix multiplication
+ * @param[in]     m      specifies the number of rows of the matrix
+                         op( B ) and the number of rows of the matrix C
+ * @param[in]     n      specifies the number of columns of the matrix
+                         op( B ) and the number of columns of the matrix C
+ * @param[in]     alpha  specifies pointer to the scalar alpha.
+ * @param[in]     a      specifies pointer which points to the first element of a
+ * @param[in]     lda    specifies the leading dimension of a; passed to BLIS as the
+                         column stride for 'c' storage or the row stride for 'r' storage.
+ * @param[in]     b      specifies pointer which points to the first element of b
+ * @param[in]     ldb    specifies the leading dimension of b (used like lda).
+ * @param[in]     beta   specifies pointer to the scalar beta.
+ * @param[in,out] c      specifies pointer which points to the first element of c
+ * @param[in]     ldc    specifies the leading dimension of c (used like lda).
+ * @throws std::runtime_error if T is not float, double, scomplex or dcomplex.
+ */
+
+template<typename T>
+static void typed_trmm3( char storage, char side, char uplo, char trnsa,
+    char diag, char trnsb, gtint_t m, gtint_t n, T *alpha, T *a, gtint_t lda,
+    T *b, gtint_t ldb, T *beta, T *c, gtint_t ldc )
+{
+    side_t  sidea;
+    uplo_t  uploa;
+    trans_t transa;
+    trans_t transb;
+    diag_t  diaga;
+
+    // Map parameter characters to BLIS constants.
+    testinghelpers::char_to_blis_side( side, &sidea );
+    testinghelpers::char_to_blis_uplo( uplo, &uploa );
+    testinghelpers::char_to_blis_trans( trnsa, &transa );
+    testinghelpers::char_to_blis_trans( trnsb, &transb );
+    testinghelpers::char_to_blis_diag( diag, &diaga );
+
+    dim_t rsa,csa;   // row / column stride of a
+    dim_t rsb,csb;   // row / column stride of b
+    dim_t rsc,csc;   // row / column stride of c
+    // Start with unit strides; the storage format selects which stride takes the leading dimension.
+    rsa=rsb=rsc=1;
+    csa=csb=csc=1;
+    if( (storage == 'c') || (storage == 'C') ) {
+        csa = lda ;
+        csb = ldb ;
+        csc = ldc ;
+    }
+    else if( (storage == 'r') || (storage == 'R') ) {
+        rsa = lda ;
+        rsb = ldb ;
+        rsc = ldc ;
+    }   // any other storage value leaves every stride at 1
+    // Select the bli_?trmm3 routine that matches T at compile time.
+    if constexpr (std::is_same<T, float>::value) {
+        bli_strmm3( sidea, uploa, transa, diaga, transb, m, n, alpha,
+                      a, rsa, csa, b, rsb, csb, beta, c, rsc, csc );
+    }
+    else if constexpr (std::is_same<T, double>::value) {
+        bli_dtrmm3( sidea, uploa, transa, diaga, transb, m, n, alpha,
+                      a, rsa, csa, b, rsb, csb, beta, c, rsc, csc );
+    }
+    else if constexpr (std::is_same<T, scomplex>::value)  {
+        bli_ctrmm3( sidea, uploa, transa, diaga, transb, m, n, alpha,
+                      a, rsa, csa, b, rsb, csb, beta, c, rsc, csc );
+    }
+    else if constexpr (std::is_same<T, dcomplex>::value)  {
+        bli_ztrmm3( sidea, uploa, transa, diaga, transb, m, n, alpha,
+                      a, rsa, csa, b, rsb, csb, beta, c, rsc, csc );
+    }
+    else   // T is none of the four supported element types
+        throw std::runtime_error("Error in testsuite/level3/trmm3.h: Invalid typename in typed_trmm3().");
+}
+
+template<typename T>
+static void trmm3( char storage, char side, char uploa, char transa, char diaga,
+                  char transb, gtint_t m, gtint_t n, T *alpha, T *ap, gtint_t lda,
+                  T *bp, gtint_t ldb, T *beta, T *c, gtint_t ldc )
+{   // Forwards to the interface selected at build time; only the BLIS typed branch calls an implementation, the others throw std::runtime_error.
+#ifdef TEST_BLAS
+    throw std::runtime_error("Error in testsuite/level3/trmm3.h: BLAS interface is not available.");
+#elif TEST_CBLAS
+    throw std::runtime_error("Error in testsuite/level3/trmm3.h: CBLAS interface is not available.");
+#elif TEST_BLIS_TYPED
+    typed_trmm3<T>( storage, side, uploa, transa, diaga, transb, m, n, alpha,
+                                            ap, lda, bp, ldb, beta, c, ldc );
+#else
+    throw std::runtime_error("Error in testsuite/level3/trmm3.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp
new file mode 100644
index 0000000000..f32c5caab8
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trmm3/ztrmm3_generic.cpp
@@ -0,0 +1,162 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trmm3.h"
+
+class ztrmm3Test :   // parameterized fixture for the double-precision complex trmm3 tests
+        public ::testing::TestWithParam<std::tuple<char,       // 0: storage format
+                                                   char,       // 1: side
+                                                   char,       // 2: uplo of a
+                                                   char,       // 3: transa
+                                                   char,       // 4: transb
+                                                   char,       // 5: diaga
+                                                   gtint_t,    // 6: m
+                                                   gtint_t,    // 7: n
+                                                   dcomplex,   // 8: alpha
+                                                   dcomplex,   // 9: beta
+                                                   gtint_t,    // 10: lda increment
+                                                   gtint_t,    // 11: ldb increment
+                                                   gtint_t,    // 12: ldc increment
+                                                   char>> {};  // 13: datatype for the random generators
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ztrmm3Test);  // the suite may be left without any instantiation
+
+TEST_P(ztrmm3Test, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears left or right in
+    // the matrix multiplication
+    char side = std::get<1>(GetParam());
+    // specifies whether the upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix b is n,c,t,h
+    char transb = std::get<4>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<5>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<6>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<7>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<8>(GetParam());
+    // specifies beta value
+    T beta = std::get<9>(GetParam());
+    // lda, ldb, ldc increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<10>(GetParam());
+    gtint_t ldb_inc = std::get<11>(GetParam());
+    gtint_t ldc_inc = std::get<12>(GetParam());
+    // specifies the datatype for randomgenerators
+    char datatype   = std::get<13>(GetParam());
+
+    // Set the threshold for the errors: m*n times getEpsilon<T>().
+    double thresh = m*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //     (diaga is passed ahead of transb).
+    test_trmm3<T>( storage, side, uploa, transa, diaga, transb, m, n, alpha, lda_inc, ldb_inc, beta, ldc_inc, thresh, datatype );
+}
+
+class ztrmm3TestPrint {
+public:
+    // Builds the test-name suffix for one ztrmm3Test parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, char, gtint_t, gtint_t, dcomplex, dcomplex, gtint_t, gtint_t, gtint_t, char>> str) const {
+        const auto& prm = str.param;
+        // One scalar component: a value > 0 prints as its truncated integer,
+        // anything else as "m" followed by the truncated magnitude.
+        auto fmt = [](double v) -> std::string {
+            if ( v > 0 ) return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+        const dcomplex alpha = std::get<8>(prm);
+        const dcomplex beta  = std::get<9>(prm);
+        std::string name = "blis_ztrmm3";
+        // storage format, written three times
+        name += "_" + std::string(3, std::get<0>(prm));
+        // side, uploa, transa, transb
+        name += "_";
+        name += std::get<1>(prm);
+        name += std::get<2>(prm);
+        name += std::get<3>(prm);
+        name += std::get<4>(prm);
+        // diaga, then m and n
+        name += "_d";
+        name += std::get<5>(prm);
+        name += "_" + std::to_string(std::get<6>(prm)) + "_" + std::to_string(std::get<7>(prm));
+        // alpha and beta, each as <real>pi<imag>
+        name += "_a" + fmt(alpha.real) + "pi" + fmt(alpha.imag);
+        name += "_b" + fmt(beta.real) + "pi" + fmt(beta.imag);
+        // lda_inc, ldb_inc, ldc_inc, then the datatype character
+        name += "_" + std::to_string(std::get<10>(prm)) + "_" + std::to_string(std::get<11>(prm)) + "_" + std::to_string(std::get<12>(prm));
+        name += "_";
+        name += std::get<13>(prm);
+        return name;
+    }
+};
+
+#ifdef TEST_BLIS_TYPED
+// Black box testing: every combination of the value lists below becomes one test case.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ztrmm3Test,
+        ::testing::Combine(
+            ::testing::Values('c','r'),                                      // storage format
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t','c'),                                  // transa
+            ::testing::Values('n'),                                          // transb /*transb works only for 'n' case*/
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(dcomplex{1.0,2.0}),                            // alpha
+            ::testing::Values(dcomplex{2.0,-1.0}),                           // beta
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of a
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of b
+            ::testing::Values(gtint_t(0)),                                   // increment to the leading dim of c
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used for random generation (i : integer, f : float)
+        ),
+        ::ztrmm3TestPrint()
+    );
+#endif
diff --git a/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp
new file mode 100644
index 0000000000..d4644da077
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trsm/ctrsm_generic.cpp
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsm.h"
+
+class ctrsmTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uploa
+                                                   char,       // transa
+                                                   char,       // diaga
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   scomplex,   // alpha
+                                                   gtint_t,    // increment to lda
+                                                   gtint_t,    // increment to ldb
+                                                   char>> {};  // datatype for random generators
+
+TEST_P(ctrsmTest, RandomData) {
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears on the left or right
+    // side of the operation
+    char side = std::get<1>(GetParam());
+    // specifies upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors: max(m, n) times the value of getEpsilon<T>().
+    double thresh = std::max(m, n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trsm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class ctrsmTestPrint {
+public:
+    // Builds the test-name suffix for one ctrsmTest parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, scomplex, gtint_t, gtint_t,char>> str) const {
+        const auto& prm = str.param;
+        // A value > 0 prints as its truncated integer, anything else as "m" + truncated magnitude.
+        auto fmt = [](double v) -> std::string {
+            if ( v > 0 ) return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+        const scomplex alpha = std::get<7>(prm);
+#ifdef TEST_BLAS
+        std::string name = "ctrsm_";
+#elif TEST_CBLAS
+        std::string name = "cblas_ctrsm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_ctrsm";
+#endif
+        // storage format (written three times), then side, uploa, transa
+        name += "_" + std::string(3, std::get<0>(prm));
+        name += "_";
+        name += std::get<1>(prm);
+        name += std::get<2>(prm);
+        name += std::get<3>(prm);
+        // diaga, then m and n
+        name += "_d";
+        name += std::get<4>(prm);
+        name += "_" + std::to_string(std::get<5>(prm)) + "_" + std::to_string(std::get<6>(prm));
+        // alpha as <real>pi<imag>, then lda_inc, ldb_inc and the datatype character
+        name += "_a" + fmt(alpha.real) + "pi" + fmt(alpha.imag);
+        name += "_" + std::to_string(std::get<8>(prm)) + "_" + std::to_string(std::get<9>(prm)) + "_";
+        name += std::get<10>(prm);
+        return name;
+    }
+};
+
+// Black box testing: every combination of the value lists below becomes one test case.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ctrsmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // m: 10, 20, 30
+            ::testing::Range(gtint_t(10), gtint_t(31), 10),                  // n: 10, 20, 30
+            ::testing::Values(scomplex{2.0,-1.0}),                           // alpha
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used for random generation (i : integer, f : float)
+        ),
+        ::ctrsmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp
new file mode 100644
index 0000000000..9995ca3c6c
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trsm/dtrsm_generic.cpp
@@ -0,0 +1,149 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsm.h"
+
+class dtrsmTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uploa
+                                                   char,       // transa
+                                                   char,       // diaga
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   double,     // alpha
+                                                   gtint_t,    // increment to lda
+                                                   gtint_t,    // increment to ldb
+                                                   char>> {};  // datatype for random generators
+
+TEST_P(dtrsmTest, RandomData) {
+    using T = double;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears on the left or right
+    // side of the operation
+    char side = std::get<1>(GetParam());
+    // specifies upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors: max(m, n) times the value of getEpsilon<T>().
+    double thresh = std::max(m, n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trsm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class dtrsmTestPrint {
+public:
+    // Builds the test-name suffix for one dtrsmTest parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, double, gtint_t, gtint_t,char>> str) const {
+        const auto& prm = str.param;
+        // A value > 0 prints as its truncated integer, anything else as "m" + truncated magnitude.
+        auto fmt = [](double v) -> std::string {
+            if ( v > 0 ) return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+#ifdef TEST_BLAS
+        std::string name = "dtrsm_";
+#elif TEST_CBLAS
+        std::string name = "cblas_dtrsm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_dtrsm";
+#endif
+        // storage format (written three times), then side, uploa, transa
+        name += "_" + std::string(3, std::get<0>(prm));
+        name += "_";
+        name += std::get<1>(prm);
+        name += std::get<2>(prm);
+        name += std::get<3>(prm);
+        // diaga, then m and n
+        name += "_d";
+        name += std::get<4>(prm);
+        name += "_" + std::to_string(std::get<5>(prm)) + "_" + std::to_string(std::get<6>(prm));
+        // alpha, then lda_inc, ldb_inc and the datatype character
+        name += "_a" + fmt(std::get<7>(prm));
+        name += "_" + std::to_string(std::get<8>(prm)) + "_" + std::to_string(std::get<9>(prm)) + "_";
+        name += std::get<10>(prm);
+        return name;
+    }
+};
+
+// Black box testing: every combination of the value lists below becomes one test case.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dtrsmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(11), 10),                  // m: 10 only
+            ::testing::Range(gtint_t(10), gtint_t(11), 10),                  // n: 10 only
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(gtint_t(0), gtint_t(5)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used for random generation (i : integer, f : float)
+        ),
+        ::dtrsmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp
new file mode 100644
index 0000000000..aa69d719ac
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trsm/strsm_generic.cpp
@@ -0,0 +1,149 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsm.h"
+
+class strsmTest :
+        public ::testing::TestWithParam<std::tuple<char,       // storage format
+                                                   char,       // side
+                                                   char,       // uploa
+                                                   char,       // transa
+                                                   char,       // diaga
+                                                   gtint_t,    // m
+                                                   gtint_t,    // n
+                                                   float,      // alpha
+                                                   gtint_t,    // increment to lda
+                                                   gtint_t,    // increment to ldb
+                                                   char>> {};  // datatype for random generators
+
+TEST_P(strsmTest, RandomData) {
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // matrix storage format(row major, column major)
+    char storage = std::get<0>(GetParam());
+    // specifies whether matrix A appears on the left or right
+    // side of the operation
+    char side = std::get<1>(GetParam());
+    // specifies upper or lower triangular part of A is used
+    char uploa = std::get<2>(GetParam());
+    // denotes whether matrix a is n,c,t,h
+    char transa = std::get<3>(GetParam());
+    // denotes whether matrix a has a unit or non-unit diagonal
+    char diaga = std::get<4>(GetParam());
+    // matrix size m
+    gtint_t m  = std::get<5>(GetParam());
+    // matrix size n
+    gtint_t n  = std::get<6>(GetParam());
+    // specifies alpha value
+    T alpha = std::get<7>(GetParam());
+    // lda, ldb increments.
+    // If increments are zero, then the array size matches the matrix size.
+    // If increments are positive, the array size is bigger than the matrix size.
+    gtint_t lda_inc = std::get<8>(GetParam());
+    gtint_t ldb_inc = std::get<9>(GetParam());
+    // specifies the datatype for random generators
+    char datatype   = std::get<10>(GetParam());
+
+    // Set the threshold for the errors: max(m, n) times the value of getEpsilon<T>().
+    double thresh = std::max(m, n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_trsm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+class strsmTestPrint {
+public:
+    // Builds the test-name suffix for one strsmTest parameter tuple.
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, float, gtint_t, gtint_t,char>> str) const {
+        const auto& prm = str.param;
+        // A value > 0 prints as its truncated integer, anything else as "m" + truncated magnitude.
+        auto fmt = [](double v) -> std::string {
+            if ( v > 0 ) return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+#ifdef TEST_BLAS
+        std::string name = "strsm_";
+#elif TEST_CBLAS
+        std::string name = "cblas_strsm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string name = "blis_strsm";
+#endif
+        // storage format (written three times), then side, uploa, transa
+        name += "_" + std::string(3, std::get<0>(prm));
+        name += "_";
+        name += std::get<1>(prm);
+        name += std::get<2>(prm);
+        name += std::get<3>(prm);
+        // diaga, then m and n
+        name += "_d";
+        name += std::get<4>(prm);
+        name += "_" + std::to_string(std::get<5>(prm)) + "_" + std::to_string(std::get<6>(prm));
+        // alpha, then lda_inc, ldb_inc and the datatype character
+        name += "_a" + fmt(std::get<7>(prm));
+        name += "_" + std::to_string(std::get<8>(prm)) + "_" + std::to_string(std::get<9>(prm)) + "_";
+        name += std::get<10>(prm);
+        return name;
+    }
+};
+
+// Black box testing: every combination of the value lists below becomes one test case.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        strsmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is left out when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','t'),                                      // transa
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(11), 10),                  // m: 10 only
+            ::testing::Range(gtint_t(10), gtint_t(11), 10),                  // n: 10 only
+            ::testing::Values( 1.0, -2.0),                                   // alpha
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(4)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // datatype used for random generation (i : integer, f : float)
+        ),
+        ::strsmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/level3/trsm/test_trsm.h b/gtestsuite/testsuite/level3/trsm/test_trsm.h
new file mode 100644
index 0000000000..7145a92156
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trsm/test_trsm.h
@@ -0,0 +1,86 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "trsm.h"
+#include "level3/ref_trsm.h"
+#include "inc/check_error.h"
+#include "inc/utils.h"
+#include <stdexcept>
+#include <algorithm>
+
+// Runs one trsm test case. mn is set by set_dim_with_side from side, m and n;
+// A is a random mn x mn matrix and B a random m x n matrix. trsm<T> is run on
+// B and testinghelpers::ref_trsm on a copy of B, and the two results are
+// compared by computediff using thresh. lda_inc/ldb_inc are forwarded to
+// get_leading_dimension; datatype is forwarded to get_random_matrix.
+template<typename T>
+void test_trsm( char storage, char side, char uploa, char transa,
+    char diaga, gtint_t m, gtint_t n, T alpha, gtint_t lda_inc,
+    gtint_t ldb_inc, double thresh, char datatype ) {
+
+    gtint_t mn;
+    testinghelpers::set_dim_with_side( side, m, n, &mn );
+    gtint_t lda = testinghelpers::get_leading_dimension(storage, transa, mn, mn, lda_inc);
+    gtint_t ldb = testinghelpers::get_leading_dimension(storage, 'n', m, n, ldb_inc);
+
+    // Initialize matrices with random values. diaga has to be compared with ==
+    // here: assigning to it would turn every case into a non-unit-diagonal one.
+    const bool nonunit = (diaga == 'n') || (diaga == 'N');
+    gtint_t lower = nonunit ? 3 : 0;
+    gtint_t upper = nonunit ? 10 : 1;
+    std::vector<T> a = testinghelpers::get_random_matrix<T>(lower, upper, storage, transa, mn, mn, lda, datatype);
+    std::vector<T> b = testinghelpers::get_random_matrix<T>(3, 10, storage, 'n', m, n, ldb, datatype);
+
+    // Making A diagonally dominant so that the condition number is good and
+    // the algorithm doesn't diverge.
+    for (gtint_t i=0; i<mn; i++)
+    {
+        a[i+i*lda] = T{float(mn)}*a[i+i*lda];
+    }
+    // Create a copy of b so that we can check reference results.
+    std::vector<T> b_ref(b);
+
+    // NOTE(review): mktrim presumably reduces A to the triangle selected by uploa - confirm.
+    mktrim<T>( storage, uploa, mn, a.data(), lda );
+    // Call BLIS function.
+    trsm<T>( storage, side, uploa, transa, diaga, m, n, &alpha, a.data(), lda, b.data(), ldb );
+
+    // Call reference implementation.
+    testinghelpers::ref_trsm( storage, side, uploa, transa, diaga, m, n, alpha, a.data(), lda, b_ref.data(), ldb );
+
+    // Check component-wise error.
+    computediff<T>( storage, m, n, b.data(), b_ref.data(), ldb, thresh );
+}
diff --git a/gtestsuite/testsuite/level3/trsm/trsm.h b/gtestsuite/testsuite/level3/trsm/trsm.h
new file mode 100644
index 0000000000..bb7f0469e2
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trsm/trsm.h
@@ -0,0 +1,170 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Solves one of the triangular matrix equations:
+ *        op( A )*X = alpha*B,   or   X*op( A ) = alpha*B
+ * where  op( A ) is one of
+ *        op( A ) = A   or   op( A ) = A**T   or   op( A ) = A**H,
+ * and the solution X overwrites B.
+ * @param[in]     storage specifies storage format used for the matrices
+ * @param[in]     side   specifies if the triangular matrix A appears left or right in
+                         the matrix equation
+ * @param[in]     uploa  specifies if the upper or lower triangular part of A is used
+ * @param[in]     transa specifies the form of op( A ) to be used in
+                         the matrix equation
+ * @param[in]     diaga  specifies whether the matrix A is unit or non-unit triangular
+ * @param[in]     m      specifies the number of rows of the matrix B
+ * @param[in]     n      specifies the number of columns of the matrix B
+ * @param[in]     alpha  specifies the scalar alpha.
+ * @param[in]     ap     specifies pointer which points to the first element of ap
+ * @param[in]     lda    specifies the leading dimension of ap.
+ * @param[in,out] bp     specifies pointer which points to the first element of bp
+ * @param[in]     ldb    specifies the leading dimension of bp.
+ */
+
+// Selects the Fortran-style BLAS ?trsm_ routine that corresponds to T and
+// calls it. The character flags and the integer sizes are forwarded by
+// address; alpha, ap and bp are forwarded unchanged.
+// Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+template<typename T>
+static void trsm_( char side, char uploa, char transa, char diaga, gtint_t m,
+               gtint_t n, T* alpha, T* ap, gtint_t lda, T* bp, gtint_t ldb )
+{
+    if constexpr (std::is_same_v<T, float>) {
+        strsm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    }
+    else if constexpr (std::is_same_v<T, double>) {
+        dtrsm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    }
+    else if constexpr (std::is_same_v<T, scomplex>) {
+        ctrsm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    }
+    else if constexpr (std::is_same_v<T, dcomplex>) {
+        ztrsm_( &side, &uploa, &transa, &diaga, &m, &n, alpha, ap, &lda, bp, &ldb );
+    }
+    else {
+        throw std::runtime_error("Error in testsuite/level3/trsm.h: Invalid typename in trsm_().");
+    }
+}
+
+// Translates the character flags into their CBLAS enum counterparts and calls
+// the cblas_?trsm routine that corresponds to T. For the real types alpha is
+// passed by value (dereferenced); for the complex types the pointer is passed.
+// Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+template<typename T>
+static void cblas_trsm( char storage, char side, char uploa, char transa,
+    char diaga, gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb )
+{
+    // Character flag -> CBLAS enum conversions.
+    enum CBLAS_ORDER order_e;
+    testinghelpers::char_to_cblas_order( storage, &order_e );
+
+    enum CBLAS_SIDE side_e;
+    testinghelpers::char_to_cblas_side( side, &side_e );
+
+    enum CBLAS_UPLO uplo_e;
+    testinghelpers::char_to_cblas_uplo( uploa, &uplo_e );
+
+    enum CBLAS_TRANSPOSE trans_e;
+    testinghelpers::char_to_cblas_trans( transa, &trans_e );
+
+    enum CBLAS_DIAG diag_e;
+    testinghelpers::char_to_cblas_diag( diaga, &diag_e );
+
+    if constexpr (std::is_same_v<T, float>) {
+        cblas_strsm( order_e, side_e, uplo_e, trans_e, diag_e, m, n, *alpha, ap, lda, bp, ldb );
+    }
+    else if constexpr (std::is_same_v<T, double>) {
+        cblas_dtrsm( order_e, side_e, uplo_e, trans_e, diag_e, m, n, *alpha, ap, lda, bp, ldb );
+    }
+    else if constexpr (std::is_same_v<T, scomplex>) {
+        cblas_ctrsm( order_e, side_e, uplo_e, trans_e, diag_e, m, n, alpha, ap, lda, bp, ldb );
+    }
+    else if constexpr (std::is_same_v<T, dcomplex>) {
+        cblas_ztrsm( order_e, side_e, uplo_e, trans_e, diag_e, m, n, alpha, ap, lda, bp, ldb );
+    }
+    else {
+        throw std::runtime_error("Error in testsuite/level3/trsm.h: Invalid typename in cblas_trsm().");
+    }
+}
+
+// Calls the BLIS typed API bli_?trsm that corresponds to T.
+// The character flags are mapped to BLIS constants, and the leading
+// dimensions are turned into (row stride, column stride) pairs based on the
+// storage character.
+// Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+template<typename T>
+static void typed_trsm( char storage, char side, char uplo, char trans,
+    char diag, gtint_t m, gtint_t n, T* alpha, T* ap, gtint_t lda,
+    T* bp, gtint_t ldb )
+{
+    // Map parameter characters to BLIS constants.
+    side_t  blis_side;
+    testinghelpers::char_to_blis_side( side, &blis_side );
+    uplo_t  blis_uplo;
+    testinghelpers::char_to_blis_uplo( uplo, &blis_uplo );
+    trans_t blis_trans;
+    testinghelpers::char_to_blis_trans( trans, &blis_trans );
+    diag_t  blis_diag;
+    testinghelpers::char_to_blis_diag( diag, &blis_diag );
+
+    // All strides start at 1. Column-major storage puts the leading dimension
+    // in the column stride, row-major storage puts it in the row stride; any
+    // other storage character leaves every stride at 1.
+    dim_t rsa = 1, csa = 1;
+    dim_t rsb = 1, csb = 1;
+    const bool col_major = ( storage == 'c' ) || ( storage == 'C' );
+    const bool row_major = ( storage == 'r' ) || ( storage == 'R' );
+    if( col_major ) {
+        csa = lda;
+        csb = ldb;
+    }
+    else if( row_major ) {
+        rsa = lda;
+        rsb = ldb;
+    }
+
+    if constexpr (std::is_same_v<T, float>) {
+        bli_strsm( blis_side, blis_uplo, blis_trans, blis_diag, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    }
+    else if constexpr (std::is_same_v<T, double>) {
+        bli_dtrsm( blis_side, blis_uplo, blis_trans, blis_diag, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    }
+    else if constexpr (std::is_same_v<T, scomplex>) {
+        bli_ctrsm( blis_side, blis_uplo, blis_trans, blis_diag, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    }
+    else if constexpr (std::is_same_v<T, dcomplex>) {
+        bli_ztrsm( blis_side, blis_uplo, blis_trans, blis_diag, m, n, alpha, ap, rsa, csa, bp, rsb, csb );
+    }
+    else {
+        throw std::runtime_error("Error in testsuite/level3/trsm.h: Invalid typename in typed_trsm().");
+    }
+}
+
+// Entry point used by the tests: forwards to the wrapper of whichever
+// interface was selected at compile time (TEST_BLAS, TEST_CBLAS or
+// TEST_BLIS_TYPED). Throws std::runtime_error if none is selected, or if the
+// BLAS interface is asked to run with a storage other than column-major.
+template<typename T>
+static void trsm( char storage, char side, char uploa, char transa, char diaga,
+    gtint_t m, gtint_t n, T *alpha, T *ap, gtint_t lda, T *bp, gtint_t ldb )
+{
+#ifdef TEST_BLAS
+    // The BLAS wrapper takes no storage argument, so reject everything
+    // except column-major up front.
+    if( storage != 'c' && storage != 'C' )
+    {
+        throw std::runtime_error("Error in testsuite/level3/trsm.h: BLAS interface cannot be tested for row-major order.");
+    }
+    trsm_<T>( side, uploa, transa, diaga, m, n, alpha, ap, lda, bp, ldb );
+
+#elif TEST_CBLAS
+    cblas_trsm<T>( storage, side, uploa, transa, diaga, m, n, alpha, ap, lda, bp, ldb );
+#elif TEST_BLIS_TYPED
+    typed_trsm<T>( storage, side, uploa, transa, diaga, m, n, alpha, ap, lda, bp, ldb );
+#else
+    throw std::runtime_error("Error in testsuite/level3/trsm.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp
new file mode 100644
index 0000000000..1987251fc2
--- /dev/null
+++ b/gtestsuite/testsuite/level3/trsm/ztrsm_generic.cpp
@@ -0,0 +1,150 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_trsm.h"
+
+// Value-parameterized fixture for the dcomplex trsm tests. Tuple fields, in order:
+// storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, datatype.
+class ztrsmTest :
+        public ::testing::TestWithParam<
+            std::tuple<char, char, char, char, char,
+                       gtint_t, gtint_t,
+                       dcomplex,
+                       gtint_t, gtint_t,
+                       char>> {};
+
+TEST_P(ztrsmTest, RandomData) {
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+
+    // matrix storage format (row major, column major)
+    char storage    = std::get<0>(params);
+    // whether matrix A appears on the left or on the right
+    char side       = std::get<1>(params);
+    // whether the upper or lower triangular part of A is used
+    char uploa      = std::get<2>(params);
+    // form of op( A ): n, c, t, h
+    char transa     = std::get<3>(params);
+    // whether A has a unit or non-unit diagonal
+    char diaga      = std::get<4>(params);
+    // matrix sizes m and n
+    gtint_t m       = std::get<5>(params);
+    gtint_t n       = std::get<6>(params);
+    // scalar alpha
+    T alpha         = std::get<7>(params);
+    // Increments for lda and ldb.
+    // A zero increment means the array size matches the matrix size;
+    // a positive increment means the array is bigger than the matrix.
+    gtint_t lda_inc = std::get<8>(params);
+    gtint_t ldb_inc = std::get<9>(params);
+    // datatype used by the random generators
+    char datatype   = std::get<10>(params);
+
+    // Error threshold: machine epsilon of T scaled by the larger dimension.
+    double thresh = std::max(m, n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_trsm<T>( storage, side, uploa, transa, diaga, m, n, alpha, lda_inc, ldb_inc, thresh, datatype );
+}
+
+// Name generator for ztrsmTest: turns one parameter tuple into the test name
+// shown by GoogleTest.
+class ztrsmTestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<char, char, char, char, char, gtint_t, gtint_t, dcomplex, gtint_t, gtint_t,char>> str) const {
+        const auto& p = str.param;
+        const char     sfm      = std::get<0>(p);
+        const char     side     = std::get<1>(p);
+        const char     uploa    = std::get<2>(p);
+        const char     transa   = std::get<3>(p);
+        const char     diaga    = std::get<4>(p);
+        const gtint_t  m        = std::get<5>(p);
+        const gtint_t  n        = std::get<6>(p);
+        const dcomplex alpha    = std::get<7>(p);
+        const gtint_t  lda_inc  = std::get<8>(p);
+        const gtint_t  ldb_inc  = std::get<9>(p);
+        const char     datatype = std::get<10>(p);
+
+        // The prefix depends on the interface selected at compile time.
+#ifdef TEST_BLAS
+        std::string test_name = "ztrsm_";
+#elif TEST_CBLAS
+        std::string test_name = "cblas_ztrsm";
+#else  //#elif TEST_BLIS_TYPED
+        std::string test_name = "blis_ztrsm";
+#endif
+
+        // A strictly positive component prints as its integer part; anything
+        // else prints as "m" followed by the integer part of its magnitude.
+        auto component_str = [](double v) -> std::string {
+            if ( v > 0 )
+                return std::to_string(int(v));
+            return "m" + std::to_string(int(std::abs(v)));
+        };
+
+        test_name += "_" + std::string(3, sfm);
+        test_name += std::string("_") + side + uploa + transa;
+        test_name += std::string("_d") + diaga;
+        test_name += "_" + std::to_string(m);
+        test_name += "_" + std::to_string(n);
+        test_name += "_a" + component_str(alpha.real) + "pi" + component_str(alpha.imag);
+        test_name += "_" + std::to_string(lda_inc);
+        test_name += "_" + std::to_string(ldb_inc);
+        test_name += std::string("_") + datatype;
+        return test_name;
+    }
+};
+
+// Black box testing.
+// Instantiates ztrsmTest over the cartesian product of the value lists below;
+// test names are generated by ztrsmTestPrint.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        ztrsmTest,
+        ::testing::Combine(
+            ::testing::Values('c'
+#ifndef TEST_BLAS
+            ,'r'
+#endif
+            ),                                                               // storage format ('r' is skipped when TEST_BLAS is defined)
+            ::testing::Values('l','r'),                                      // side  l:left, r:right
+            ::testing::Values('u','l'),                                      // uplo  u:upper, l:lower
+            ::testing::Values('n','c','t'),                                  // transa
+            ::testing::Values('n','u'),                                      // diaga , n=nonunit u=unit
+            ::testing::Range(gtint_t(10), gtint_t(11), 10),                  // m: half-open range [10, 11) with step 10, i.e. only 10
+            ::testing::Range(gtint_t(10), gtint_t(11), 10),                  // n: half-open range [10, 11) with step 10, i.e. only 10
+            ::testing::Values(dcomplex{1.0,2.0}),                            // alpha
+            ::testing::Values(gtint_t(0), gtint_t(2)),                       // increment to the leading dim of a
+            ::testing::Values(gtint_t(0), gtint_t(3)),                       // increment to the leading dim of b
+            ::testing::Values(ELEMENT_TYPE)                                  // i : integer, f : float  datatype type tested
+        ),
+        ::ztrsmTestPrint()
+    );
diff --git a/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp
new file mode 100644
index 0000000000..a020075f2c
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/cnrm2_generic.cpp
@@ -0,0 +1,101 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_nrm2.h"
+
+// Value-parameterized fixture for the scomplex nrm2 tests.
+// Tuple fields, in order: n (vector length), incx (stride of x), datatype.
+class CNrm2Test : public ::testing::TestWithParam<
+        std::tuple<gtint_t, gtint_t, char>> {};
+
+TEST_P( CNrm2Test, RandomData )
+{
+    using T = scomplex;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    gtint_t n     = std::get<0>(params);   // vector length
+    gtint_t incx  = std::get<1>(params);   // stride size for x
+    char datatype = std::get<2>(params);   // datatype used by the random generators
+
+    // Error threshold: machine epsilon of T scaled by sqrt(n).
+    double thresh = std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_nrm2<T>(n, incx, thresh, datatype);
+}
+
+// Name generator for CNrm2Test: turns one parameter tuple into the test name
+// shown by GoogleTest.
+class CNrm2TestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t, gtint_t, char>> str) const {
+        const gtint_t n        = std::get<0>(str.param);
+        const gtint_t incx     = std::get<1>(str.param);
+        const char    datatype = std::get<2>(str.param);
+
+        // The prefix depends on the interface selected at compile time.
+#ifdef TEST_BLAS
+        std::string test_name = "scnrm2_";
+#elif TEST_CBLAS
+        std::string test_name = "cblas_scnrm2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string test_name = "bli_cnormfv";
+#endif
+
+        // A strictly positive stride prints as-is; anything else prints as
+        // "m" followed by its absolute value.
+        std::string incx_str;
+        if ( incx > 0 )
+            incx_str = std::to_string(incx);
+        else
+            incx_str = "m" + std::to_string(std::abs(incx));
+
+        test_name += "_" + std::to_string(n);
+        test_name += "_" + incx_str;
+        test_name += std::string("_") + datatype;
+        return test_name;
+    }
+};
+
+// Black box testing.
+// Instantiates CNrm2Test over the cartesian product of the value lists below;
+// test names are generated by CNrm2TestPrint.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        CNrm2Test,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1), gtint_t(2)
+#ifndef TEST_BLIS_TYPED
+            , gtint_t(-1), gtint_t(-2)
+#endif
+        ),                                                                   // stride size for x (negative strides are skipped when TEST_BLIS_TYPED is defined)
+            ::testing::Values('i')                                           // i : integer, f : float  datatype type tested
+        ),
+        ::CNrm2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp
new file mode 100644
index 0000000000..245b5f49ac
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/dnrm2_generic.cpp
@@ -0,0 +1,101 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_nrm2.h"
+
+// Value-parameterized fixture for the double nrm2 tests.
+// Tuple fields, in order: n (vector length), incx (stride of x), datatype.
+class dnrm2Test : public ::testing::TestWithParam<
+        std::tuple<gtint_t, gtint_t, char>> {};
+
+TEST_P( dnrm2Test, RandomData )
+{
+    using T = double;
+    //----------------------------------------------------------
+    // Unpack the parameter tuple supplied by the test suite
+    // instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    const auto& params = GetParam();
+    gtint_t n     = std::get<0>(params);   // vector length
+    gtint_t incx  = std::get<1>(params);   // stride size for x
+    char datatype = std::get<2>(params);   // datatype used by the random generators
+
+    // Error threshold: machine epsilon of T scaled by sqrt(n).
+    double thresh = std::sqrt(n)*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Run the test body with these parameters
+    //----------------------------------------------------------
+    test_nrm2<T>(n, incx, thresh, datatype);
+}
+
+// Name generator for dnrm2Test: turns one parameter tuple into the test name
+// shown by GoogleTest.
+class dnrm2TestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t, gtint_t, char>> str) const {
+        const gtint_t n        = std::get<0>(str.param);
+        const gtint_t incx     = std::get<1>(str.param);
+        const char    datatype = std::get<2>(str.param);
+
+        // The prefix depends on the interface selected at compile time.
+#ifdef TEST_BLAS
+        std::string test_name = "dnrm2_";
+#elif TEST_CBLAS
+        std::string test_name = "cblas_dnrm2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string test_name = "bli_dnormfv";
+#endif
+
+        // A strictly positive stride prints as-is; anything else prints as
+        // "m" followed by its absolute value.
+        std::string incx_str;
+        if ( incx > 0 )
+            incx_str = std::to_string(incx);
+        else
+            incx_str = "m" + std::to_string(std::abs(incx));
+
+        test_name += "_" + std::to_string(n);
+        test_name += "_" + incx_str;
+        test_name += std::string("_") + datatype;
+        return test_name;
+    }
+};
+
+// Black box testing.
+// Instantiates dnrm2Test over the cartesian product of the value lists below;
+// test names are generated by dnrm2TestPrint.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        dnrm2Test,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1), gtint_t(2)
+#ifndef TEST_BLIS_TYPED
+            ,gtint_t(-1), gtint_t(-2)
+#endif
+        ),                                                                   // stride size for x (negative strides are skipped when TEST_BLIS_TYPED is defined)
+            ::testing::Values('i')                                           // i : integer, f : float  datatype type tested
+        ),
+        ::dnrm2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/util/nrm2/nrm2.h b/gtestsuite/testsuite/util/nrm2/nrm2.h
new file mode 100644
index 0000000000..9d54d51f65
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/nrm2.h
@@ -0,0 +1,106 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "blis.h"
+#include "common/testing_helpers.h"
+
+/**
+ * @brief Overload bli_*normfv() functions using typed_nrm2.
+ *        Will be used in testing and especially in TYPED_TESTs.
+ *        Computes the Euclidean norm of x.
+ * @param[in] n vector length
+ * @param[in] x pointer which points to the first element of x
+ * @param[in] incx increment of x
+ * @return the Euclidean norm of x
+ */
+
+// Calls the Fortran-style BLAS nrm2 routine that corresponds to T
+// (snrm2_, dnrm2_, scnrm2_ or dznrm2_). n and incx are forwarded by address,
+// x is forwarded unchanged, and the routine's return value is returned.
+// Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+template<typename T, typename Treal>
+static Treal nrm2_(gtint_t n, T* x, gtint_t incx){
+    if constexpr (std::is_same<T, float>::value)
+        return snrm2_( &n, x, &incx );
+    else if constexpr (std::is_same<T, double>::value)
+        return dnrm2_( &n, x, &incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        return scnrm2_( &n, x, &incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        return dznrm2_( &n, x, &incx );
+    else
+        // This header lives under testsuite/util, not testsuite/level1.
+        throw std::runtime_error("Error in testsuite/util/nrm2.h: Invalid typename in nrm2_().");
+}
+
+// Calls the CBLAS nrm2 routine that corresponds to T (cblas_snrm2,
+// cblas_dnrm2, cblas_scnrm2 or cblas_dznrm2) and returns its result.
+// Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+template<typename T, typename Treal>
+static Treal cblas_nrm2(gtint_t n, T* x, gtint_t incx){
+    if constexpr (std::is_same<T, float>::value)
+        return cblas_snrm2( n, x, incx );
+    else if constexpr (std::is_same<T, double>::value)
+        return cblas_dnrm2( n, x, incx );
+    else if constexpr (std::is_same<T, scomplex>::value)
+        return cblas_scnrm2( n, x, incx );
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        return cblas_dznrm2( n, x, incx );
+    else
+        // This header lives under testsuite/util, not testsuite/level1.
+        throw std::runtime_error("Error in testsuite/util/nrm2.h: Invalid typename in cblas_nrm2().");
+}
+
+// Calls the BLIS typed API bli_?normfv that corresponds to T. The routine
+// writes the norm through an output pointer; that value is returned here.
+// Throws std::runtime_error when T is not float, double, scomplex or dcomplex.
+template<typename T, typename Treal>
+static Treal typed_nrm2(gtint_t n, T* x, gtint_t incx){
+    Treal nrm;
+    if constexpr (std::is_same<T, float>::value)
+        bli_snormfv(n, x, incx, &nrm);
+    else if constexpr (std::is_same<T, double>::value)
+        bli_dnormfv(n, x, incx, &nrm);
+    else if constexpr (std::is_same<T, scomplex>::value)
+        bli_cnormfv(n, x, incx, &nrm);
+    else if constexpr (std::is_same<T, dcomplex>::value)
+        bli_znormfv(n, x, incx, &nrm);
+    else
+        // The message names this function and this header (it previously
+        // reported cblas_nrm2() and a level1 path).
+        throw std::runtime_error("Error in testsuite/util/nrm2.h: Invalid typename in typed_nrm2().");
+    return nrm;
+}
+
+// Entry point used by the tests: forwards to the wrapper of whichever
+// interface was selected at compile time (TEST_BLAS, TEST_CBLAS or
+// TEST_BLIS_TYPED) and returns its result.
+// Throws std::runtime_error if none of the three is selected.
+template<typename T, typename Treal>
+static Treal nrm2(gtint_t n, T* x, gtint_t incx)
+{
+#ifdef TEST_BLAS
+    return nrm2_<T, Treal>(n, x, incx);
+#elif TEST_CBLAS
+    return cblas_nrm2<T, Treal>(n, x, incx);
+#elif TEST_BLIS_TYPED
+    return typed_nrm2<T, Treal>(n, x, incx);
+#else
+    // The message names this header (it previously reported level1/axpyv.h).
+    throw std::runtime_error("Error in testsuite/util/nrm2.h: No interfaces are set to be tested.");
+#endif
+}
diff --git a/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp b/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp
new file mode 100644
index 0000000000..5bd2bb46e6
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/nrm2_extreme_vals.cpp
@@ -0,0 +1,77 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_nrm2.h"
+
+// Typed fixture for the nrm2 extreme-value tests. The suite is registered
+// for the types listed in TypeParam, i.e. float and double.
+template <typename T>
+class xnrm2 : public ::testing::Test {};
+
+using TypeParam = ::testing::Types<float, double>;
+TYPED_TEST_SUITE(xnrm2, TypeParam);
+
+// The norm of a length-1 vector holding zero is expected to be zero.
+TYPED_TEST(xnrm2, zeroFP) {
+    using T = TypeParam;
+    T zero_val = T(0);
+
+    const T result = nrm2<T,T>(1, &zero_val, 1);
+    EXPECT_EQ(0, result);
+}
+
+// The norm of a length-1 vector holding std::numeric_limits<T>::min() is
+// expected to be that same value.
+TYPED_TEST(xnrm2, minFP) {
+    using T = TypeParam;
+    T min_val = std::numeric_limits<T>::min();
+
+    const T result = nrm2<T,T>(1, &min_val, 1);
+    EXPECT_EQ(min_val, result);
+}
+
+// The norm of a length-1 vector holding std::numeric_limits<T>::max() is
+// expected to be that same value.
+TYPED_TEST(xnrm2, maxFP) {
+    using T = TypeParam;
+    T max_val = std::numeric_limits<T>::max();
+
+    const T result = nrm2<T,T>(1, &max_val, 1);
+    EXPECT_EQ(max_val, result);
+}
+
+// Two length-2 double vectors with entries of magnitude 3e300 and 4e300,
+// one positive and one negative; both norms are expected to equal 5e300.
+TEST(dnrm2, largeDouble) {
+    using T = double;
+    gtint_t n = 2;
+    std::vector<T> x{3e300, 4e300};
+    std::vector<T> y{-4e300, -3e300};
+
+    T result = nrm2<T,T>(n, x.data(), 1);
+    EXPECT_EQ(5e300, result);
+
+    result = nrm2<T,T>(n, y.data(), 1);
+    EXPECT_EQ(5e300, result);
+}
diff --git a/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp
new file mode 100644
index 0000000000..e23bc0d90c
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/snrm2_generic.cpp
@@ -0,0 +1,101 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_nrm2.h"
+
+class snrm2Test :
+        public ::testing::TestWithParam<std::tuple<gtint_t, gtint_t, char>> {}; // params: (n, incx, datatype)
+
+TEST_P( snrm2Test, RandomData )
+{
+    using T = float;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<1>(GetParam());
+    // selector forwarded to the random data generator:
+    char datatype = std::get<2>(GetParam());
+
+    // Error threshold handed to test_nrm2; grows linearly with n (2*n*getEpsilon<T>()).
+    double thresh = 2*n*testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_nrm2<T>(n, incx, thresh, datatype);
+}
+
+// Builds the display name of one snrm2Test instance: <api>_<n>_<incx>_<datatype>.
+class snrm2TestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t, gtint_t, char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        gtint_t incx  = std::get<1>(str.param);
+        char datatype = std::get<2>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "snrm2_";
+#elif defined(TEST_CBLAS)  // test for definition, matching the #ifdef TEST_BLAS branch above
+        std::string str_name = "cblas_snrm2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_snormfv";
+#endif
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name    = str_name + "_" + incx_str;  // non-positive strides are spelled "m<|incx|>"
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        snrm2Test,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1), gtint_t(2)
+#ifndef TEST_BLIS_TYPED
+            ,gtint_t(-1), gtint_t(-2)
+#endif
+        ),                                                                   // stride size for x (negative strides are left out when TEST_BLIS_TYPED is defined)
+            ::testing::Values('i')                                           // datatype selector (i : integer, f : float); only 'i' is exercised here
+        ),
+        ::snrm2TestPrint()
+    );
diff --git a/gtestsuite/testsuite/util/nrm2/test_nrm2.h b/gtestsuite/testsuite/util/nrm2/test_nrm2.h
new file mode 100644
index 0000000000..2c9de86dc4
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/test_nrm2.h
@@ -0,0 +1,67 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#pragma once
+
+#include "nrm2.h"
+#include "util/ref_nrm2.h"
+#include "inc/check_error.h"
+
+template<typename T>
+void test_nrm2( gtint_t n, gtint_t incx, double thresh, char datatype ) // checks nrm2 against ref_nrm2 within thresh
+{
+    //----------------------------------------------------------
+    //   Initialize vector x via randomgenerators (bounds -10, 10).
+    //----------------------------------------------------------
+    std::vector<T> x( testinghelpers::buff_dim(n, incx) );
+    testinghelpers::datagenerators::randomgenerators( -10, 10, n, incx, x.data(), datatype );
+
+    //----------------------------------------------------------
+    //    Call reference implementation to get ref results.
+    //----------------------------------------------------------
+    // The norm is held in the real type that type_info associates with T.
+    using real = typename testinghelpers::type_info<T>::real_type;
+    real norm_ref = testinghelpers::ref_nrm2<T, real>( n, x.data(), incx );
+
+    //----------------------------------------------------------
+    //                  Call BLIS function.
+    //----------------------------------------------------------
+    real norm = nrm2<T, real>(n, x.data(), incx);
+
+    //----------------------------------------------------------
+    //     Compare the computed norm with the reference value.
+    //----------------------------------------------------------
+    computediff<real>( norm, norm_ref, thresh );
+}
+
diff --git a/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp b/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp
new file mode 100644
index 0000000000..55c1b9be07
--- /dev/null
+++ b/gtestsuite/testsuite/util/nrm2/znrm2_generic.cpp
@@ -0,0 +1,101 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <gtest/gtest.h>
+#include "test_nrm2.h"
+
+class znrm2Test :
+        public ::testing::TestWithParam<std::tuple<gtint_t, gtint_t, char>> {}; // params: (n, incx, datatype)
+
+TEST_P( znrm2Test, RandomData )
+{
+    using T = dcomplex;
+    //----------------------------------------------------------
+    // Initialize values from the parameters passed through
+    // test suite instantiation (INSTANTIATE_TEST_SUITE_P).
+    //----------------------------------------------------------
+    // vector length:
+    gtint_t n = std::get<0>(GetParam());
+    // stride size for x:
+    gtint_t incx = std::get<1>(GetParam());
+    // selector forwarded to the random data generator:
+    char datatype = std::get<2>(GetParam());
+
+    // Error threshold handed to test_nrm2. NOTE(review): no n scaling here, while snrm2_generic.cpp uses 2*n*epsilon - confirm this is intended.
+    double thresh = testinghelpers::getEpsilon<T>();
+
+    //----------------------------------------------------------
+    //     Call test body using these parameters
+    //----------------------------------------------------------
+    test_nrm2<T>(n, incx, thresh, datatype);
+}
+
+// Builds the display name of one znrm2Test instance: <api>_<n>_<incx>_<datatype>.
+class znrm2TestPrint {
+public:
+    std::string operator()(
+        testing::TestParamInfo<std::tuple<gtint_t, gtint_t, char>> str) const {
+        gtint_t n     = std::get<0>(str.param);
+        gtint_t incx  = std::get<1>(str.param);
+        char datatype = std::get<2>(str.param);
+#ifdef TEST_BLAS
+        std::string str_name = "dznrm2_";
+#elif defined(TEST_CBLAS)  // test for definition, matching the #ifdef TEST_BLAS branch above
+        std::string str_name = "cblas_dznrm2";
+#else  //#elif TEST_BLIS_TYPED
+        std::string str_name = "bli_znormfv";
+#endif
+        str_name    = str_name + "_" + std::to_string(n);
+        std::string incx_str = ( incx > 0) ? std::to_string(incx) : "m" + std::to_string(std::abs(incx));
+        str_name    = str_name + "_" + incx_str;  // non-positive strides are spelled "m<|incx|>"
+        str_name    = str_name + "_" + datatype;
+        return str_name;
+    }
+};
+
+// Black box testing.
+INSTANTIATE_TEST_SUITE_P(
+        Blackbox,
+        znrm2Test,
+        ::testing::Combine(
+            ::testing::Range(gtint_t(10), gtint_t(101), 10),                 // n: vector length takes values from 10 to 100 with step size of 10.
+            ::testing::Values(gtint_t(1), gtint_t(2)
+#ifndef TEST_BLIS_TYPED
+            ,gtint_t(-1), gtint_t(-2)
+#endif
+        ),                                                                   // stride size for x (negative strides are left out when TEST_BLIS_TYPED is defined)
+            ::testing::Values('i')                                           // datatype selector (i : integer, f : float); only 'i' is exercised here
+        ),
+        ::znrm2TestPrint()
+    );
diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt
index bee82f8685..47501d920c 100644
--- a/kernels/CMakeLists.txt
+++ b/kernels/CMakeLists.txt
@@ -7,4 +7,4 @@ if(${TARGET_ARCH} STREQUAL zen4 OR
    ${TARGET_ARCH} STREQUAL amdzen)
    add_subdirectory(skx)
    add_subdirectory(zen4)
-endif()
\ No newline at end of file
+endif()
diff --git a/kernels/haswell/1m/CMakeLists.txt b/kernels/haswell/1m/CMakeLists.txt
index 58ce19c61a..56abd13aec 100644
--- a/kernels/haswell/1m/CMakeLists.txt
+++ b/kernels/haswell/1m/CMakeLists.txt
@@ -1,7 +1,7 @@
 ##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(haswell_1m
+     OBJECT
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_c3xk.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_c8xk.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_d6xk.c
@@ -11,3 +11,8 @@ target_sources("${PROJECT_NAME}"
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_z3xk.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_haswell_asm_z4xk.c
  )
+
+target_compile_options(haswell_1m PRIVATE /arch:AVX2) # MSVC-style switch: build these pack kernels with AVX2 code generation
+if(BUILD_SHARED_LIBS) # only when building a shared library
+    target_compile_definitions(haswell_1m PUBLIC -DBLIS_IS_BUILDING_LIBRARY) # CMake strips the leading -D; presumably selects symbol export - confirm
+endif()
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
index ab42e06aa9..78e76589dc 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -337,6 +337,8 @@ void bli_cpackm_haswell_asm_3xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm4", "ymm6",
+		  "ymm10", "ymm11", "ymm12", "ymm13",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
index a101e66d18..61ace6945d 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -357,6 +357,8 @@ void bli_cpackm_haswell_asm_8xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+		  "ymm7", "ymm10", "ymm11", "ymm12", "ymm13",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
index b64f26591d..e2982dbfeb 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -341,6 +341,8 @@ void bli_dpackm_haswell_asm_6xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+		  "ymm10", "ymm11", "ymm12", "ymm13",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
index 0cfa2e8d68..e3b00a71e7 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -361,6 +361,8 @@ void bli_dpackm_haswell_asm_8xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+		  "ymm7", "ymm10", "ymm11", "ymm12", "ymm13",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
index 40ac22bc55..b049fcdb5c 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -510,6 +510,9 @@ void bli_spackm_haswell_asm_16xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+		  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+		  "ymm13", "ymm14", "ymm15",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
index 3a134bed8f..c05c36b66f 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -383,6 +383,8 @@ void bli_spackm_haswell_asm_6xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm4", "ymm6",
+		  "ymm8", "ymm10", "ymm12", "ymm14",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
index 06fcf1438a..cb025c1f01 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -343,6 +343,8 @@ void bli_zpackm_haswell_asm_3xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm4", "ymm6",
+		  "ymm8", "ymm10", "ymm11", "ymm12",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
index 25a8b6181e..e407fedf9f 100644
--- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
+++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 2021, Advanced Micro Devices, Inc.All rights reserved.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -353,6 +353,8 @@ void bli_zpackm_haswell_asm_4xk
 		  "xmm4", "xmm5", "xmm6", "xmm7",
 		  "xmm8", "xmm9", "xmm10", "xmm11",
 		  "xmm12", "xmm13", "xmm14", "xmm15",
+		  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+		  "ymm7", "ymm8", "ymm10", "ymm11", "ymm12", "ymm14",
 		  "memory"
 		)
 	}
diff --git a/kernels/haswell/3/CMakeLists.txt b/kernels/haswell/3/CMakeLists.txt
index c3bd3b2ee5..a42bdadf83 100644
--- a/kernels/haswell/3/CMakeLists.txt
+++ b/kernels/haswell/3/CMakeLists.txt
@@ -1,11 +1,16 @@
-##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(haswell_3
+     OBJECT
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_haswell_asm_d6x8.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_haswell_asm_d8x6.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_haswell_asm_d6x8.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_haswell_asm_d6x8.c
     )
 
+target_compile_options(haswell_3 PRIVATE /arch:AVX2) # MSVC-style switch: build these level-3 kernels with AVX2 code generation
+if(BUILD_SHARED_LIBS) # only when building a shared library
+    target_compile_definitions(haswell_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY) # CMake strips the leading -D; presumably selects symbol export - confirm
+endif()
+
 add_subdirectory(sup)
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
index 79625519c5..f0a8fe34c3 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc.All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -95,8 +95,8 @@ void bli_sgemm_haswell_asm_6x16
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = (uint64_t)k0 / 4;
+	uint64_t k_left = (uint64_t)k0 % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
@@ -907,6 +907,9 @@ void bli_sgemm_haswell_asm_6x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
@@ -957,8 +960,8 @@ void bli_dgemm_haswell_asm_6x8
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = (uint64_t)k0/4;
+	uint64_t k_left = (uint64_t)k0%4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
@@ -1664,6 +1667,9 @@ void bli_dgemm_haswell_asm_6x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
@@ -1720,8 +1726,8 @@ void bli_cgemm_haswell_asm_3x8
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = (uint64_t)k0 / 4;
+	uint64_t k_left = (uint64_t)k0 % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
@@ -2197,6 +2203,9 @@ void bli_cgemm_haswell_asm_3x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -2249,8 +2258,8 @@ void bli_zgemm_haswell_asm_3x4
 
 	// Typecast local copies of integers in case dim_t and inc_t are a
 	// different size than is expected by load instructions.
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
+	uint64_t k_iter = (uint64_t)k0 / 4;
+	uint64_t k_left = (uint64_t)k0 % 4;
 	uint64_t rs_c   = rs_c0;
 	uint64_t cs_c   = cs_c0;
 
@@ -2799,6 +2808,9 @@ void bli_zgemm_haswell_asm_3x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
index 5df6d7a88a..02ea97b155 100644
--- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
+++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -635,6 +636,9 @@ void bli_sgemm_haswell_asm_16x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
@@ -1222,6 +1226,9 @@ void bli_dgemm_haswell_asm_8x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_INFO);
@@ -1755,6 +1762,9 @@ void bli_cgemm_haswell_asm_8x3
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -2283,6 +2293,9 @@ void bli_zgemm_haswell_asm_4x3
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
index 5fd21e883a..939cab78f2 100644
--- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -822,6 +822,9 @@ void bli_sgemmtrsm_l_haswell_asm_6x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1569,6 +1572,9 @@ void bli_dgemmtrsm_l_haswell_asm_6x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9);
diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
index f2032c69d8..bd9d338b3c 100644
--- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
+++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -826,6 +826,9 @@ void bli_sgemmtrsm_u_haswell_asm_6x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1577,6 +1580,9 @@ void bli_dgemmtrsm_u_haswell_asm_6x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9);
diff --git a/kernels/haswell/3/sup/CMakeLists.txt b/kernels/haswell/3/sup/CMakeLists.txt
index 6d13252de5..e5ed6183c2 100644
--- a/kernels/haswell/3/sup/CMakeLists.txt
+++ b/kernels/haswell/3/sup/CMakeLists.txt
@@ -1,7 +1,7 @@
-##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(haswell_3sup
+     OBJECT
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_d6x8m.c
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_d6x8n.c
      #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_s6x16m.c
@@ -11,6 +11,9 @@ target_sources("${PROJECT_NAME}"
      #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16m.c
      #${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_s6x16n.c
     )
+target_compile_options(haswell_3sup PRIVATE /arch:AVX2) # MSVC-style switch: build these sup kernels with AVX2 code generation
+if(BUILD_SHARED_LIBS) # only when building a shared library
+    target_compile_definitions(haswell_3sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY) # CMake strips the leading -D; presumably selects symbol export - confirm
+endif()
 add_subdirectory(d6x8)
 #add_subdirectory(s6x16)
-
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
index 990358db8b..dc81b2d913 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -684,6 +684,9 @@ void bli_dgemmsup_rd_haswell_asm_6x8m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -986,14 +989,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -1004,14 +1006,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	// ---------------------------------- iteration 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_3x4(rax, rbx)
@@ -1029,7 +1031,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	SUBITER_K4_3x4(rax, rbx)
 
@@ -1089,6 +1091,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	                                   // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15)
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -1154,7 +1157,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -1279,14 +1281,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -1296,14 +1297,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	// ---------------------------------- iteration 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_3x4(rax, rbx)
@@ -1323,7 +1324,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -1387,6 +1388,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -1449,14 +1451,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -1467,14 +1468,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 	// ---------------------------------- iteration 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_3x4(rax, rbx)
@@ -1493,7 +1494,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	SUBITER_K4_3x4(rax, rbx)
 
@@ -1564,6 +1565,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -1635,12 +1637,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15", // ymm0-ymm15, mirroring the xmm list above
 	  "memory"
 	)
 
@@ -1744,15 +1749,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -1763,14 +1767,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U
 	// ---------------------------------- iteration 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_2x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_2x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_2x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_2x4(rax, rbx)
@@ -1789,7 +1793,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_2x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -1859,6 +1863,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -1927,12 +1932,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15", // ymm0-ymm15, mirroring the xmm list above
 	  "memory"
 	)
 
@@ -2043,15 +2051,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -2064,14 +2071,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_3x4(rax, rbx)
@@ -2091,7 +2098,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -2160,6 +2167,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -2242,7 +2250,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -2386,15 +2393,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -2406,14 +2412,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 	// ---------------------------------- iteration 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_3x4(rax, rbx)
@@ -2433,7 +2439,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -2502,6 +2508,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -2568,15 +2575,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -2589,7 +2595,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -2649,7 +2655,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -2720,7 +2726,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -2839,6 +2845,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -2910,12 +2917,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -3000,15 +3010,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -3021,7 +3030,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -3071,7 +3080,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -3132,7 +3141,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -3241,6 +3250,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -3309,12 +3319,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_combined_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -3419,15 +3432,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -3439,14 +3451,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 	// ---------------------------------- iteration 0
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 2
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
 	SUBITER_K4_3x4(rax, rbx)
@@ -3466,7 +3478,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -3535,6 +3547,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -3601,15 +3614,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -3622,7 +3634,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -3632,7 +3644,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -3653,7 +3665,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -3722,6 +3734,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -3812,15 +3825,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -3833,7 +3845,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -3844,7 +3856,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -3866,7 +3878,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -3936,6 +3948,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -4002,15 +4015,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -4023,7 +4035,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -4034,7 +4046,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -4056,7 +4068,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -4126,6 +4138,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -4193,12 +4206,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -4303,15 +4319,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -4324,7 +4339,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -4334,7 +4349,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -4355,7 +4370,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -4424,6 +4439,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -4498,15 +4514,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -4519,7 +4534,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_1x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -4529,7 +4544,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_1x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -4550,7 +4565,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_1x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -4593,6 +4608,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -4646,12 +4662,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -4757,15 +4776,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -4778,7 +4796,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -4788,7 +4806,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -4809,7 +4827,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -4878,6 +4896,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -4944,15 +4963,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -4965,7 +4983,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -4975,7 +4993,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -4996,7 +5014,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -5065,6 +5083,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -5151,15 +5170,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -5172,7 +5190,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -5182,7 +5200,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -5203,7 +5221,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -5272,6 +5290,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -5338,15 +5357,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -5359,7 +5377,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -5369,7 +5387,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -5390,7 +5408,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -5459,6 +5477,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -5526,12 +5545,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -5635,15 +5657,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -5656,7 +5677,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_2x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -5666,7 +5687,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_2x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -5687,7 +5708,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_2x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -5756,6 +5777,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -5830,15 +5852,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -5851,7 +5872,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -5861,7 +5882,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -5882,7 +5903,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -5952,6 +5973,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -6018,15 +6040,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -6039,7 +6060,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -6049,7 +6070,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -6070,7 +6091,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -6139,6 +6160,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -6214,12 +6236,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -6324,15 +6349,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -6345,7 +6369,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -6355,7 +6379,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -6376,7 +6400,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -6446,6 +6470,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -6516,15 +6541,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -6537,7 +6561,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -6547,7 +6571,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -6568,7 +6592,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -6637,6 +6661,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -6728,14 +6753,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -6748,7 +6772,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -6797,7 +6821,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -6857,7 +6881,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -6953,6 +6977,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -7016,12 +7041,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_0x0_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -7127,15 +7155,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -7148,7 +7175,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -7158,7 +7185,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -7179,7 +7206,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -7248,6 +7275,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -7314,15 +7342,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -7335,7 +7362,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -7345,7 +7372,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -7366,7 +7393,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -7435,6 +7462,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -7517,15 +7545,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -7538,7 +7565,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -7548,7 +7575,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -7569,7 +7596,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -7638,6 +7665,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -7708,15 +7736,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -7729,7 +7756,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -7739,7 +7766,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -7760,7 +7787,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -7829,6 +7856,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -7896,12 +7924,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x0_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -8006,15 +8037,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -8027,7 +8057,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
 
@@ -8065,7 +8095,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
 
@@ -8114,7 +8144,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
 
@@ -8187,6 +8217,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -8241,15 +8272,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -8262,7 +8292,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -8272,7 +8302,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -8293,7 +8323,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -8362,6 +8392,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -8433,12 +8464,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_6x8_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -8543,15 +8577,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -8564,7 +8597,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -8574,7 +8607,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -8595,7 +8628,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -8664,6 +8697,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -8729,14 +8763,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -8749,7 +8782,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -8759,7 +8792,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -8780,7 +8813,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -8848,6 +8881,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -8926,14 +8960,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -8946,7 +8979,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 1
@@ -8956,7 +8989,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	// ---------------------------------- iteration 3
@@ -8977,7 +9010,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 	SUBITER_K4_3x4(rax, rbx)
 
 	dec(rsi)                           // i -= 1;
@@ -9046,6 +9079,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -9113,14 +9147,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -9133,7 +9166,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 1
@@ -9142,7 +9175,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	SUBITER_K4_3x4(rax, rbx)
 	// ---------------------------------- iteration 3
@@ -9163,7 +9196,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	SUBITER_K4_3x4(rax, rbx)
 
@@ -9233,6 +9266,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -9299,12 +9333,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x8_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -9419,14 +9456,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -9439,7 +9475,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -9489,7 +9525,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -9549,7 +9585,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -9645,6 +9681,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -9707,12 +9744,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_12x16_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -9833,14 +9873,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -9853,7 +9892,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -9903,7 +9942,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -9964,7 +10003,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
@@ -10060,6 +10099,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -10121,12 +10161,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -10176,14 +10219,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -10196,7 +10238,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10256,7 +10298,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10327,7 +10369,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10445,6 +10487,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -10512,14 +10555,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -10532,7 +10574,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10592,7 +10634,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10663,7 +10705,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10780,6 +10822,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -10858,14 +10901,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -10878,7 +10920,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10922,7 +10964,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -10977,7 +11019,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
@@ -11050,6 +11092,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -11101,14 +11144,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -11121,7 +11163,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11181,7 +11223,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11252,7 +11294,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11370,6 +11412,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -11438,12 +11481,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_16x12_combined_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -11549,14 +11595,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -11569,7 +11614,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11629,7 +11674,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11700,7 +11745,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11818,6 +11863,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -11885,14 +11931,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -11905,7 +11950,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -11965,7 +12010,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12036,7 +12081,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12154,6 +12199,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -12233,15 +12279,14 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -12254,7 +12299,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12298,7 +12343,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12353,7 +12398,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
@@ -12436,6 +12481,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -12488,14 +12534,13 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 	vmovapd( ymm4, ymm14)
 	vmovapd( ymm4, ymm15)
 
-	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	lea(mem(r14), rax)                 // rax = a_ii;
 	lea(mem(rdx), rbx)                 // rbx = b_jj;
 
-	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
-	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
+	prefetch(0, mem(r12,         3*8)) // prefetch c + 0*rs_c
+	prefetch(0, mem(r12, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+	prefetch(0, mem(r12, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+	lea(mem(r8,  r8,  4), rcx)         // rcx = 5*rs_a
 
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
@@ -12508,7 +12553,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12568,7 +12613,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12639,7 +12684,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
 	prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
-	prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a
+	prefetch(0, mem(rax, rcx, 1, 0*8)) // prefetch rax + 5*rs_a
 
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
@@ -12756,6 +12801,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
 
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
+	lea(mem(r12), rcx)                 // rcx = c_iijj;
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
 
@@ -12824,12 +12870,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8m_18x16_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -13369,6 +13418,9 @@ void bli_dgemmsup_rd_haswell_asm_6x4m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -13979,12 +14031,15 @@ void bli_dgemmsup_rd_haswell_asm_6x2m
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
index cd00e19760..65c985ef1a 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -315,21 +315,20 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
 #endif
-	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -363,7 +362,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -399,7 +398,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -466,32 +465,32 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -513,21 +512,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -535,12 +534,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -562,22 +561,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -616,7 +615,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -624,73 +623,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -712,7 +711,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -735,12 +734,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -838,7 +840,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -856,7 +858,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -909,21 +911,20 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
 #endif
-	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -957,7 +958,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -993,7 +994,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -1060,32 +1061,32 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	vmovupd(mem(rax, r8, 2), ymm2)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1107,21 +1108,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1129,12 +1130,12 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	vmovsd(mem(rax, r8, 2), xmm2)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1156,22 +1157,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	vfmadd231pd(ymm2, ymm3, ymm15)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
 
 
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
 	                                   // ymm6  ymm9  ymm12 ymm15
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1209,7 +1209,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	                                   // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15)
 
 
-	
+
 
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
@@ -1218,73 +1218,73 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
 	vmulpd(ymm0, ymm6, ymm6)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm6, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -1300,7 +1300,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1323,12 +1323,15 @@ void bli_dgemmsup_rd_haswell_asm_3x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1427,7 +1430,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1445,7 +1448,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -1493,21 +1496,20 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 #endif
-	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -1536,7 +1538,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -1567,7 +1569,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -1624,31 +1626,31 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	vmovupd(mem(rax, r8, 1), ymm1)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1666,21 +1668,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -1688,11 +1690,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	vmovsd(mem(rax, r8, 1), xmm1)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 	vfmadd231pd(ymm1, ymm3, ymm5)
@@ -1710,21 +1712,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	vfmadd231pd(ymm0, ymm3, ymm13)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
+
+	                                   // ymm4  ymm7  ymm10 ymm13
 	                                   // ymm5  ymm8  ymm11 ymm14
-	
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -1751,7 +1753,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -1759,65 +1761,65 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm5, ymm5)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vfmadd231pd(mem(rcx), ymm3, ymm5)
 	vmovupd(ymm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	add(rdi, rcx)
-	
+
 	vmovupd(ymm5, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -1833,7 +1835,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -1856,12 +1858,14 @@ void bli_dgemmsup_rd_haswell_asm_2x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 
@@ -1959,7 +1963,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	begin_asm()
 
 	//vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rdx)                   // load address of a.
 	//mov(var(rs_a), r8)                 // load rs_a
 	//mov(var(cs_a), r9)                 // load cs_a
@@ -1977,7 +1981,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 	lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
 	//lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-	
+
 
 	mov(var(c), r12)                   // load address of c
 	mov(var(rs_c), rdi)                // load rs_c
@@ -2020,21 +2024,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*rs_c
 #endif
-	//lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 	lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
 
-	
 
-	
+
+
 	mov(var(k_iter16), rsi)            // i = k_iter16;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKITER4)                 // if i == 0, jump to code that
 	                                   // contains the k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER16)               // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
@@ -2058,7 +2061,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 1
@@ -2084,7 +2087,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 
 	// ---------------------------------- iteration 2
-	
+
 #if 1
 	prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
 	prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
@@ -2131,30 +2134,30 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER16)                 // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKITER4)
-	
+
 	mov(var(k_iter4), rsi)             // i = k_iter4;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT1)                 // if i == 0, jump to code that
 	                                   // considers k_left1 loop.
 	                                   // else, we prepare to enter k_iter4 loop.
-	
-	
+
+
 	label(.DLOOPKITER4)                // EDGE LOOP (ymm)
-	
+
 	vmovupd(mem(rax       ), ymm0)
 	add(imm(4*8), rax)                 // a += 4*cs_a = 4*8;
-	
+
 	vmovupd(mem(rbx        ), ymm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 
@@ -2168,21 +2171,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(4*8), rbx)                 // b += 4*rs_b = 4*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER4)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 	label(.DCONSIDKLEFT1)
-	
+
 	mov(var(k_left1), rsi)             // i = k_left1;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left1 loop.
-	
-	
+
+
 
 
 	label(.DLOOPKLEFT1)                // EDGE LOOP (scalar)
@@ -2190,10 +2193,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	                                   // using the xmm registers would zero out the
 	                                   // high bits of the destination registers,
 	                                   // which would destory intermediate results.
-	
+
 	vmovsd(mem(rax       ), xmm0)
 	add(imm(1*8), rax)                 // a += 1*cs_a = 1*8;
-	
+
 	vmovsd(mem(rbx        ), xmm3)
 	vfmadd231pd(ymm0, ymm3, ymm4)
 
@@ -2207,20 +2210,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	add(imm(1*8), rbx)                 // b += 1*rs_b = 1*8;
 	vfmadd231pd(ymm0, ymm3, ymm13)
 
-	
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT1)                  // iterate again if i != 0.
-	
-	
-	
+
+
+
 
 
 
 
 	label(.DPOSTACCUM)
-	
-	                                   // ymm4  ymm7  ymm10 ymm13  
-	
+
+	                                   // ymm4  ymm7  ymm10 ymm13
+
 	vhaddpd( ymm7, ymm4, ymm0 )
 	vextractf128(imm(1), ymm0, xmm1 )
 	vaddpd( xmm0, xmm1, xmm0 )
@@ -2235,7 +2238,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 
 
-	
+
 	//mov(var(rs_c), rdi)                // load rs_c
 	//lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(double)
 
@@ -2243,57 +2246,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	//mov(var(cs_c), rsi)                // load cs_c
 	//lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
+
+
+
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx))
 	//add(rdi, rcx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
 
 	add(imm(4*8), r12)                 // c_jj = r12 += 4*cs_c
@@ -2309,7 +2312,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
 
 	label(.DRETURN)
 
-	
+
 
     end_asm(
 	: // output operands (none)
@@ -2332,12 +2335,13 @@ void bli_dgemmsup_rd_haswell_asm_1x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
index f764bc613e..9962e1a95e 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -717,6 +717,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1328,6 +1331,9 @@ void bli_sgemmsup_rd_haswell_asm_6x12m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1941,6 +1947,9 @@ void bli_sgemmsup_rd_haswell_asm_6x8m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -2525,6 +2534,9 @@ void bli_sgemmsup_rd_haswell_asm_6x4m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -3157,6 +3169,9 @@ void bli_sgemmsup_rd_haswell_asm_6x2m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
index 1fe862a8d1..3af06075a8 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -752,6 +752,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1348,6 +1351,9 @@ void bli_sgemmsup_rd_haswell_asm_3x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1885,6 +1891,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 
@@ -2362,6 +2370,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
index 8ac3612bdf..05c240d2d1 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019 - 22, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -890,12 +890,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1587,12 +1590,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1946,12 +1951,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
@@ -2191,13 +2198,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_L
 			[a_next] "m" (a_next),
 			[b_next] "m" (b_next)*/
 			: // register clobber list
-			"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+			"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 			"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 			"xmm0", "xmm1", "xmm2", "xmm3",
 			"xmm4", "xmm5", "xmm6", "xmm7",
 			"xmm8", "xmm9", "xmm10", "xmm11",
 			"xmm12", "xmm13", "xmm14", "xmm15",
-			"memory"
+			"ymm0", "ymm3", "ymm4", "ymm12", "ymm14",
+	 		"memory"
 			)
 }
 
@@ -2331,8 +2339,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 		label(.DPOSTPFETCH)
 		mov(var(ps_a8), rdx)
 		lea(mem(rax, rdx, 1), rdx)	//rdx = a + ps_a8		//for prefetch
-		mov(var(ps_a8), rbp)
-		lea(mem(r11, rbp, 1), rbp)	//rdx = a + ps_a8		//for prefetch
+		mov(var(ps_a8), rcx)
+		lea(mem(r11, rcx, 1), rcx)	//rcx = r11 + ps_a8		//for prefetch
 		mov(var(k_iter), rsi)
 		test(rsi, rsi)
 		je(.DCONSILEFT)
@@ -2341,7 +2349,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 		label(.DMAIN)
 		//0
 		prefetch(0, mem(rdx, 5*8))
-		prefetch(0, mem(rbp, 5*8))
+		prefetch(0, mem(rcx, 5*8))
 		vmovupd(mem(rbx,  0*32), ymm0)
 		vmovupd(mem(rbx,  1*32), ymm1)
 
@@ -2373,7 +2381,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 		add(r9, rax)
 		//1
 		prefetch(0, mem(rdx, r9, 1, 5*8))
-		prefetch(0, mem(rbp, r9, 1, 5*8))
+		prefetch(0, mem(rcx, r9, 1, 5*8))
 		vmovupd(mem(rbx,  0*32), ymm0)
 		vmovupd(mem(rbx,  1*32), ymm1)
 
@@ -2405,7 +2413,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 		add(r9, rax)
 		//2
 		prefetch(0, mem(rdx, r9, 2, 5*8))
-		prefetch(0, mem(rbp, r9, 2, 5*8))
+		prefetch(0, mem(rcx, r9, 2, 5*8))
 		vmovupd(mem(rbx,  0*32), ymm0)
 		vmovupd(mem(rbx,  1*32), ymm1)
 
@@ -2436,10 +2444,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 		add(r10, rbx)
 		add(r9, rax)
 		//3
-		prefetch(0, mem(rdx, rcx, 1, 5*8))
-		prefetch(0, mem(rbp, rcx, 1, 5*8))
-		lea(mem(rdx, r9,  4), rdx)
-		lea(mem(rbp, r9,  4), rbp)
+		lea(mem(rdx, r9,  2), rdx)
+		lea(mem(rcx, r9,  2), rcx)
+		prefetch(0, mem(rdx, r9, 1, 5*8))
+		prefetch(0, mem(rcx, r9, 1, 5*8))
+		lea(mem(rdx, r9,  2), rdx)
+		lea(mem(rcx, r9,  2), rcx)
 
 		vmovupd(mem(rbx,  0*32), ymm0)
 		vmovupd(mem(rbx,  1*32), ymm1)
@@ -2481,8 +2491,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 
 		label(.DLEFT)
 		prefetch(0, mem(rdx, 5*8))
-		prefetch(0, mem(rbp, 5*8))
-		add(r9, rbp)
+		prefetch(0, mem(rcx, 5*8))
+		add(r9, rcx)
 		add(r9, rdx)
 		vmovupd(mem(rbx,  0*32), ymm0)
 		vmovupd(mem(rbx,  1*32), ymm1)
@@ -2836,12 +2846,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_16x12_combined_L
 			[a_next] "m" (a_next),
 			[b_next] "m" (b_next)*/
 			: // register clobber list
-			"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+			"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 			"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 			"xmm0", "xmm1", "xmm2", "xmm3",
 			"xmm4", "xmm5", "xmm6", "xmm7",
 			"xmm8", "xmm9", "xmm10", "xmm11",
 			"xmm12", "xmm13", "xmm14", "xmm15",
+			"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+			"ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+			"ymm12", "ymm13", "ymm14", "ymm15",
 			"memory"
 			)
 	}
@@ -3455,12 +3468,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -4084,12 +4100,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -4688,12 +4707,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_L
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -5214,12 +5236,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -5771,12 +5796,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x8_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -6327,12 +6355,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x16_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -6578,12 +6609,13 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_6x0_U
 			[a_next] "m" (a_next),
 			[b_next] "m" (b_next)*/
 			: // register clobber list
-			"rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+			"rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 			"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 			"xmm0", "xmm1", "xmm2", "xmm3",
 			"xmm4", "xmm5", "xmm6", "xmm7",
 			"xmm8", "xmm9", "xmm10", "xmm11",
 			"xmm12", "xmm13", "xmm14", "xmm15",
+			"ymm0", "ymm2", "ymm3", "ymm5", "ymm7",
 			"memory"
 			)
 }
@@ -6953,12 +6985,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_12x8_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm5",
+	  "ymm7", "ymm9", "ymm11",
 	  "memory"
 	)
 }
@@ -7440,12 +7474,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_18x16_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm13", "ymm15",
 	  "memory"
 	)
 }
@@ -7615,9 +7652,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 
 	vmovupd(mem(rbx,  1*64), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	lea(mem(rax, r13, 2), rbp)
-	vbroadcastsd(mem(rbp       ), ymm2)
-	vbroadcastsd(mem(rbp, r8, 1), ymm3)
+	lea(mem(rax, r13, 2), r11)
+	vbroadcastsd(mem(r11       ), ymm2)
+	vbroadcastsd(mem(r11, r8, 1), ymm3)
 	vfmadd231pd(ymm1, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
@@ -7652,9 +7689,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 
 	vmovupd(mem(rbx,  1*64), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	lea(mem(rax, r13, 2), rbp)
-	vbroadcastsd(mem(rbp       ), ymm2)
-	vbroadcastsd(mem(rbp, r8, 1), ymm3)
+	lea(mem(rax, r13, 2), r11)
+	vbroadcastsd(mem(r11       ), ymm2)
+	vbroadcastsd(mem(r11, r8, 1), ymm3)
 	vfmadd231pd(ymm1, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 
@@ -7689,9 +7726,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 
 	vmovupd(mem(rbx,  1*64), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	lea(mem(rax, r13, 2), rbp)
-	vbroadcastsd(mem(rbp       ), ymm2)
-	vbroadcastsd(mem(rbp, r8, 1), ymm3)
+	lea(mem(rax, r13, 2), r11)
+	vbroadcastsd(mem(r11       ), ymm2)
+	vbroadcastsd(mem(r11, r8, 1), ymm3)
 	vfmadd231pd(ymm1, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	add(r9, rax)                       // a += cs_a;
@@ -7725,9 +7762,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 
 	vmovupd(mem(rbx,  1*64), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	lea(mem(rax, r13, 2), rbp)
-	vbroadcastsd(mem(rbp       ), ymm2)
-	vbroadcastsd(mem(rbp, r8, 1), ymm3)
+	lea(mem(rax, r13, 2), r11)
+	vbroadcastsd(mem(r11       ), ymm2)
+	vbroadcastsd(mem(r11, r8, 1), ymm3)
 	vfmadd231pd(ymm1, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	add(r9, rax)                       // a += cs_a;
@@ -7771,9 +7808,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 
 	vmovupd(mem(rbx,  1*64), ymm0)
 	add(r10, rbx)                      // b += rs_b;
-	lea(mem(rax, r13, 2), rbp)
-	vbroadcastsd(mem(rbp       ), ymm2)
-	vbroadcastsd(mem(rbp, r8, 1), ymm3)
+	lea(mem(rax, r13, 2), r11)
+	vbroadcastsd(mem(r11       ), ymm2)
+	vbroadcastsd(mem(r11, r8, 1), ymm3)
 	vfmadd231pd(ymm1, ymm2, ymm12)
 	vfmadd231pd(ymm1, ymm3, ymm14)
 	add(r9, rax)                       // a += cs_a;
@@ -7909,12 +7946,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 	vunpckhpd(ymm14, ymm12, ymm1)
 	vextractf128(imm(0x1), ymm0, xmm2)
 	vextractf128(imm(0x1), ymm1, xmm4)
-	lea(mem(rcx, 6*8), rbp)
-	lea(mem(rbp, rsi, 2), rbp)
-	vfmadd231pd(mem(rbp ), xmm3, xmm2)
-	vfmadd231pd(mem(rbp, rsi, 1), xmm3, xmm4)
-	vmovlpd(xmm2, mem(rbp))
-	vmovupd(xmm4, mem(rbp, rsi, 1))
+	lea(mem(rcx, 6*8), r11)
+	lea(mem(r11, rsi, 2), r11)
+	vfmadd231pd(mem(r11 ), xmm3, xmm2)
+	vfmadd231pd(mem(r11, rsi, 1), xmm3, xmm4)
+	vmovlpd(xmm2, mem(r11))
+	vmovupd(xmm4, mem(r11, rsi, 1))
 
 	lea(mem(rdx, rsi, 4), rdx)
 
@@ -8022,11 +8059,11 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
 	vunpckhpd(ymm14, ymm12, ymm1)
 	vextractf128(imm(0x1), ymm0, xmm2)
 	vextractf128(imm(0x1), ymm1, xmm4)
-	lea(mem(rcx, rdi, 4), rbp)
-	lea(mem(rbp, rdi, 2), rbp)
-	lea(mem(rbp, rsi, 2), rbp)
-	vmovlpd(xmm2, mem(rbp))
-	vmovupd(xmm4, mem(rbp, rsi, 1))
+	lea(mem(rcx, rdi, 4), r11)
+	lea(mem(r11, rdi, 2), r11)
+	lea(mem(r11, rsi, 2), r11)
+	vmovlpd(xmm2, mem(r11))
+	vmovupd(xmm4, mem(r11, rsi, 1))
 
 	lea(mem(rdx, rsi, 4), rdx)
 
@@ -8079,12 +8116,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m_0x0_combined_U
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -8775,12 +8815,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6m
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -9433,12 +9476,14 @@ void bli_dgemmsup_rv_haswell_asm_6x4m
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
@@ -10066,12 +10111,14 @@ void bli_dgemmsup_rv_haswell_asm_6x2m
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
index a473ae33c2..4cdc763b67 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -866,12 +866,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1616,12 +1619,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13",
 	  "memory"
 	)
 
@@ -2275,12 +2281,14 @@ void bli_dgemmsup_rv_haswell_asm_4x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 
@@ -2955,12 +2963,15 @@ void bli_dgemmsup_rv_haswell_asm_3x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -3516,12 +3527,14 @@ void bli_dgemmsup_rv_haswell_asm_2x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 
@@ -4041,12 +4054,13 @@ void bli_dgemmsup_rv_haswell_asm_1x8n
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
index c299047ff9..d1c251bcbd 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -1030,6 +1030,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1897,6 +1900,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -2610,6 +2616,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
@@ -3356,6 +3364,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
@@ -4025,6 +4035,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
@@ -4664,6 +4676,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
index 7463707cc9..af4ab52a02 100644
--- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
+++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -982,6 +982,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1914,6 +1917,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13",
 	  "memory"
 	)
 
@@ -2672,6 +2678,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 
@@ -3491,6 +3499,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10",
+	  "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -4129,6 +4140,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 
@@ -4780,6 +4793,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 
diff --git a/kernels/haswell/3/sup/d6x8/CMakeLists.txt b/kernels/haswell/3/sup/d6x8/CMakeLists.txt
index ce3bade013..c74dff9372 100644
--- a/kernels/haswell/3/sup/d6x8/CMakeLists.txt
+++ b/kernels/haswell/3/sup/d6x8/CMakeLists.txt
@@ -1,7 +1,7 @@
-##Copyright (C) 2020-2021, Advanced Micro Devices, Inc. All rights reserved.## 
+##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(haswell_3supd6x8
+     OBJECT
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_r_haswell_ref_dMx1.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx1.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_haswell_asm_dMx2.c
@@ -13,4 +13,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx6.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_haswell_asm_dMx8.c
     )
 
-
+target_compile_options(haswell_3supd6x8 PRIVATE /arch:AVX2)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(haswell_3supd6x8 PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
index 8d3900f2e8..6d9dd365ee 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -564,12 +564,14 @@ void bli_dgemmsup_rd_haswell_asm_6x1
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm3", "ymm4", "ymm6", "ymm8",
+	  "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 }
@@ -973,12 +975,13 @@ void bli_dgemmsup_rd_haswell_asm_3x1
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm3", "ymm4", "ymm6", "ymm8",
 	  "memory"
 	)
 }
@@ -1347,12 +1350,13 @@ void bli_dgemmsup_rd_haswell_asm_2x1
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm3", "ymm4", "ymm6",
 	  "memory"
 	)
 }
@@ -1686,12 +1690,13 @@ void bli_dgemmsup_rd_haswell_asm_1x1
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm3", "ymm4",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
index af498eb0ee..94a8e9639e 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -612,6 +612,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1045,6 +1048,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9",
 	  "memory"
 	)
 }
@@ -1437,6 +1442,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -1788,6 +1794,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
index f19b703b41..01e2d0a3dd 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -171,7 +171,6 @@ void bli_dgemmsup_rd_haswell_asm_6x4
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
 #endif
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
 
 
@@ -583,12 +582,15 @@ void bli_dgemmsup_rd_haswell_asm_6x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1039,6 +1041,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 }
@@ -1442,6 +1446,7 @@ void bli_dgemmsup_rd_haswell_asm_1x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
index 571444bed3..9b97a40a45 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -254,7 +254,6 @@ void bli_dgemmsup_rd_haswell_asm_6x8
 	prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
 	prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
 #endif
-	lea(mem(r8,  r8,  4), rbp)         // rbp = 5*rs_a
 
 	
 
@@ -674,12 +673,15 @@ void bli_dgemmsup_rd_haswell_asm_6x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1187,12 +1189,14 @@ void bli_dgemmsup_rd_haswell_asm_2x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 }
@@ -1611,6 +1615,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
index eb1118196b..7c2fd21e1e 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -163,7 +163,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+
 	prefetch(0, mem(rcx,         5*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
 
@@ -544,12 +544,13 @@ void bli_dgemmsup_rv_haswell_asm_6x2
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
 	  "memory"
 	)
 }
@@ -635,7 +636,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+
 	prefetch(0, mem(rcx,         4*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
 
@@ -994,12 +995,13 @@ void bli_dgemmsup_rv_haswell_asm_5x2
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
 	  "memory"
 	)
 }
@@ -1084,7 +1086,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+
 	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
 
@@ -1402,12 +1404,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
 	  "memory"
 	)
 }
@@ -1491,7 +1494,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+
 	prefetch(0, mem(rcx,         2*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
 
@@ -1807,12 +1810,14 @@ void bli_dgemmsup_rv_haswell_asm_3x2
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -1895,7 +1900,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+
 	prefetch(0, mem(rcx,         1*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
 
@@ -2157,12 +2162,13 @@ void bli_dgemmsup_rv_haswell_asm_2x2
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3",
 	  "memory"
 	)
 }
@@ -2244,7 +2250,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	//lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+
 	prefetch(0, mem(rcx,         0*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
 
@@ -2484,12 +2490,13 @@ void bli_dgemmsup_rv_haswell_asm_1x2
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
index bdcf833e3d..ad43e7ba57 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	// -------------------------------------------------------------------------
 
 	begin_asm()
-	
+
 	vzeroall()                         // zero all xmm/ymm registers.
-	
+
 	mov(var(a), rax)                   // load address of a.
 	mov(var(rs_a), r8)                 // load rs_a
 	mov(var(cs_a), r9)                 // load cs_a
@@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	//mov(var(cs_b), r11)                // load cs_b
 	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(double)
 	//lea(mem(, r11, 8), r11)            // cs_b *= sizeof(double)
-	
+
 	                                   // NOTE: We cannot pre-load elements of a or b
 	                                   // because it could eventually, in the last
 	                                   // unrolled iter or the cleanup loop, result
@@ -163,38 +163,38 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
 	prefetch(0, mem(rcx,         5*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(rcx, rsi, 2, 5*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 5*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
-	
-	
+
+
 #if 1
 	lea(mem(rax, r9,  8), rdx)         //
 	lea(mem(rdx, r9,  8), rdx)         // rdx = a + 16*cs_a;
 #endif
 
 
-	
-	
+
+
 	mov(var(k_iter), rsi)              // i = k_iter;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DCONSIDKLEFT)                  // if i == 0, jump to code that
 	                                   // contains the k_left loop.
-	
-	
+
+
 	label(.DLOOPKITER)                 // MAIN LOOP
-	
-	
+
+
 	// ---------------------------------- iteration 0
 
 #if 1
 	prefetch(0, mem(rdx, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -202,19 +202,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
 
-	
+
 	// ---------------------------------- iteration 1
 
 #if 0
@@ -228,25 +228,25 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 2
 
 #if 1
 	prefetch(0, mem(rdx, r9, 2, 5*8))
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -254,18 +254,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
+
 
 	// ---------------------------------- iteration 3
 
@@ -280,43 +280,43 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
-	
+
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKITER)                   // iterate again if i != 0.
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	label(.DCONSIDKLEFT)
-	
+
 	mov(var(k_left), rsi)              // i = k_left;
 	test(rsi, rsi)                     // check i via logical AND.
 	je(.DPOSTACCUM)                    // if i == 0, we're done; jump to end.
 	                                   // else, we prepare to enter k_left loop.
-	
-	
+
+
 	label(.DLOOPKLEFT)                 // EDGE LOOP
 
 #if 0
 	prefetch(0, mem(rdx, 5*8))
 	add(r9, rdx)
 #endif
-	
+
 	vmovupd(mem(rbx, 0*32), ymm0)
 	add(r10, rbx)                      // b += rs_b;
 
@@ -324,57 +324,57 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vbroadcastsd(mem(rax, r8,  1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm4)
 	vfmadd231pd(ymm0, ymm3, ymm6)
-	
+
 	vbroadcastsd(mem(rax, r8,  2), ymm2)
 	vbroadcastsd(mem(rax, r13, 1), ymm3)
 	vfmadd231pd(ymm0, ymm2, ymm8)
 	vfmadd231pd(ymm0, ymm3, ymm10)
-	
+
 	vbroadcastsd(mem(rax, r8,  4), ymm2)
 	vbroadcastsd(mem(rax, r15, 1), ymm3)
 	add(r9, rax)                       // a += cs_a;
 	vfmadd231pd(ymm0, ymm2, ymm12)
 	vfmadd231pd(ymm0, ymm3, ymm14)
-	
-	
+
+
 	dec(rsi)                           // i -= 1;
 	jne(.DLOOPKLEFT)                   // iterate again if i != 0.
-	
-	
-	
+
+
+
 	label(.DPOSTACCUM)
 
 
-	
+
 	mov(var(alpha), rax)               // load address of alpha
 	mov(var(beta), rbx)                // load address of beta
 	vbroadcastsd(mem(rax), ymm0)       // load alpha and duplicate
 	vbroadcastsd(mem(rbx), ymm3)       // load beta and duplicate
-	
+
 	vmulpd(ymm0, ymm4, ymm4)           // scale by alpha
 	vmulpd(ymm0, ymm6, ymm6)
 	vmulpd(ymm0, ymm8, ymm8)
 	vmulpd(ymm0, ymm10, ymm10)
 	vmulpd(ymm0, ymm12, ymm12)
 	vmulpd(ymm0, ymm14, ymm14)
-	
-	
-	
-	
-	
-	
+
+
+
+
+
+
 	mov(var(cs_c), rsi)                // load cs_c
 	lea(mem(, rsi, 8), rsi)            // rsi = cs_c * sizeof(double)
-	
+
 	//lea(mem(rcx, rsi, 4), rdx)         // load address of c +  4*cs_c;
 	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
 
 	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-	
-	
-	
+
+
+
 	                                   // now avoid loading C if beta == 0
-	
+
 	vxorpd(ymm0, ymm0, ymm0)           // set ymm0 to zero.
 	vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
 	je(.DBETAZERO)                     // if ZF = 1, jump to beta == 0 case
@@ -383,42 +383,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORED)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORED)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4)
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6)
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8)
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10)
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12)
 	vmovupd(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14)
 	vmovupd(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
-	
-	
+
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -466,45 +466,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 
 
 	jmp(.DDONE)                        // jump to end.
-	
-	
-	
-	
+
+
+
+
 	label(.DBETAZERO)
 
 
 	cmp(imm(8), rdi)                   // set ZF if (8*rs_c) == 8.
 	jz(.DCOLSTORBZ)                    // jump to column storage case
-	
 
-	
+
+
 	label(.DROWSTORBZ)
-	
-	
+
+
 	vmovupd(ymm4, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm6, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm8, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm10, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
-	
+
+
 	vmovupd(ymm12, mem(rcx, 0*32))
 	add(rdi, rcx)
-	
+
 
 	vmovupd(ymm14, mem(rcx, 0*32))
 	//add(rdi, rcx)
 
-	
+
 	jmp(.DDONE)                        // jump to end.
 
 
@@ -539,13 +539,13 @@ void bli_dgemmsup_rv_haswell_asm_6x4
 	vmovupd(xmm4, mem(rdx, rax, 1))
 
 	//lea(mem(rdx, rsi, 4), rdx)
-	
-	
-	
-	
+
+
+
+
 	label(.DDONE)
-	
-	
+
+
 
     end_asm(
 	: // output operands (none)
@@ -566,12 +566,14 @@ void bli_dgemmsup_rv_haswell_asm_6x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 }
@@ -657,11 +659,11 @@ void bli_dgemmsup_rv_haswell_asm_5x4
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
 	prefetch(0, mem(rcx,         4*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 4*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(rcx, rsi, 2, 4*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rcx, rbp, 1, 4*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 4*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
 
@@ -1037,12 +1039,14 @@ void bli_dgemmsup_rv_haswell_asm_5x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8", "ymm10", "ymm12",
 	  "memory"
 	)
 }
@@ -1127,11 +1131,11 @@ void bli_dgemmsup_rv_haswell_asm_4x4
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
 	prefetch(0, mem(rcx,         3*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(rcx, rsi, 2, 3*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 3*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
 	
@@ -1457,12 +1461,14 @@ void bli_dgemmsup_rv_haswell_asm_4x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -1546,11 +1552,11 @@ void bli_dgemmsup_rv_haswell_asm_3x4
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
 	prefetch(0, mem(rcx,         2*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(rcx, rsi, 2, 2*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 2*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
 	
@@ -1884,12 +1890,14 @@ void bli_dgemmsup_rv_haswell_asm_3x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -1972,11 +1980,11 @@ void bli_dgemmsup_rv_haswell_asm_2x4
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
 	prefetch(0, mem(rcx,         1*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(rcx, rsi, 2, 1*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 1*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
 	
@@ -2247,12 +2255,13 @@ void bli_dgemmsup_rv_haswell_asm_2x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
 	  "memory"
 	)
 }
@@ -2334,11 +2343,11 @@ void bli_dgemmsup_rv_haswell_asm_1x4
 
 	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
 	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(double)
-	lea(mem(rsi, rsi, 2), rbp)         // rbp = 3*cs_c;
+	lea(mem(rsi, rsi, 2), rdx)         // rdx = 3*cs_c;
 	prefetch(0, mem(rcx,         0*8)) // prefetch c + 0*cs_c
 	prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c
 	prefetch(0, mem(rcx, rsi, 2, 0*8)) // prefetch c + 2*cs_c
-	prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c
+	prefetch(0, mem(rcx, rdx, 1, 0*8)) // prefetch c + 3*cs_c
 
 	label(.DPOSTPFETCH)                // done prefetching c
 	
@@ -2588,12 +2597,13 @@ void bli_dgemmsup_rv_haswell_asm_1x4
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
index 9da1e7b838..9f80ef2f0d 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -682,12 +682,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1257,12 +1260,14 @@ void bli_dgemmsup_rv_haswell_asm_5x6
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
 	  "memory"
 	)
 }
@@ -1755,12 +1760,14 @@ void bli_dgemmsup_rv_haswell_asm_4x6
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -2270,12 +2277,14 @@ void bli_dgemmsup_rv_haswell_asm_3x6
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -2701,12 +2710,14 @@ void bli_dgemmsup_rv_haswell_asm_2x6
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -3078,12 +3089,13 @@ void bli_dgemmsup_rv_haswell_asm_1x6
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
index a6c8f0e43d..2a04011f37 100644
--- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
+++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -798,12 +798,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1393,12 +1396,14 @@ void bli_dgemmsup_rv_haswell_asm_5x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
 	  "memory"
 	)
 }
@@ -1901,12 +1906,14 @@ void bli_dgemmsup_rv_haswell_asm_4x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -2435,12 +2442,14 @@ void bli_dgemmsup_rv_haswell_asm_3x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -2858,12 +2867,13 @@ void bli_dgemmsup_rv_haswell_asm_2x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -3248,12 +3258,13 @@ void bli_dgemmsup_rv_haswell_asm_1x8
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
 	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi","rbp",
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
 	  "xmm0", "xmm1", "xmm2", "xmm3",
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
index 1eb8d926c9..fe6d124d32 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -582,6 +582,9 @@ void bli_sgemmsup_rd_haswell_asm_6x1
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -997,6 +1000,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "ymm9",
 	  "memory"
 	)
 }
@@ -1375,6 +1379,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -1719,6 +1724,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
index 1d3d88309f..b7b0b46a1b 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -628,6 +628,9 @@ void bli_sgemmsup_rd_haswell_asm_6x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1105,6 +1108,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 }
@@ -1520,6 +1525,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
index bbb75a6fcd..9819671c7d 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -703,6 +703,9 @@ void bli_sgemmsup_rd_haswell_asm_6x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1222,6 +1225,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 }
@@ -1642,6 +1647,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
index 1e3240350b..190eb9d1d7 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -630,6 +630,9 @@ void bli_sgemmsup_rd_haswell_asm_6x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1072,6 +1075,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm6", "ymm7", "ymm8", "ymm9",
 	  "memory"
 	)
 }
@@ -1469,6 +1474,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -1822,6 +1828,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
index 9d4e9d51d2..d167bc08fb 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -600,6 +600,9 @@ void bli_sgemmsup_rd_haswell_asm_6x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1057,6 +1060,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
+	  "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 }
@@ -1451,6 +1456,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
index 788912ecf6..498002da90 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -628,6 +628,9 @@ void bli_sgemmsup_rd_haswell_asm_6x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13","ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1105,6 +1108,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm7",
+	  "ymm8", "ymm10", "ymm11", "ymm13", "ymm14",
 	  "memory"
 	)
 }
@@ -1520,6 +1525,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm7", "ymm10", "ymm13",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
index 1bea78ee73..dd2c392e9c 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -775,6 +775,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1474,6 +1477,8 @@ void bli_sgemmsup_rv_haswell_asm_5x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
 	  "memory"
 	)
 }
@@ -2031,6 +2036,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -2619,6 +2626,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9",
 	  "memory"
 	)
 }
@@ -3065,6 +3074,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -3512,6 +3522,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
index 6a08cecd43..f6443e8b50 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -916,6 +916,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12",
+	  "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 }
@@ -1672,6 +1675,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
 	  "memory"
 	)
 }
@@ -2257,6 +2262,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -2898,6 +2905,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+	  "ymm7", "ymm8", "ymm9",
 	  "memory"
 	)
 }
@@ -3367,6 +3376,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -3849,6 +3859,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
index ac4e1ee0b0..1d80111ea8 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -662,6 +662,9 @@ void bli_sgemmsup_rv_haswell_asm_6x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
+	  "ymm12", "ymm14",
 	  "memory"
 	)
 }
@@ -1236,6 +1239,9 @@ void bli_sgemmsup_rv_haswell_asm_5x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
+	  "ymm12",
 	  "memory"
 	)
 }
@@ -1723,6 +1729,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -2211,6 +2219,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4",
+	  "ymm6", "ymm8",
 	  "memory"
 	)
 }
@@ -2611,6 +2621,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4",
+	  "ymm6",
 	  "memory"
 	)
 }
@@ -3000,6 +3012,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4",
 	  "memory"
 	)
 }
@@ -3389,6 +3402,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4",
 	  "memory"
 	)
 }
diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
index 2b1a221ada..43210cdc5a 100644
--- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
+++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -628,6 +628,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12", "ymm14",
 	  "memory"
 	)
 }
@@ -1179,6 +1181,8 @@ void bli_sgemmsup_rv_haswell_asm_5x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10", "ymm12",
 	  "memory"
 	)
 }
@@ -1636,6 +1640,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm6",
+	  "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -2116,6 +2122,7 @@ void bli_sgemmsup_rv_haswell_asm_3x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm6", "ymm8",
 	  "memory"
 	)
 }
@@ -2502,6 +2509,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4", "ymm6",
 	  "memory"
 	)
 }
@@ -2889,6 +2897,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm2", "ymm3", "ymm4",
 	  "memory"
 	)
 }
diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c
index 1e52805fec..c7c176b48d 100644
--- a/kernels/power10/3/bli_i16gemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16gemm_power10_mma.c
@@ -137,4 +137,4 @@ void bli_i16gemm_power10_mma_8x16
         SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
         SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
-}
\ No newline at end of file
+}
diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c
index 1ddafd7229..d497b5aad9 100644
--- a/kernels/power10/3/bli_i16sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c
@@ -137,4 +137,4 @@ void bli_i16sgemm_power10_mma_8x16
         SAVE_ACC_bz(iv4sf_t, &acc6, rs_c,  8+4*rs_c);
         SAVE_ACC_bz(iv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
-}
\ No newline at end of file
+}
diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c
index 7a65157aef..05f4dbaebc 100644
--- a/kernels/power10/3/bli_sbgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sbgemm_power10_mma.c
@@ -138,4 +138,4 @@ void bli_sbgemm_power10_mma_8x16
         SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
 
-}
\ No newline at end of file
+}
diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c
index 196bc085fb..333d08f3ec 100644
--- a/kernels/power10/3/bli_sgemm_power10_mma.c
+++ b/kernels/power10/3/bli_sgemm_power10_mma.c
@@ -141,4 +141,4 @@ void bli_sgemm_power10_mma_8x16
         SAVE_ACC_bz(fv4sf_t, &acc6, rs_c,  8+4*rs_c);
         SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
-}
\ No newline at end of file
+}
diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c
index 8a16fdc063..8619bf315f 100644
--- a/kernels/power10/3/bli_shgemm_power10_mma.c
+++ b/kernels/power10/3/bli_shgemm_power10_mma.c
@@ -138,4 +138,4 @@ void bli_shgemm_power10_mma_8x16
         SAVE_ACC_bz(fv4sf_t, &acc7, rs_c, 12+4*rs_c);
     }
 
-}
\ No newline at end of file
+}
diff --git a/kernels/power10/3/vector_int_macros.h b/kernels/power10/3/vector_int_macros.h
index dbd09b6d9a..6ad7552df6 100644
--- a/kernels/power10/3/vector_int_macros.h
+++ b/kernels/power10/3/vector_int_macros.h
@@ -68,4 +68,4 @@ typedef __vector unsigned char vec_t;
     rowC[0] = alpha_ * result[2];                      \
     rowC = (v_t *) &C0[3*rs_c+j];                   \
     rowC[0] = alpha_ * result[3];
-    
\ No newline at end of file
+    
diff --git a/kernels/power9/bli_kernels_power9.h b/kernels/power9/bli_kernels_power9.h
index 9f4d08ccb2..9df9979c6d 100644
--- a/kernels/power9/bli_kernels_power9.h
+++ b/kernels/power9/bli_kernels_power9.h
@@ -35,4 +35,4 @@
 // -- level-3 --
 
 // gemm (asm d12x6)
-GEMM_UKR_PROT( double,   d, gemm_power9_asm_12x6 )
\ No newline at end of file
+GEMM_UKR_PROT( double,   d, gemm_power9_asm_12x6 )
diff --git a/kernels/skx/3/CMakeLists.txt b/kernels/skx/3/CMakeLists.txt
index 30857ba975..e4125f1b60 100644
--- a/kernels/skx/3/CMakeLists.txt
+++ b/kernels/skx/3/CMakeLists.txt
@@ -1,7 +1,11 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(skx_3
+     OBJECT
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_skx_asm_16x14.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_sgemm_skx_asm_32x12_l2.c
     )
+target_compile_options(skx_3 PRIVATE /arch:AVX2 /arch:AVX512)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(skx_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/skx/CMakeLists.txt b/kernels/skx/CMakeLists.txt
index bc8f1eaab3..a9ba638da8 100644
--- a/kernels/skx/CMakeLists.txt
+++ b/kernels/skx/CMakeLists.txt
@@ -1,4 +1,4 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
-
-add_subdirectory(3)
+##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.##
+remove_definitions(/arch:AVX2)
 
+add_subdirectory(3)
\ No newline at end of file
diff --git a/kernels/zen/1/CMakeLists.txt b/kernels/zen/1/CMakeLists.txt
index dbdd1533e2..87db4ac1c7 100644
--- a/kernels/zen/1/CMakeLists.txt
+++ b/kernels/zen/1/CMakeLists.txt
@@ -1,7 +1,7 @@
 ##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}"
-     PRIVATE
+add_library(zen_1
+     OBJECT
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_zen_int.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_zen_int.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpbyv_zen_int10.c
@@ -16,4 +16,9 @@ target_sources("${PROJECT_NAME}"
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_setv_zen_int.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_swapv_zen_int8.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_norm2_zen_int.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_scal2v_zen_int.c
     )
+target_compile_options(zen_1 PRIVATE /arch:AVX2)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen_1 PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c
index 7f799fa628..3adb524799 100644
--- a/kernels/zen/1/bli_amaxv_zen_int.c
+++ b/kernels/zen/1/bli_amaxv_zen_int.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2016 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin	
 
    Redistribution and use in source and binary forms, with or without
@@ -255,8 +255,8 @@ void bli_samaxv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// later, especially if BLIS is compiled with -mfpmath=sse).
+	// transitioning from AVX to SSE instructions (which may occur later,
+	// especially if BLIS is compiled with -mfpmath=sse).
 	_mm256_zeroupper();
 
 	/* Store final index to output variable. */
@@ -265,159 +265,648 @@ void bli_samaxv_zen_int
 }
 
 // -----------------------------------------------------------------------------
-void bli_damaxv_zen_int
-     (
-       dim_t            n,
-       double* restrict x, inc_t incx,
-       dim_t*  restrict i_max,
-       cntx_t* restrict cntx
-     )
-{
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
-	double* minus_one = PASTEMAC(d,m1);
-	dim_t*  zero_i    = PASTEMAC(i,0);
 
-	double  chi1_r;
-	//double  chi1_i;
-	double  abs_chi1;
-	double  abs_chi1_max;
-	dim_t   i_max_l;
-	dim_t   i;
 
-	/* If the vector length is zero, return early. This directly emulates
-	   the behavior of netlib BLAS's i?amax() routines. */
-	if ( bli_zero_dim1( n ) )
+/*
+	This macro takes a __m128d vector and a double pointer as inputs. It
+	stores the largest element in the __m128d in the address pointed by the
+	double pointer.
+
+	Signature
+	----------
+
+	* 'max_res' - __m128d
+	* 'max_val' - Double pointer
+*/
+#define _mm_vec_max_pd(max_res, max_val)\
+	*(max_val) = (max_res[0] >= max_res[1]) ? max_res[0] : max_res[1];
+
+/*
+	Functionality
+	--------------
+
+	This function finds the first occurrence of the absolute largest element in a double
+	array and the range (start and end index) in which that element can be found.
+
+	Function signature
+	-------------------
+
+	This function takes a void pointer as input (internally cast to a double pointer)
+	which points to an array of type double, the corresponding array's stride and length.
+	It uses the function parameters to return the output.
+
+	* 'x' - Void pointer pointing to an array of type double
+	* 'incx' - Stride to point to the next element in the array
+	* 'n' - Length of the array passed
+	* 'abs_max_num' - Double pointer to the memory of the absolute largest element
+	* 'start_index', 'end_index' - Range in which the largest element can be found
+
+	The function has been made static to restrict its scope.
+
+	Exception
+	----------
+
+	1. When the length of the array or the increment is zero set the absolute maximum, start
+	   index and end index to -1.
+*/
+static void bli_vec_absmax_double
+(
+	const void *x, dim_t incx, dim_t n,
+	double *abs_max_num,
+	dim_t *start_index, dim_t *end_index
+)
+{
+	/*
+		When the length of the array or the increment
+		is zero set the absolute maximum, start index and
+		end index to -1.
+	*/
+	if ( bli_zero_dim1( n ) || bli_zero_dim1( incx ) )
 	{
-		PASTEMAC(i,copys)( *zero_i, *i_max );
-        AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
+		*abs_max_num = -1;
+		*start_index = *end_index = -1;
+
 		return;
 	}
 
-	/* Initialize the index of the maximum absolute value to zero. */ \
-	PASTEMAC(i,copys)( *zero_i, i_max_l );
+	// Cast the void pointer to double
+	double *temp_ptr = (double *)x;
+	double temp_max_val, curr_max_val = -1;
 
-	/* Initialize the maximum absolute value search candidate with
-	   -1, which is guaranteed to be less than all values we will
-	   compute. */
-	PASTEMAC(d,copys)( *minus_one, abs_chi1_max );
+	dim_t window_start, window_end, i = 0;
+	window_start = window_end = 0;
 
-	// For non-unit strides, or very small vector lengths, compute with
-	// scalar code.
-	if ( incx != 1 || n < 4 )
+	/*
+		When incx == 1 and n >= 2 the compute can be
+		vectorized using AVX-2 or SSE instructions
+	*/
+	if (incx == 1 && n >= 2)
 	{
-		for ( i = 0; i < n; ++i )
+		dim_t const n_elem_per_reg = 4;
+
+		__m256d x_vec[12], max_array, sign_mask;
+		v2dd_t max_hi, max_lo, sign_mask_128;
+
+		// Initializing the mask to minus zero (-0.0)
+		sign_mask = _mm256_set1_pd(-0.f);
+		sign_mask_128.v = _mm_set1_pd(-0.f);
+
+		for (i = 0; (i + 47) < n; i += 48)
 		{
-			double* chi1 = x + (i  )*incx;
+			// Load the elements into YMM registers
+			x_vec[0] = _mm256_loadu_pd(temp_ptr);
+			x_vec[1] = _mm256_loadu_pd(temp_ptr + n_elem_per_reg);
+			x_vec[2] = _mm256_loadu_pd(temp_ptr + 2 * n_elem_per_reg);
+			x_vec[3] = _mm256_loadu_pd(temp_ptr + 3 * n_elem_per_reg);
+
+			/*
+				Calculate the absolute value of the elements
+				and store it in the same vectors
+			*/
+			x_vec[0] = _mm256_andnot_pd(sign_mask, x_vec[0]);
+			x_vec[1] = _mm256_andnot_pd(sign_mask, x_vec[1]);
+			x_vec[2] = _mm256_andnot_pd(sign_mask, x_vec[2]);
+			x_vec[3] = _mm256_andnot_pd(sign_mask, x_vec[3]);
+
+			/*
+				Find the largest element in the corresponding
+				vector indices for the given set of 256-bit vectors
+			*/
+			x_vec[0] = _mm256_max_pd(x_vec[0], x_vec[1]);
+			x_vec[2] = _mm256_max_pd(x_vec[2], x_vec[3]);
+			x_vec[0] = _mm256_max_pd(x_vec[0], x_vec[2]);
+
+			x_vec[4] = _mm256_loadu_pd(temp_ptr + 4 * n_elem_per_reg);
+			x_vec[5] = _mm256_loadu_pd(temp_ptr + 5 * n_elem_per_reg);
+			x_vec[6] = _mm256_loadu_pd(temp_ptr + 6 * n_elem_per_reg);
+			x_vec[7] = _mm256_loadu_pd(temp_ptr + 7 * n_elem_per_reg);
+
+			x_vec[4] = _mm256_andnot_pd(sign_mask, x_vec[4]);
+			x_vec[5] = _mm256_andnot_pd(sign_mask, x_vec[5]);
+			x_vec[6] = _mm256_andnot_pd(sign_mask, x_vec[6]);
+			x_vec[7] = _mm256_andnot_pd(sign_mask, x_vec[7]);
+
+			x_vec[4] = _mm256_max_pd(x_vec[4], x_vec[5]);
+			x_vec[6] = _mm256_max_pd(x_vec[6], x_vec[7]);
+			x_vec[4] = _mm256_max_pd(x_vec[4], x_vec[6]);
+
+			x_vec[8] = _mm256_loadu_pd(temp_ptr + 8 * n_elem_per_reg);
+			x_vec[9] = _mm256_loadu_pd(temp_ptr + 9 * n_elem_per_reg);
+			x_vec[10] = _mm256_loadu_pd(temp_ptr + 10 * n_elem_per_reg);
+			x_vec[11] = _mm256_loadu_pd(temp_ptr + 11 * n_elem_per_reg);
+
+			x_vec[8] = _mm256_andnot_pd(sign_mask, x_vec[8]);
+			x_vec[9] = _mm256_andnot_pd(sign_mask, x_vec[9]);
+			x_vec[10] = _mm256_andnot_pd(sign_mask, x_vec[10]);
+			x_vec[11] = _mm256_andnot_pd(sign_mask, x_vec[11]);
+
+			x_vec[8] = _mm256_max_pd(x_vec[8], x_vec[9]);
+			x_vec[10] = _mm256_max_pd(x_vec[10], x_vec[11]);
+			x_vec[8] = _mm256_max_pd(x_vec[10], x_vec[8]);
+
+			max_array = _mm256_max_pd(x_vec[0], x_vec[4]);
+
+			/*
+				max_array holds the largest element in
+				the corresponding vector indices
+			*/
+			max_array = _mm256_max_pd(max_array, x_vec[8]);
+
+			// Extract the higher and lower 128-bit from the max_array
+			max_hi.v = _mm256_extractf128_pd(max_array, 1);
+			max_lo.v = _mm256_extractf128_pd(max_array, 0);
+
+			/*
+				Find the largest element in the corresponding
+				vector indices for the given set of 128-bit vectors
+			*/
+			max_hi.v = _mm_max_pd(max_hi.v, max_lo.v);
+
+			/*
+				Find the largest element in the 128-bit vector
+				and store it in temp_max_val
+			*/
+			_mm_vec_max_pd(max_hi.d, &temp_max_val);
+
+			/*
+				If the new max value found is greater than the previous
+				max value, update the range and the largest value.
+			*/
+			if (curr_max_val < temp_max_val)
+			{
+				window_start = i;
+				window_end = window_start + (12 * n_elem_per_reg);
 
-			/* Get the real and imaginary components of chi1. */
-			chi1_r = *chi1;
+				curr_max_val = temp_max_val;
+			}
 
-			/* Replace chi1_r and chi1_i with their absolute values. */
-			chi1_r = fabs( chi1_r );
+			// Increment the pointer
+			temp_ptr += 12 * n_elem_per_reg;
+		}
 
-			/* Add the real and imaginary absolute values together. */
-			abs_chi1 = chi1_r;
+		for (; (i + 31) < n; i += 32)
+		{
+			x_vec[0] = _mm256_loadu_pd(temp_ptr);
+			x_vec[1] = _mm256_loadu_pd(temp_ptr + n_elem_per_reg);
+			x_vec[2] = _mm256_loadu_pd(temp_ptr + 2 * n_elem_per_reg);
+			x_vec[3] = _mm256_loadu_pd(temp_ptr + 3 * n_elem_per_reg);
 
-			/* If the absolute value of the current element exceeds that of
-			   the previous largest, save it and its index. If NaN is
-			   encountered, then treat it the same as if it were a valid
-			   value that was smaller than any previously seen. This
-			   behavior mimics that of LAPACK's i?amax(). */
-			if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
+			x_vec[0] = _mm256_andnot_pd(sign_mask, x_vec[0]);
+			x_vec[1] = _mm256_andnot_pd(sign_mask, x_vec[1]);
+			x_vec[2] = _mm256_andnot_pd(sign_mask, x_vec[2]);
+			x_vec[3] = _mm256_andnot_pd(sign_mask, x_vec[3]);
+
+			x_vec[0] = _mm256_max_pd(x_vec[0], x_vec[1]);
+			x_vec[2] = _mm256_max_pd(x_vec[2], x_vec[3]);
+			x_vec[0] = _mm256_max_pd(x_vec[0], x_vec[2]);
+
+			x_vec[4] = _mm256_loadu_pd(temp_ptr + 4 * n_elem_per_reg);
+			x_vec[5] = _mm256_loadu_pd(temp_ptr + 5 * n_elem_per_reg);
+			x_vec[6] = _mm256_loadu_pd(temp_ptr + 6 * n_elem_per_reg);
+			x_vec[7] = _mm256_loadu_pd(temp_ptr + 7 * n_elem_per_reg);
+
+			x_vec[4] = _mm256_andnot_pd(sign_mask, x_vec[4]);
+			x_vec[5] = _mm256_andnot_pd(sign_mask, x_vec[5]);
+			x_vec[6] = _mm256_andnot_pd(sign_mask, x_vec[6]);
+			x_vec[7] = _mm256_andnot_pd(sign_mask, x_vec[7]);
+
+			x_vec[4] = _mm256_max_pd(x_vec[4], x_vec[5]);
+			x_vec[6] = _mm256_max_pd(x_vec[6], x_vec[7]);
+			x_vec[4] = _mm256_max_pd(x_vec[4], x_vec[6]);
+
+			max_array = _mm256_max_pd(x_vec[0], x_vec[4]);
+
+			max_hi.v = _mm256_extractf128_pd(max_array, 1);
+			max_lo.v = _mm256_extractf128_pd(max_array, 0);
+
+			max_hi.v = _mm_max_pd(max_hi.v, max_lo.v);
+
+			_mm_vec_max_pd(max_hi.d, &temp_max_val);
+
+			if (curr_max_val < temp_max_val)
 			{
-				abs_chi1_max = abs_chi1;
-				i_max_l      = i;
+				window_start = i;
+				window_end = window_start + (8 * n_elem_per_reg);
+
+				curr_max_val = temp_max_val;
 			}
+
+			temp_ptr += 8 * n_elem_per_reg;
 		}
-	}
-	else
-	{
-		dim_t  n_iter, n_left;
-		dim_t  num_vec_elements = 4;
-		v4df_t x_vec, max_vec, maxInx_vec, mask_vec;
-		v4df_t idx_vec, inc_vec;
-		v4df_t sign_mask;
 
-		v2dd_t max_vec_lo, max_vec_hi, mask_vec_lo;
-		v2dd_t maxInx_vec_lo, maxInx_vec_hi;
+		for (; (i + 15) < n; i += 16)
+		{
+			x_vec[0] = _mm256_loadu_pd(temp_ptr);
+			x_vec[1] = _mm256_loadu_pd(temp_ptr + n_elem_per_reg);
+			x_vec[2] = _mm256_loadu_pd(temp_ptr + 2 * n_elem_per_reg);
+			x_vec[3] = _mm256_loadu_pd(temp_ptr + 3 * n_elem_per_reg);
 
-		n_iter = n / num_vec_elements;
-		n_left = n % num_vec_elements;
+			x_vec[0] = _mm256_andnot_pd(sign_mask, x_vec[0]);
+			x_vec[1] = _mm256_andnot_pd(sign_mask, x_vec[1]);
+			x_vec[2] = _mm256_andnot_pd(sign_mask, x_vec[2]);
+			x_vec[3] = _mm256_andnot_pd(sign_mask, x_vec[3]);
 
-		idx_vec.v    = _mm256_set_pd( 3, 2, 1, 0 );
-		inc_vec.v    = _mm256_set1_pd( 4 );
-		max_vec.v    = _mm256_set1_pd( -1 );
-		maxInx_vec.v = _mm256_setzero_pd();
-		sign_mask.v  = _mm256_set1_pd( -0.f );
+			x_vec[0] = _mm256_max_pd(x_vec[0], x_vec[1]);
+			x_vec[2] = _mm256_max_pd(x_vec[2], x_vec[3]);
 
-		for ( i = 0; i < n_iter; ++i )
+			max_array = _mm256_max_pd(x_vec[0], x_vec[2]);
+
+			max_hi.v = _mm256_extractf128_pd(max_array, 1);
+			max_lo.v = _mm256_extractf128_pd(max_array, 0);
+
+			max_hi.v = _mm_max_pd(max_hi.v, max_lo.v);
+
+			_mm_vec_max_pd(max_hi.d, &temp_max_val);
+
+			if (curr_max_val < temp_max_val)
+			{
+				window_start = i;
+				window_end = window_start + (4 * n_elem_per_reg);
+
+				curr_max_val = temp_max_val;
+			}
+
+			temp_ptr += 4 * n_elem_per_reg;
+		}
+
+		for (; (i + 7) < n; i += 8)
 		{
-			x_vec.v      = _mm256_loadu_pd( x );
+			x_vec[0] = _mm256_loadu_pd(temp_ptr);
+			x_vec[1] = _mm256_loadu_pd(temp_ptr + n_elem_per_reg);
 
-			// Get the absolute value of the vector element.
-			x_vec.v      = _mm256_andnot_pd( sign_mask.v, x_vec.v );
+			x_vec[0] = _mm256_andnot_pd(sign_mask, x_vec[0]);
+			x_vec[1] = _mm256_andnot_pd(sign_mask, x_vec[1]);
 
-			mask_vec.v   = CMP256( d, x_vec.v, max_vec.v );
+			max_array = _mm256_max_pd(x_vec[0], x_vec[1]);
 
-			max_vec.v    = _mm256_blendv_pd( max_vec.v, x_vec.v, mask_vec.v );
-			maxInx_vec.v = _mm256_blendv_pd( maxInx_vec.v, idx_vec.v, mask_vec.v );
+			max_hi.v = _mm256_extractf128_pd(max_array, 1);
+			max_lo.v = _mm256_extractf128_pd(max_array, 0);
 
-			idx_vec.v += inc_vec.v;
-			x         += num_vec_elements;
+			max_hi.v = _mm_max_pd(max_hi.v, max_lo.v);
+
+			_mm_vec_max_pd(max_hi.d, &temp_max_val);
+
+			if (curr_max_val < temp_max_val)
+			{
+				window_start = i;
+				window_end = window_start + (2 * n_elem_per_reg);
+
+				curr_max_val = temp_max_val;
+			}
+
+			temp_ptr += 2 * n_elem_per_reg;
 		}
 
-		max_vec_lo.v    = _mm256_extractf128_pd( max_vec.v, 0 );
-		max_vec_hi.v    = _mm256_extractf128_pd( max_vec.v, 1 );
-		maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 );
-		maxInx_vec_hi.v = _mm256_extractf128_pd( maxInx_vec.v, 1 );
-		
-		mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
+		for (; (i + 3) < n; i += 4)
+		{
+			max_array = _mm256_loadu_pd(temp_ptr);
+			max_array = _mm256_andnot_pd(sign_mask, max_array);
 
-		max_vec_lo.v    = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
-		maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
-		
-		max_vec_hi.v    = _mm_permute_pd( max_vec_lo.v, 1 );
-		maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 );
-		
-		mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v );
+			max_hi.v = _mm256_extractf128_pd(max_array, 1);
+			max_lo.v = _mm256_extractf128_pd(max_array, 0);
 
-		max_vec_lo.v    = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v );
-		maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v );
+			max_hi.v = _mm_max_pd(max_hi.v, max_lo.v);
 
-		abs_chi1_max = max_vec_lo.d[0];
-		i_max_l      = maxInx_vec_lo.d[0];
+			_mm_vec_max_pd(max_hi.d, &temp_max_val);
 
-		for ( i = n - n_left; i < n; i++ )
+			if (curr_max_val < temp_max_val)
+			{
+				window_start = i;
+				window_end = window_start + n_elem_per_reg;
+
+				curr_max_val = temp_max_val;
+			}
+
+			temp_ptr += n_elem_per_reg;
+		}
+
+		for (; (i + 1) < n; i += 2)
 		{
-			double* chi1 = x;
+			max_hi.v = _mm_loadu_pd(temp_ptr);
+			max_hi.v = _mm_andnot_pd(sign_mask_128.v, max_hi.v);
 
-			/* Get the real and imaginary components of chi1. */
-			chi1_r = *chi1;
+			_mm_vec_max_pd(max_hi.d, &temp_max_val);
 
-			/* Replace chi1_r and chi1_i with their absolute values. */
-			abs_chi1 = fabs( chi1_r );
+			if (curr_max_val < temp_max_val)
+			{
+				window_start = i;
+				window_end = window_start + 2;
 
-			/* If the absolute value of the current element exceeds that of
-			   the previous largest, save it and its index. If NaN is
-			   encountered, return the index of the first NaN. This
-			   behavior mimics that of LAPACK's i?amax(). */
-			if ( abs_chi1_max < abs_chi1 || ( isnan( abs_chi1 ) && !isnan( abs_chi1_max ) ) )
+				curr_max_val = temp_max_val;
+			}
+
+			temp_ptr += 2;
+		}
+	}
+
+	/*
+		This loop performs the compute in two cases:
+
+		1. The complete compute when incx != 1
+		2. It processes the last element when 'n' is not a
+		   multiple of 2.
+	*/
+	for (; i < n; ++i)
+	{
+		temp_max_val = fabs(*temp_ptr);
+
+		if (temp_max_val > curr_max_val)
+		{
+			curr_max_val = temp_max_val;
+			window_end = i;
+			window_start = i;
+		}
+
+		temp_ptr += incx;
+	}
+
+	/*
+		Store the index range  in which the largest element
+		can be found in the address passed
+	*/
+	*start_index = window_start;
+	*end_index = window_end;
+
+	// Store the value of the largest element in the address passed
+	*abs_max_num = curr_max_val;
+}
+
+
+/*
+	Functionality
+	-------------
+
+	This function locates the index at which a given absolute value of an element
+	first occurs.
+
+	Function Signature
+	-------------------
+
+	This function takes a void pointer as input (internally cast to a double pointer)
+	which points to an array of type double, the corresponding array's stride and length.
+	It uses the function parameters to return the output.
+
+	* 'x' - Void pointer pointing to an array of type double
+	* 'incx' - Stride to point to the next element in the array
+	* 'n' - Length of the array passed
+	* 'element' - Double pointer to the memory of the element to be searched
+	* 'index' - Output: index at which the element first occurs (-2 if it is not found)
+
+	The function has been made static to restrict its scope.
+
+	Exception
+	----------
+
+	1. When the length of the array or the increment is zero set the index to -1.
+*/
+static void bli_vec_search_double
+(
+    const void* x, dim_t incx,
+    dim_t n,
+    double* element,
+    dim_t* index
+)
+{
+	/*
+		When the length of the array or the
+		increment is zero set the index to -1.
+	*/
+	if (bli_zero_dim1(n) || bli_zero_dim1(incx))
+	{
+		*index = -1;
+
+		return;
+	}
+
+	double *temp_ptr = (double *)x;
+
+	dim_t i = 0;
+
+	/*
+		When incx == 1 and n >= 2 the compute can be
+		vectorized using AVX-2 or SSE instructions.
+
+		This vectorization does not reduce the total
+		number of comparisons performed but vectorizes the
+		calculation of the absolute values.
+	*/
+	if (incx == 1 && n >= 2)
+	{
+		const dim_t n_elem_per_reg = 4;
+
+		__m256d x_vec, max_reg, mask_gen;
+
+		// Initializing the mask to minus zero (-0.0)
+		__m256d sign_mask = _mm256_set1_pd(-0.f);
+
+		/*
+			Set the register to the absolute
+			value of the element passed
+		*/
+		max_reg = _mm256_set1_pd(fabs(*element));
+
+		for (i = 0; (i + 3)< n; i += n_elem_per_reg)
+		{
+			// Load the array elements into the register
+			x_vec = _mm256_loadu_pd(temp_ptr);
+
+			// Calculate the absolute values of the elements
+			x_vec = _mm256_andnot_pd(sign_mask, x_vec);
+
+			/*
+				Check for equality with the absolute value
+				of the element to be searched for
+			*/
+			mask_gen = _mm256_cmp_pd(x_vec, max_reg, _CMP_EQ_OQ);
+
+			/*
+				Check if the element exists in the loaded vector
+				using the mask generated.
+
+				The mask is generated because comparison to zero is
+				a cheaper operation.
+			*/
+			for (dim_t j = 0; j < n_elem_per_reg; ++j)
 			{
-				abs_chi1_max = abs_chi1;
-				i_max_l      = i;
+				double mask_val = mask_gen[j];
+
+				if (mask_val != 0)
+				{
+					*index = i + j;
+					return;
+				}
 			}
 
-			x += 1;
+			temp_ptr += n_elem_per_reg;
+		}
+
+		/*
+			Issue vzeroupper instruction to clear upper lanes of ymm registers.
+			This avoids a performance penalty caused by false dependencies when
+			transitioning from AVX to SSE instructions (which may occur as soon
+			as the n_left cleanup loop below if BLIS is compiled with
+			-mfpmath=sse).
+		*/
+		_mm256_zeroupper();
+
+		// Perform the above compute using SSE instructions
+		__m128d x_vec_128, mask_gen_128, max_reg_128;
+
+		max_reg_128 = _mm_set1_pd(*element);
+
+		for (; (i + 1) < n; i += 2)
+		{
+			x_vec_128 = _mm_loadu_pd(temp_ptr);
+			x_vec_128 = _mm_andnot_pd(_mm_set1_pd(-0.f), x_vec_128);
+
+			mask_gen_128 = _mm_cmp_pd(x_vec_128, max_reg_128, _CMP_EQ_OQ);
+
+			for (dim_t j = 0; j < 2; ++j)
+			{
+				double mask_val = mask_gen_128[j];
+
+				if (mask_val != 0)
+				{
+					*index = i + j;
+					return;
+				}
+			}
+
+			temp_ptr += 2;
 		}
 	}
 
-	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
-	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// later, especially if BLIS is compiled with -mfpmath=sse).
-	_mm256_zeroupper();
+	/*
+		This loop performs the compute in two cases:
 
-	/* Store final index to output variable. */
-	*i_max = i_max_l;
-    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
+		1. The complete compute when incx != 1
+		2. It processes the last element when 'n' is not a
+		   multiple of 2.
+	*/
+	for (; i < n; i += 1)
+	{
+		double value = fabs(*temp_ptr);
+
+		if (value == *element)
+		{
+			*index = i;
+			return;
+		}
+
+		temp_ptr += incx;
+	}
+
+	// When the element is not found in the array, set the index to -2
+	*index = -2;
+}
+
+/*
+	Functionality
+	-------------
+
+	This function finds the index of the first element having maximum absolute value
+	with the array index starting from 0.
+
+	Function Signature
+	-------------------
+
+	This function takes a double pointer as input, the corresponding vector's stride
+	and length. It uses the function parameters to return the output.
+
+	* 'x' - Double pointer pointing to an array
+	* 'incx' - Stride to point to the next element in the array
+	* 'n' - Length of the array passed
+	* 'i_max' - Index at which the absolute largest element can be found
+	* 'cntx' - BLIS context object
+
+	Exception
+	----------
+
+	1. When the vector length is zero, return early. This directly emulates the behavior
+	   of netlib BLAS's i?amax() routines.
+
+	Deviation from BLAS
+	--------------------
+
+	1. In this function, the array index starts from 0 while in BLAS the indexing
+	   starts from 1. The deviation is expected to be handled in the BLAS layer of
+	   the library.
+
+	Undefined behaviour
+	-------------------
+
+	1. The function results in undefined behaviour when NaN elements are present in the
+	   array. This behaviour is BLAS compliant.
+*/
+void bli_damaxv_zen_int
+     (
+       dim_t            n,
+       double* restrict x, inc_t incx,
+       dim_t*  restrict i_max,
+       cntx_t* restrict cntx
+     )
+{
+	// Temporary pointer used inside the function
+	double *x_temp = x;
+
+	// Will hold the absolute largest element in the array
+	double max_val;
+
+	/*
+		Holds the index range in which the absolute
+		largest element first occurs
+	*/
+	dim_t start_index, end_index;
+
+	/*
+		Length of the search space where the absolute
+		largest element first occurs
+	*/
+	dim_t search_len;
+
+	/*
+		This function finds the first occurrence of the absolute largest element in a double
+		array and the range (start and end index) in which that element can be found.
+	*/
+	bli_vec_absmax_double
+	(
+		(void *)x_temp, incx, n,
+		&max_val,
+		&start_index, &end_index
+	);
+
+	// Calculate the length of the search space
+	search_len = end_index - start_index;
+
+	dim_t element_index;
+
+	if (start_index != end_index)
+	{
+		// Adjust the pointer based on the search range
+		x_temp = x + (start_index * incx);
+
+		/*
+			This function locates the index at which a given absolute
+			value of an element first occurs.
+		*/
+		bli_vec_search_double
+		(
+			(void *)x_temp, incx,
+			search_len,
+			&max_val,
+			&element_index
+		);
+
+		// Calculate the index of the absolute largest element and store it
+		element_index = start_index + element_index;
+	}
+	else
+	{
+		// Store the index of the absolute largest element
+		element_index = start_index;
+	}
+
+	// Store final index to output variable.
+	*i_max = element_index;
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
 }
diff --git a/kernels/zen/1/bli_axpbyv_zen_int.c b/kernels/zen/1/bli_axpbyv_zen_int.c
index 05ef96175a..c92d44ad3e 100644
--- a/kernels/zen/1/bli_axpbyv_zen_int.c
+++ b/kernels/zen/1/bli_axpbyv_zen_int.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -83,7 +83,7 @@ void bli_saxpbyv_zen_int
 
 	/* if the vector dimension is zero, or if alpha & beta are zero,
 	   return early. */
-	if ( bli_zero_dim1( n ) || 
+	if ( bli_zero_dim1( n ) ||
 		 ( PASTEMAC( s, eq0 )( *alpha ) && PASTEMAC( s, eq0 )( *beta ) ) )
 		 return;
 
@@ -114,7 +114,7 @@ void bli_saxpbyv_zen_int
 
 			// y := y' + alpha * x
 			y0v.v = _mm256_fmadd_ps
-					( 
+					(
 					  alphav.v,
 					  _mm256_loadu_ps( x0 + 0*n_elem_per_reg ),
 					  y0v.v
@@ -150,8 +150,8 @@ void bli_saxpbyv_zen_int
 		
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 
@@ -181,7 +181,7 @@ void bli_saxpbyv_zen_int
 /**
  * daxpbyv kernel performs the axpbyv operation.
  * y := beta * y + alpha * conjx(x)
- * where, 
+ * where,
  * 		x & y are double precision vectors of length n.
  * 		alpha & beta are scalers.
  */
@@ -211,7 +211,7 @@ void bli_daxpbyv_zen_int
 
 	/* if the vector dimension is zero, or if alpha & beta are zero,
 	   return early. */
-	if ( bli_zero_dim1( n ) || 
+	if ( bli_zero_dim1( n ) ||
 		 ( PASTEMAC( s, eq0 )( *alpha ) && PASTEMAC( s, eq0 )( *beta ) ) )
 	{
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
@@ -282,8 +282,8 @@ void bli_daxpbyv_zen_int
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 
@@ -312,7 +312,7 @@ void bli_daxpbyv_zen_int
 /**
  * caxpbyv kernel performs the axpbyv operation.
  * y := beta * y + alpha * conjx(x)
- * where, 
+ * where,
  * 		x & y are simple complex vectors of length n.
  * 		alpha & beta are scalers.
  */
@@ -349,7 +349,7 @@ void bli_caxpbyv_zen_int
 	
 	/* if the vector dimension is zero, or if alpha & beta are zero,
 	   return early. */
-	if ( bli_zero_dim1( n ) || 
+	if ( bli_zero_dim1( n ) ||
 		 ( PASTEMAC( c, eq0 )( *alpha ) && PASTEMAC( c, eq0 )( *beta ) ) )
 	{
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
@@ -371,7 +371,7 @@ void bli_caxpbyv_zen_int
 		// y = beta*y + alpha*x
 		// y = ( bR + ibI ) * ( yR + iyI ) + ( aR + iaI ) * ( xR + ixI )
 		// y = bR.yR + ibR.yI + ibI.yR - ibIyI + aR.xR + iaR.xI + iaI.xR - aI.xI
-		// y =   ( bR.yR - bI.yI + aR.xR - aI.xI ) + 
+		// y =   ( bR.yR - bI.yI + aR.xR - aI.xI ) +
 		//	   i ( bR.yI + bI.yR + aR.xI + aI.xR )
 
 		// SIMD Algorithm BLIS_NO_CONJUGATE
@@ -424,8 +424,8 @@ void bli_caxpbyv_zen_int
 			// betaIv  = -bI   bI  -bI   bI  -bI   bI  -bI   bI
 			alphaRv = _mm256_broadcast_ss( &alphaR );
 			alphaIv = _mm256_set_ps
-					  ( 
-						alphaI, -alphaI, alphaI, -alphaI, 
+					  (
+						alphaI, -alphaI, alphaI, -alphaI,
 					    alphaI, -alphaI, alphaI, -alphaI
 					  );
 			betaRv  = _mm256_broadcast_ss( &betaR );
@@ -505,10 +505,15 @@ void bli_caxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_ps( alphaIv, xv[0], yv[0] );
-			yv[1] = _mm256_fmadd_ps( alphaIv, xv[1], yv[1] );
-			yv[2] = _mm256_fmadd_ps( alphaIv, xv[2], yv[2] );
-			yv[3] = _mm256_fmadd_ps( alphaIv, xv[3], yv[3] );
+			iv[0] = _mm256_fmadd_ps( alphaIv, xv[0], iv[0] );
+			iv[1] = _mm256_fmadd_ps( alphaIv, xv[1], iv[1] );
+			iv[2] = _mm256_fmadd_ps( alphaIv, xv[2], iv[2] );
+			iv[3] = _mm256_fmadd_ps( alphaIv, xv[3], iv[3] );
+
+			yv[0] = _mm256_add_ps( yv[0], iv[0] );
+			yv[1] = _mm256_add_ps( yv[1], iv[1] );
+			yv[2] = _mm256_add_ps( yv[2], iv[2] );
+			yv[3] = _mm256_add_ps( yv[3], iv[3] );
 
 			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), yv[0] );
 			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), yv[1] );
@@ -562,9 +567,13 @@ void bli_caxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_ps( alphaIv, xv[0], yv[0] );
-			yv[1] = _mm256_fmadd_ps( alphaIv, xv[1], yv[1] );
-			yv[2] = _mm256_fmadd_ps( alphaIv, xv[2], yv[2] );
+			iv[0] = _mm256_fmadd_ps( alphaIv, xv[0], iv[0] );
+			iv[1] = _mm256_fmadd_ps( alphaIv, xv[1], iv[1] );
+			iv[2] = _mm256_fmadd_ps( alphaIv, xv[2], iv[2] );
+
+			yv[0] = _mm256_add_ps( yv[0], iv[0] );
+			yv[1] = _mm256_add_ps( yv[1], iv[1] );
+			yv[2] = _mm256_add_ps( yv[2], iv[2] );
 
 			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), yv[0] );
 			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), yv[1] );
@@ -610,8 +619,11 @@ void bli_caxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_ps( alphaIv, xv[0], yv[0] );
-			yv[1] = _mm256_fmadd_ps( alphaIv, xv[1], yv[1] );
+			iv[0] = _mm256_fmadd_ps( alphaIv, xv[0], iv[0] );
+			iv[1] = _mm256_fmadd_ps( alphaIv, xv[1], iv[1] );
+
+			yv[0] = _mm256_add_ps( yv[0], iv[0] );
+			yv[1] = _mm256_add_ps( yv[1], iv[1] );
 
 			_mm256_storeu_ps( (y0 + 0*n_elem_per_reg), yv[0] );
 			_mm256_storeu_ps( (y0 + 1*n_elem_per_reg), yv[1] );
@@ -622,8 +634,8 @@ void bli_caxpbyv_zen_int
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 
@@ -631,9 +643,12 @@ void bli_caxpbyv_zen_int
 		{
 			for ( ; i < n ; ++i )
 			{
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) + 
+				const float yRc = *y0;
+				const float yIc = *( y0 + 1 );
+
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) - ( alphaI * (*(x0 + 1)) );
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) + 
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) +
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += 2;
@@ -644,9 +659,12 @@ void bli_caxpbyv_zen_int
 		{
 			for ( ; i < n ; ++i )
 			{
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) + 
+				const float yRc = *y0;
+				const float yIc = *( y0 + 1 );
+
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) + ( alphaI * (*(x0 + 1)) );
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) - 
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) -
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += 2;
@@ -661,11 +679,14 @@ void bli_caxpbyv_zen_int
 		{
 			for ( i = 0; i < n ; ++i )
 			{
+				const float yRc = *y0;
+				const float yIc = *( y0 + 1 );
+
 				// yReal = ( bR.yR - bI.yI + aR.xR - aI.xI )
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) +
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) - ( alphaI * (*(x0 + 1)) );
 				// yImag = ( bR.yI + bI.yR + aR.xI + aI.xR )
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) +
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) +
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += incx * 2;
@@ -676,11 +697,14 @@ void bli_caxpbyv_zen_int
 		{
 			for ( i = 0; i < n ; ++i )
 			{
+				const float yRc = *y0;
+				const float yIc = *( y0 + 1 );
+
 				// yReal = ( bR.yR - bI.yI + aR.xR - aI.xI )
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) +
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) + ( alphaI * (*(x0 + 1)) );
 				// yImag = ( bR.yI + bI.yR + aR.xI + aI.xR )
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) -
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) -
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += incx * 2;
@@ -694,7 +718,7 @@ void bli_caxpbyv_zen_int
 /**
  * zaxpbyv kernel performs the axpbyv operation.
  * y := beta * y + alpha * conjx(x)
- * where, 
+ * where,
  * 		x & y are double complex vectors of length n.
  * 		alpha & beta are scalers.
  */
@@ -731,7 +755,7 @@ void bli_zaxpbyv_zen_int
 	
 	/* if the vector dimension is zero, or if alpha & beta are zero,
 	   return early. */
-	if ( bli_zero_dim1( n ) || 
+	if ( bli_zero_dim1( n ) ||
 		 ( PASTEMAC( c, eq0 )( *alpha ) && PASTEMAC( c, eq0 )( *beta ) ) )
 	{
 		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
@@ -753,7 +777,7 @@ void bli_zaxpbyv_zen_int
 		// y = beta*y + alpha*x
 		// y = ( bR + ibI ) * ( yR + iyI ) + ( aR + iaI ) * ( xR + ixI )
 		// y = bR.yR + ibR.yI + ibI.yR - ibIyI + aR.xR + iaR.xI + iaI.xR - aI.xI
-		// y = 	 ( bR.yR - bI.yI + aR.xR - aI.xI ) + 
+		// y = 	 ( bR.yR - bI.yI + aR.xR - aI.xI ) +
 		//	   i ( bR.yI + bI.yR + aR.xI + aI.xR )
 
 		// SIMD Algorithm BLIS_NO_CONJUGATE
@@ -761,10 +785,10 @@ void bli_zaxpbyv_zen_int
 		// yv' =  yI1  yR1  yI2  yR2
 		// xv  =  xR1  xI1  xR2  xI2
 		// xv' =  xI1  xR1  xI2  xR2
-		// arv =  aR   aR   aR   aR 
-		// aiv = -aI   aI  -aI   aI 
-		// brv =  bR   bR   bR   bR 
-		// biv = -bI   bI  -bI   bI 
+		// arv =  aR   aR   aR   aR
+		// aiv = -aI   aI  -aI   aI
+		// brv =  bR   bR   bR   bR
+		// biv = -bI   bI  -bI   bI
 		//
 		// step 1: iv = brv * iv
 		// step 2: shuffle yv -> yv'
@@ -785,10 +809,10 @@ void bli_zaxpbyv_zen_int
 		// yv' =  yI1  yR1  yI2  yR2
 		// xv  =  xR1  xI1  xR2  xI2
 		// xv' =  xI1  xR1  xI2  xR2
-		// arv =  aR  -aR   aR  -aR 
-		// aiv =  aI   aI   aI   aI 
-		// brv =  bR   bR   bR   bR 
-		// biv = -bI   bI  -bI   bI 
+		// arv =  aR  -aR   aR  -aR
+		// aiv =  aI   aI   aI   aI
+		// brv =  bR   bR   bR   bR
+		// biv = -bI   bI  -bI   bI
 		//
 		// step 1: iv = brv * iv
 		// step 2: shuffle yv -> yv'
@@ -871,10 +895,15 @@ void bli_zaxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_pd( alphaIv, xv[0], yv[0] );
-			yv[1] = _mm256_fmadd_pd( alphaIv, xv[1], yv[1] );
-			yv[2] = _mm256_fmadd_pd( alphaIv, xv[2], yv[2] );
-			yv[3] = _mm256_fmadd_pd( alphaIv, xv[3], yv[3] );
+			iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] );
+			iv[1] = _mm256_fmadd_pd( alphaIv, xv[1], iv[1] );
+			iv[2] = _mm256_fmadd_pd( alphaIv, xv[2], iv[2] );
+			iv[3] = _mm256_fmadd_pd( alphaIv, xv[3], iv[3] );
+
+			yv[0] = _mm256_add_pd( yv[0], iv[0] );
+			yv[1] = _mm256_add_pd( yv[1], iv[1] );
+			yv[2] = _mm256_add_pd( yv[2], iv[2] );
+			yv[3] = _mm256_add_pd( yv[3], iv[3] );
 
 			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
 			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] );
@@ -928,9 +957,13 @@ void bli_zaxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_pd( alphaIv, xv[0], yv[0] );
-			yv[1] = _mm256_fmadd_pd( alphaIv, xv[1], yv[1] );
-			yv[2] = _mm256_fmadd_pd( alphaIv, xv[2], yv[2] );
+			iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] );
+			iv[1] = _mm256_fmadd_pd( alphaIv, xv[1], iv[1] );
+			iv[2] = _mm256_fmadd_pd( alphaIv, xv[2], iv[2] );
+
+			yv[0] = _mm256_add_pd( yv[0], iv[0] );
+			yv[1] = _mm256_add_pd( yv[1], iv[1] );
+			yv[2] = _mm256_add_pd( yv[2], iv[2] );
 
 			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
 			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] );
@@ -976,8 +1009,11 @@ void bli_zaxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_pd( alphaIv, xv[0], yv[0] );
-			yv[1] = _mm256_fmadd_pd( alphaIv, xv[1], yv[1] );
+			iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] );
+			iv[1] = _mm256_fmadd_pd( alphaIv, xv[1], iv[1] );
+
+			yv[0] = _mm256_add_pd( yv[0], iv[0] );
+			yv[1] = _mm256_add_pd( yv[1], iv[1] );
 
 			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
 			_mm256_storeu_pd( (y0 + 1*n_elem_per_reg), yv[1] );
@@ -1015,7 +1051,9 @@ void bli_zaxpbyv_zen_int
 
 			// yv = alphaIv * xv + yv
 			//    = yR1.bR - yR1.bI - xR1.aI, yI1.bR + yI1.bI + xI1.aI, ...
-			yv[0] = _mm256_fmadd_pd( alphaIv, xv[0], yv[0] );
+			iv[0] = _mm256_fmadd_pd( alphaIv, xv[0], iv[0] );
+
+			yv[0] = _mm256_add_pd( yv[0], iv[0] );
 
 			_mm256_storeu_pd( (y0 + 0*n_elem_per_reg), yv[0] );
 
@@ -1025,8 +1063,8 @@ void bli_zaxpbyv_zen_int
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 
@@ -1034,11 +1072,14 @@ void bli_zaxpbyv_zen_int
 		{
 			for ( ; i < n ; ++i )
 			{
+				const double yRc = *y0;
+				const double yIc = *( y0 + 1 );
+
 				// yReal  = ( bR.yR - bI.yI + aR.xR - aI.xI )
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) +
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) - ( alphaI * (*(x0 + 1)) );
 				// yImag  = ( bR.yI + bI.yR + aR.xI + aI.xR )
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) +
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) +
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += 2;
@@ -1049,11 +1090,14 @@ void bli_zaxpbyv_zen_int
 		{
 			for ( ; i < n ; ++i )
 			{
+				const double yRc = *y0;
+				const double yIc = *( y0 + 1 );
+
 				// yReal  = ( bR.yR - bI.yI + aR.xR - aI.xI )
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) +
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) + ( alphaI * (*(x0 + 1)) );
 				// yImag  = ( bR.yI + bI.yR + aR.xI + aI.xR )
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) -
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) -
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += 2;
@@ -1068,11 +1112,14 @@ void bli_zaxpbyv_zen_int
 		{
 			for ( i = 0; i < n ; ++i )
 			{
+				const double yRc = *y0;
+				const double yIc = *( y0 + 1 );
+
 				// yReal  = ( bR.yR - bI.yI + aR.xR - aI.xI )
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) +
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) - ( alphaI * (*(x0 + 1)) );
 				// yImag  = ( bR.yI + bI.yR + aR.xI + aI.xR )
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) +
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) +
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += incx * 2;
@@ -1083,11 +1130,14 @@ void bli_zaxpbyv_zen_int
 		{
 			for ( i = 0; i < n ; ++i )
 			{
+				const double yRc = *y0;
+				const double yIc = *( y0 + 1 );
+
 				// yReal  = ( bR.yR - bI.yI + aR.xR - aI.xI )
-				*y0       = ( betaR  * (*y0) ) - ( betaI  * (*(y0 + 1)) ) +
+				*y0       = ( betaR  * yRc ) - ( betaI  * yIc ) +
 							( alphaR * (*x0) ) + ( alphaI * (*(x0 + 1)) );
 				// yImag  = ( bR.yI + bI.yR + aR.xI + aI.xR )
-				*(y0 + 1) = ( betaR  * (*(y0 + 1)) ) + ( betaI  * (*y0) ) -
+				*(y0 + 1) = ( betaR  * yIc ) + ( betaI  * yRc ) -
 							( alphaR * (*(x0 + 1)) ) + ( alphaI * (*x0) );
 
 				x0 += incx * 2;
@@ -1096,4 +1146,4 @@ void bli_zaxpbyv_zen_int
 		}
 	}
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
-}
\ No newline at end of file
+}
diff --git a/kernels/zen/1/bli_axpbyv_zen_int10.c b/kernels/zen/1/bli_axpbyv_zen_int10.c
index bbfdaf0d6a..787f325ba3 100644
--- a/kernels/zen/1/bli_axpbyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpbyv_zen_int10.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -374,8 +374,8 @@ void bli_saxpbyv_zen_int10
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 
@@ -680,8 +680,8 @@ void bli_daxpbyv_zen_int10
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// as soon as the n_left cleanup loop below if BLIS is compiled with
+		// transitioning from AVX to SSE instructions (which may occur as soon
+		// as the n_left cleanup loop below if BLIS is compiled with
 		// -mfpmath=sse).
 		_mm256_zeroupper();
 
@@ -706,4 +706,4 @@ void bli_daxpbyv_zen_int10
 		}
 	}
 	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
-}
\ No newline at end of file
+}
diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c
index 686580b290..2b1a738da7 100644
--- a/kernels/zen/1/bli_axpyv_zen_int.c
+++ b/kernels/zen/1/bli_axpyv_zen_int.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -138,8 +138,8 @@ void bli_saxpyv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
@@ -242,8 +242,8 @@ void bli_daxpyv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c
index 4ef6981cd7..cc52b3dff7 100644
--- a/kernels/zen/1/bli_axpyv_zen_int10.c
+++ b/kernels/zen/1/bli_axpyv_zen_int10.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016 - 2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018 - 2020, The University of Texas at Austin. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
@@ -307,9 +307,10 @@ void bli_saxpyv_zen_int10
 
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
         // -mfpmath=sse).
+
         _mm256_zeroupper();
 
         for ( ; (i + 0) < n; i += 1 )
@@ -583,8 +584,8 @@ void bli_daxpyv_zen_int10
 
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
         // -mfpmath=sse).
         _mm256_zeroupper();
 
@@ -638,8 +639,8 @@ void bli_caxpyv_zen_int5
     float alphaR, alphaI;
 
     //scomplex alpha => aR + aI i
-    __m256           alphaRv;            // for braodcast vector aR (real part of alpha)
-    __m256           alphaIv;            // for braodcast vector aI (imaginary part of alpha)
+    __m256           alphaRv;            // for broadcast vector aR (real part of alpha)
+    __m256           alphaIv;            // for broadcast vector aI (imaginary part of alpha)
     __m256           xv[10];
     __m256           xShufv[10];
     __m256           yv[10];
@@ -837,8 +838,8 @@ void bli_caxpyv_zen_int5
 
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
         // -mfpmath=sse).
         _mm256_zeroupper();
 
@@ -927,58 +928,57 @@ void bli_zaxpyv_zen_int5
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4)
 
-    const dim_t      n_elem_per_reg = 4;
-
-    dim_t            i;
-
-    double*  restrict x0;
-    double*  restrict y0;
-    double*  restrict alpha0;
-
-    double alphaR, alphaI;
-
-    __m256d           alphaRv;            // for braodcast vector aR (real part of alpha)
-    __m256d           alphaIv;            // for braodcast vector aI (imaginary part of alpha)
-    __m256d           xv[5];
-    __m256d           xShufv[5];
-    __m256d           yv[5];
-
-    conj_t conjx_use = conjx;
-
-     // If the vector dimension is zero, or if alpha is zero, return early.
+    // If the vector dimension is zero, or if alpha is zero, return early.
     if ( bli_zero_dim1( n ) || PASTEMAC(z,eq0)( *alpha ) )
     {
          AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
          return;
     }
 
+    dim_t            i = 0;
+
     // Initialize local pointers.
-    x0 = (double*)x;
-    y0 = (double*)y;
-    alpha0 = (double*)alpha;
+    double* x0 = (double*)x;
+    double* y0 = (double*)y;
 
-    alphaR = alpha->real;
-    alphaI = alpha->imag;
+    double alphaR = alpha->real;
+    double alphaI = alpha->imag;
 
     if ( incx == 1 && incy == 1 )
     {
+        const dim_t n_elem_per_reg = 4;
+
+        __m256d alphaRv; // for broadcast vector aR (real part of alpha)
+        __m256d alphaIv; // for broadcast vector aI (imaginary part of alpha)
+        __m256d xv[7]; // Holds the X vector elements
+        __m256d xShufv[5]; // Holds the permuted X vector elements
+        __m256d yv[7]; // Holds the y vector elements
+
+        // Prefetch distance used in the kernel based on number of cycles
+        // In this case, 16 cycles
+        const dim_t distance = 16;
+
+        // Prefetch X vector into the cache hierarchy (hint _MM_HINT_T1: L2 and
+        // higher, not L1), as these elements will be needed anyway
+        _mm_prefetch(x0, _MM_HINT_T1);
+
         // Broadcast the alpha scalar to all elements of a vector register.
-        if ( !bli_is_conj (conjx) ) // If BLIS_NO_CONJUGATE
+        if (bli_is_noconj(conjx)) // If BLIS_NO_CONJUGATE
         {
-            alphaRv      = _mm256_broadcast_sd( &alphaR );
+            alphaRv = _mm256_broadcast_sd(&alphaR);
 
             alphaIv[0] = -alphaI;
-            alphaIv[1] =  alphaI;
+            alphaIv[1] = alphaI;
             alphaIv[2] = -alphaI;
-            alphaIv[3] =  alphaI;
+            alphaIv[3] = alphaI;
         }
         else
         {
-            alphaIv = _mm256_broadcast_sd( &alphaI );
+            alphaIv = _mm256_broadcast_sd(&alphaI);
 
-            alphaRv[0] =  alphaR;
+            alphaRv[0] = alphaR;
             alphaRv[1] = -alphaR;
-            alphaRv[2] =  alphaR;
+            alphaRv[2] = alphaR;
             alphaRv[3] = -alphaR;
         }
 
@@ -1023,7 +1023,77 @@ void bli_zaxpyv_zen_int5
         // step 3 : fma :yv = ai*xv' + yv (old)
         //               yv = ai*xv' + ar*xv + yv
 
-        for ( i = 0; (i + 9) < n; i += 10 )
+        for (i = 0; (i + 13) < n; i += 14)
+        {
+            // 14 elements will be processed per loop; 14 FMAs will run per loop.
+
+            // alphaRv = aR   aR   aR   aR
+            // xv      = xR1  xI1  xR2  xI2
+            xv[0] = _mm256_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm256_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm256_loadu_pd(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm256_loadu_pd(x0 + 3 * n_elem_per_reg);
+            xv[4] = _mm256_loadu_pd(x0 + 4 * n_elem_per_reg);
+            xv[5] = _mm256_loadu_pd(x0 + 5 * n_elem_per_reg);
+            xv[6] = _mm256_loadu_pd(x0 + 6 * n_elem_per_reg);
+
+            // yv    =  yR1  yI1  yR2  yI2
+            yv[0] = _mm256_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm256_loadu_pd(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm256_loadu_pd(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm256_loadu_pd(y0 + 3 * n_elem_per_reg);
+            yv[4] = _mm256_loadu_pd(y0 + 4 * n_elem_per_reg);
+            yv[5] = _mm256_loadu_pd(y0 + 5 * n_elem_per_reg);
+            yv[6] = _mm256_loadu_pd(y0 + 6 * n_elem_per_reg);
+
+            // yv  =  ar*xv + yv
+            //     =  aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ...
+            yv[0] = _mm256_fmadd_pd(xv[0], alphaRv, yv[0]);
+            yv[1] = _mm256_fmadd_pd(xv[1], alphaRv, yv[1]);
+            yv[2] = _mm256_fmadd_pd(xv[2], alphaRv, yv[2]);
+            yv[3] = _mm256_fmadd_pd(xv[3], alphaRv, yv[3]);
+            yv[4] = _mm256_fmadd_pd(xv[4], alphaRv, yv[4]);
+            yv[5] = _mm256_fmadd_pd(xv[5], alphaRv, yv[5]);
+            yv[6] = _mm256_fmadd_pd(xv[6], alphaRv, yv[6]);
+
+            // xv'   =  xI1  xR1  xI2  xR2
+            xv[0] = _mm256_permute_pd(xv[0], 5);
+            xv[1] = _mm256_permute_pd(xv[1], 5);
+            xv[2] = _mm256_permute_pd(xv[2], 5);
+            xv[3] = _mm256_permute_pd(xv[3], 5);
+            xv[4] = _mm256_permute_pd(xv[4], 5);
+            xv[5] = _mm256_permute_pd(xv[5], 5);
+            xv[6] = _mm256_permute_pd(xv[6], 5);
+
+            // Prefetch X and Y vectors (_MM_HINT_T1: L2 and higher, not L1)
+            _mm_prefetch(x0 + distance, _MM_HINT_T1);
+            _mm_prefetch(y0 + distance, _MM_HINT_T1);
+            // alphaIv = -aI   aI  -aI   aI
+
+            // yv  =  ai*xv' + yv (yv already holds ar*xv + yv)
+            //     = -aI.xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, ...
+            yv[0] = _mm256_fmadd_pd(xv[0], alphaIv, yv[0]);
+            yv[1] = _mm256_fmadd_pd(xv[1], alphaIv, yv[1]);
+            yv[2] = _mm256_fmadd_pd(xv[2], alphaIv, yv[2]);
+            yv[3] = _mm256_fmadd_pd(xv[3], alphaIv, yv[3]);
+            yv[4] = _mm256_fmadd_pd(xv[4], alphaIv, yv[4]);
+            yv[5] = _mm256_fmadd_pd(xv[5], alphaIv, yv[5]);
+            yv[6] = _mm256_fmadd_pd(xv[6], alphaIv, yv[6]);
+
+            // Store back the result
+            _mm256_storeu_pd((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm256_storeu_pd((y0 + 1 * n_elem_per_reg), yv[1]);
+            _mm256_storeu_pd((y0 + 2 * n_elem_per_reg), yv[2]);
+            _mm256_storeu_pd((y0 + 3 * n_elem_per_reg), yv[3]);
+            _mm256_storeu_pd((y0 + 4 * n_elem_per_reg), yv[4]);
+            _mm256_storeu_pd((y0 + 5 * n_elem_per_reg), yv[5]);
+            _mm256_storeu_pd((y0 + 6 * n_elem_per_reg), yv[6]);
+
+            x0 += 7 * n_elem_per_reg;
+            y0 += 7 * n_elem_per_reg;
+        }
+
+        for ( ; (i + 9) < n; i += 10 )
         {
             // 10 elements will be processed per loop; 10 FMAs will run per loop.
 
@@ -1079,6 +1149,48 @@ void bli_zaxpyv_zen_int5
             y0 += 5*n_elem_per_reg;
         }
 
+        for (; (i + 5) < n; i += 6)
+        {
+            // alphaRv = aR   aR   aR   aR
+            // xv      = xR1  xI1  xR2  xI2
+            xv[0] = _mm256_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm256_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm256_loadu_pd(x0 + 2 * n_elem_per_reg);
+
+            // yv    =  yR1  yI1  yR2  yI2
+            yv[0] = _mm256_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm256_loadu_pd(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm256_loadu_pd(y0 + 2 * n_elem_per_reg);
+
+            // xv'   =  xI1  xR1  xI2  xR2
+            xShufv[0] = _mm256_permute_pd(xv[0], 5);
+            xShufv[1] = _mm256_permute_pd(xv[1], 5);
+            xShufv[2] = _mm256_permute_pd(xv[2], 5);
+
+            // alphaIv = -aI   aI  -aI   aI
+
+            // yv  =  ar*xv + yv
+            //     =  aR.xR1 + yR1, aR.xI1 + yI1, aR.xR2 + yR2, aR.xI2 + yI2, ...
+            yv[0] = _mm256_fmadd_pd(xv[0], alphaRv, yv[0]);
+            yv[1] = _mm256_fmadd_pd(xv[1], alphaRv, yv[1]);
+            yv[2] = _mm256_fmadd_pd(xv[2], alphaRv, yv[2]);
+
+            // yv =  ai*xv' + yv (old)
+            // yv =  ai*xv' + ar*xv + yv
+            //    = -aI*xI1 + aR.xR1 + yR1, aI.xR1 + aR.xI1 + yI1, .........
+            yv[0] = _mm256_fmadd_pd(xShufv[0], alphaIv, yv[0]);
+            yv[1] = _mm256_fmadd_pd(xShufv[1], alphaIv, yv[1]);
+            yv[2] = _mm256_fmadd_pd(xShufv[2], alphaIv, yv[2]);
+
+            // Store back the result
+            _mm256_storeu_pd((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm256_storeu_pd((y0 + 1 * n_elem_per_reg), yv[1]);
+            _mm256_storeu_pd((y0 + 2 * n_elem_per_reg), yv[2]);
+
+            x0 += 3 * n_elem_per_reg;
+            y0 += 3 * n_elem_per_reg;
+        }
+
         for ( ; (i + 3) < n; i += 4 )
         {
             // alphaRv = aR   aR   aR   aR
@@ -1115,7 +1227,7 @@ void bli_zaxpyv_zen_int5
             y0 += 2*n_elem_per_reg;
         }
 
-        for (  ; (i + 3) < n; i += 2 )
+        for (  ; (i + 1) < n; i += 2 )
         {
             // alphaRv = aR   aR   aR   aR
             // xv      = xR1  xI1  xR2  xI2
@@ -1147,76 +1259,49 @@ void bli_zaxpyv_zen_int5
 
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
         // -mfpmath=sse).
         _mm256_zeroupper();
+    }
 
-        /* Residual values are calculated here
-        y0 += (alpha) * (x0); --> BLIS_NO_CONJUGATE
-        y0 += ( aR.xR - aIxI + yR ) + ( aR.xI + aI.xR + yI )i
+    __m128d alpha_r, alpha_i, x_vec, y_vec;
 
-        y0 += (alpha) * conjx(x0); --> BLIS_CONJUGATE
-        y0 = ( aR.xR + aIxI + yR ) + (aI.xR - aR.xI + yI)i */
+    // Broadcast the alpha scalar to all elements of a vector register.
+    if (bli_is_noconj(conjx)) // If BLIS_NO_CONJUGATE
+    {
+        alpha_r = _mm_set1_pd(alphaR);
 
-        if ( !bli_is_conj(conjx_use) ) //  BLIS_NO_CONJUGATE
-        {
-            for ( ; (i + 0) < n; i += 1 )
-            {
-                // real part: ( aR.xR - aIxI + yR )
-                *y0       += *alpha0 * (*x0) - (*(alpha0 + 1)) * (*(x0+1));
-                // img part: ( aR.xI + aI.xR + yI )
-                *(y0 + 1) += *alpha0 * (*(x0+1)) + (*(alpha0 + 1)) * (*x0);
-                x0 += 2;
-                y0 += 2;
-            }
-        }
-        else //  BLIS_CONJUGATE
-        {
-            for ( ; (i + 0) < n; i += 1 )
-            {
-                // real part: ( aR.xR + aIxI + yR )
-                *y0       += *alpha0 * (*x0) + (*(alpha0 + 1)) * (*(x0+1));
-                // img part: (  aI.xR - aR.xI + yI )
-                *(y0 + 1) += (*(alpha0 + 1)) * (*x0) - (*alpha0) * (*(x0+1));
-                x0 += 2;
-                y0 += 2;
-            }
-        }
+        alpha_i[0] = -alphaI;
+        alpha_i[1] = alphaI;
     }
     else
     {
-        const double alphar = *alpha0;
-        const double alphai = *(alpha0 + 1);
+        alpha_i = _mm_set1_pd(alphaI);
 
-        if ( !bli_is_conj(conjx_use) ) //  BLIS_NO_CONJUGATE
-        {
-            for ( i = 0; i < n; ++i )
-            {
-                const double x0c = *x0;
-                const double x1c = *( x0+1 );
+        alpha_r[0] = alphaR;
+        alpha_r[1] = -alphaR;
+    }
 
-                *y0         += alphar * x0c - alphai * x1c;
-                *(y0 + 1)   += alphar * x1c + alphai * x0c;
+    /* This loop has two functions:
+        1. Acts as the fringe case when incx == 1 and incy == 1
+        2. Performs the complete computation when incx != 1 or incy != 1
+    */
+    for (; i < n; ++i)
+    {
 
-                x0 += incx * 2;
-                y0 += incy * 2;
-            }
-        }
-        else //  BLIS_CONJUGATE
-        {
-            for ( i = 0; i < n; ++i )
-            {
-                const double x0c = *x0;
-                const double x1c = *( x0+1 );
+        x_vec = _mm_loadu_pd(x0);
+        y_vec = _mm_loadu_pd(y0);
 
-                *y0         += alphar * x0c + alphai * x1c;
-                *(y0 + 1)   += alphai * x0c - alphar * x1c;
+        y_vec = _mm_fmadd_pd(x_vec, alpha_r, y_vec);
+        x_vec = _mm_permute_pd(x_vec, 0b01);
+        y_vec = _mm_fmadd_pd(x_vec, alpha_i, y_vec);
 
-                x0 += incx * 2;
-                y0 += incy * 2;
-            }
-        }
+        _mm_storeu_pd(y0, y_vec);
+
+        x0 += incx * 2;
+        y0 += incy * 2;
     }
+
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
 }
diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c
index f429a939f2..9ffde188e8 100644
--- a/kernels/zen/1/bli_copyv_zen_int.c
+++ b/kernels/zen/1/bli_copyv_zen_int.c
@@ -1,343 +1,660 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-	- Redistributions of source code must retain the above copyright
-	  notice, this list of conditions and the following disclaimer.
-	- Redistributions in binary form must reproduce the above copyright
-	  notice, this list of conditions and the following disclaimer in the
-	  documentation and/or other materials provided with the distribution.
-	- Neither the name(s) of the copyright holder(s) nor the names of its
-	  contributors may be used to endorse or promote products derived
-	  from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "immintrin.h"
-#include "blis.h"
-
-// -----------------------------------------------------------------------------
-
-void bli_scopyv_zen_int
-(
-	conj_t           conjx,
-	dim_t            n,
-	float*  restrict x, inc_t incx,
-	float*  restrict y, inc_t incy,
-	cntx_t* restrict cntx
-)
-{
-	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2)
-
-	const dim_t   num_elem_per_reg = 8;
-	__m256  xv[16];
-	dim_t i = 0;
-
-	// If the vector dimension is zero return early.
-	if (bli_zero_dim1(n))
-	{
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
-		return;
-	}
-
-	if (incx == 1 && incy == 1)
-	{
-#if 0
-	  PRAGMA_SIMD
-	  for (i = 0; i < n; i++)
-	  {
-	    y[i] = x[i];
-	  }
-#endif
-#if 0
-        memcpy(y, x, n << 2);
-#endif
-#if 1
-
-		// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
-		// for example if n = 255
-		// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
-		// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
-		// n & ~0x1F results in 224: copy from 128 to 192 happens in third loop and so on.
-		for (i = 0; i < (n & (~0x7F)); i += 128)
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
-			xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
-			xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
-			xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
-			xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
-			xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
-			xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
-			xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
-			xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
-
-			y += 128;
-			x += 128;
-		}
-		for (; i < (n & (~0x3F)); i += 64)
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
-
-			y += 64;
-			x += 64;
-		}
-		for (; i < (n & (~0x1F)); i += 32)
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
-
-			y += 32;
-			x += 32;
-		}
-		for (; i < (n & (~0x0F)); i += 16)
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
-
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
-
-			y += 16;
-			x += 16;
-		}
-		for (; i < (n & (~0x07)); i += 8)
-		{
-			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
-			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
-			y += 8;
-			x += 8;
-		}
-		for (; i < n; i++)
-		{
-			*y++ = *x++;
-		}
-#endif
-	}
-	else
-	{
-		for (dim_t i = 0; i < n; ++i)
-		{
-			*y = *x;
-			x += incx;
-			y += incy;
-		}
-	}
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
-}
-
-// -----------------------------------------------------------------------------
-
-void bli_dcopyv_zen_int
-(
-	conj_t           conjx,
-	dim_t            n,
-	double*  restrict x, inc_t incx,
-	double*  restrict y, inc_t incy,
-	cntx_t* restrict cntx
-)
-{
-	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2)
-	const dim_t      num_elem_per_reg = 4;
-	__m256d  xv[16];
-	dim_t i = 0;
-
-	// If the vector dimension is zero return early.
-	if (bli_zero_dim1(n))
-	{
-		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
-		return;
-	}
-
-	if (incx == 1 && incy == 1)
-	{
-#if 0
-		PRAGMA_SIMD
-			for (i = 0; i < n; ++i)
-			{
-				y[i] = x[i];
-			}
-#endif
-#if 0
-		memcpy(y, x, n << 3);
-#endif
-#if 1
-		// n & (~0x3F) = n & 0xFFFFFFC0 -> this masks the numbers less than 64,
-		// the copy operation will be done for the multiples of 64
-		for (i = 0; i < (n & (~0x3F)); i += 64)
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
-			xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
-			xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
-			xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
-			xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
-			xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
-			xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
-			xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
-			xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
-			y += num_elem_per_reg * 16;
-			x += num_elem_per_reg * 16;
-		}
-		for (; i < (n & (~0x1F)); i += 32)
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
-			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
-			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
-			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
-			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
-
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
-
-			y += num_elem_per_reg * 8;
-			x += num_elem_per_reg * 8;
-		}
-		for (; i < (n & (~0xF)); i += 16)
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
-			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
-
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
-
-			y += num_elem_per_reg * 4;
-			x += num_elem_per_reg * 4;
-		}
-		for (; i < (n & (~0x07)); i += 8)
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
-
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
-
-			y += num_elem_per_reg * 2;
-			x += num_elem_per_reg * 2;
-		}
-		for (; i < (n & (~0x03)); i += 4)
-		{
-			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
-			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
-			y += num_elem_per_reg;
-			x += num_elem_per_reg;
-		}
-		for (; i < n; i++)
-		{
-			*y++ = *x++;
-		}
-#endif	
-	}
-	else
-	{
-		for ( i = 0; i < n; ++i)
-		{
-			*y = *x;
-
-			x += incx;
-			y += incy;
-		}
-	}
-	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
-}
-
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2019-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+// Copies n single-precision elements from x (stride incx) to y (stride incy).
+void bli_scopyv_zen_int
+(
+	conj_t           conjx, // not referenced in this body
+	dim_t            n, // number of elements to copy
+	float*  restrict x, inc_t incx, // source vector and its element stride
+	float*  restrict y, inc_t incy, // destination vector and its element stride
+	cntx_t* restrict cntx // not referenced in this body
+)
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2)
+
+	const dim_t   num_elem_per_reg = 8; // floats held by one 256-bit __m256
+	__m256  xv[16]; // staging registers; the 128-element loop uses all 16
+	dim_t i = 0; // count of elements already copied on the unit-stride path
+
+	// If the vector dimension is zero return early.
+	if (bli_zero_dim1(n))
+	{
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
+		return;
+	}
+
+	if (incx == 1 && incy == 1)
+	{
+#if 0 // disabled alternative: plain indexed loop under PRAGMA_SIMD
+	  PRAGMA_SIMD
+	  for (i = 0; i < n; i++)
+	  {
+	    y[i] = x[i];
+	  }
+#endif
+#if 0 // disabled alternative: memcpy of n << 2 (n * 4) bytes
+        memcpy(y, x, n << 2);
+#endif
+#if 1 // active implementation: cascade of unrolled 256-bit loads/stores
+
+		// For loop with n & ~0x7F => n & 0xFFFFFF80 masks the lower bits and results in multiples of 128
+		// for example if n = 255
+		// n & ~0x7F results in 128: copy from 0 to 128 happens in first loop
+		// n & ~0x3F results in 192: copy from 128 to 192 happens in second loop
+		// n & ~0x1F results in 224: copy from 192 to 224 happens in third loop and so on.
+		for (i = 0; i < (n & (~0x7F)); i += 128) // 128 floats (16 registers) per iteration
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_ps(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_ps(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_ps(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_ps(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_ps(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_ps(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_ps(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_ps(x + num_elem_per_reg * 15);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 15, xv[15]);
+
+			y += 128; // pointers advance, so offsets above are always relative
+			x += 128;
+		}
+		for (; i < (n & (~0x3F)); i += 64) // 64 floats (8 registers) per iteration
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_ps(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_ps(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_ps(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_ps(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 7, xv[7]);
+
+			y += 64;
+			x += 64;
+		}
+		for (; i < (n & (~0x1F)); i += 32) // 32 floats (4 registers) per iteration
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_ps(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_ps(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 3, xv[3]);
+
+			y += 32;
+			x += 32;
+		}
+		for (; i < (n & (~0x0F)); i += 16) // 16 floats (2 registers) per iteration
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_ps(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_ps(y + num_elem_per_reg * 1, xv[1]);
+
+			y += 16;
+			x += 16;
+		}
+		for (; i < (n & (~0x07)); i += 8) // 8 floats (1 register) per iteration
+		{
+			xv[0] = _mm256_loadu_ps(x + num_elem_per_reg * 0);
+			_mm256_storeu_ps(y + num_elem_per_reg * 0, xv[0]);
+			y += 8;
+			x += 8;
+		}
+		for (; i < n; i++) // scalar tail: the remaining (fewer than 8) elements
+		{
+			*y++ = *x++;
+		}
+#endif
+	}
+	else // at least one stride differs from 1: element-by-element strided copy
+	{
+		for (dim_t i = 0; i < n; ++i) // this i shadows the function-scope i
+		{
+			*y = *x;
+			x += incx;
+			y += incy;
+		}
+	}
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
+}
+
+// -----------------------------------------------------------------------------
+
+void bli_dcopyv_zen_int // Copies n doubles from vector x into vector y.
+(
+	conj_t           conjx, // not referenced in this kernel
+	dim_t            n, // number of elements to copy
+	double*  restrict x, inc_t incx, // source vector and its stride, in elements
+	double*  restrict y, inc_t incy, // destination vector and its stride, in elements
+	cntx_t* restrict cntx // not referenced in this kernel
+)
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2)
+	const dim_t      num_elem_per_reg = 4; // doubles held by one __m256d register
+	__m256d  xv[16]; // staging registers: every load of an iteration is issued before its stores
+	dim_t i = 0;
+
+	// Nothing to copy when the vector dimension is zero: close the trace and return.
+	if (bli_zero_dim1(n))
+	{
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
+		return;
+	}
+
+	if (incx == 1 && incy == 1) // both vectors contiguous: vectorized path
+	{
+#if 0 // disabled alternative: plain loop under PRAGMA_SIMD
+		PRAGMA_SIMD
+			for (i = 0; i < n; ++i)
+			{
+				y[i] = x[i];
+			}
+#endif
+#if 0 // disabled alternative: memcpy of (n << 3) bytes
+		memcpy(y, x, n << 3);
+#endif
+#if 1 // active implementation: unrolled AVX loads and stores
+		// n & (~0x3F) clears the low six bits of n, i.e. rounds n down to a
+		// multiple of 64; each iteration copies 64 doubles (16 registers of 4).
+		for (i = 0; i < (n & (~0x3F)); i += 64)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+			xv[8] = _mm256_loadu_pd(x + num_elem_per_reg * 8);
+			xv[9] = _mm256_loadu_pd(x + num_elem_per_reg * 9);
+			xv[10] = _mm256_loadu_pd(x + num_elem_per_reg * 10);
+			xv[11] = _mm256_loadu_pd(x + num_elem_per_reg * 11);
+			xv[12] = _mm256_loadu_pd(x + num_elem_per_reg * 12);
+			xv[13] = _mm256_loadu_pd(x + num_elem_per_reg * 13);
+			xv[14] = _mm256_loadu_pd(x + num_elem_per_reg * 14);
+			xv[15] = _mm256_loadu_pd(x + num_elem_per_reg * 15);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 8, xv[8]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 9, xv[9]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 10, xv[10]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 11, xv[11]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 12, xv[12]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 13, xv[13]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 14, xv[14]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 15, xv[15]);
+			y += num_elem_per_reg * 16;
+			x += num_elem_per_reg * 16;
+		}
+		for (; i < (n & (~0x1F)); i += 32) // leftover block of 32 doubles (8 registers)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+			xv[4] = _mm256_loadu_pd(x + num_elem_per_reg * 4);
+			xv[5] = _mm256_loadu_pd(x + num_elem_per_reg * 5);
+			xv[6] = _mm256_loadu_pd(x + num_elem_per_reg * 6);
+			xv[7] = _mm256_loadu_pd(x + num_elem_per_reg * 7);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 4, xv[4]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 5, xv[5]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 6, xv[6]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 7, xv[7]);
+
+			y += num_elem_per_reg * 8;
+			x += num_elem_per_reg * 8;
+		}
+		for (; i < (n & (~0xF)); i += 16) // leftover block of 16 doubles (4 registers)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+			xv[2] = _mm256_loadu_pd(x + num_elem_per_reg * 2);
+			xv[3] = _mm256_loadu_pd(x + num_elem_per_reg * 3);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 2, xv[2]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 3, xv[3]);
+
+			y += num_elem_per_reg * 4;
+			x += num_elem_per_reg * 4;
+		}
+		for (; i < (n & (~0x07)); i += 8) // leftover block of 8 doubles (2 registers)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			xv[1] = _mm256_loadu_pd(x + num_elem_per_reg * 1);
+
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			_mm256_storeu_pd(y + num_elem_per_reg * 1, xv[1]);
+
+			y += num_elem_per_reg * 2;
+			x += num_elem_per_reg * 2;
+		}
+		for (; i < (n & (~0x03)); i += 4) // leftover block of 4 doubles (1 register)
+		{
+			xv[0] = _mm256_loadu_pd(x + num_elem_per_reg * 0);
+			_mm256_storeu_pd(y + num_elem_per_reg * 0, xv[0]);
+			y += num_elem_per_reg;
+			x += num_elem_per_reg;
+		}
+		for (; i < n; i++) // scalar tail: at most 3 elements remain
+		{
+			*y++ = *x++;
+		}
+#endif	
+	}
+	else // at least one stride differs from 1: scalar copy, one element per iteration
+	{
+		for ( i = 0; i < n; ++i)
+		{
+			*y = *x;
+
+			x += incx;
+			y += incy;
+		}
+	}
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
+}
+
+void bli_zcopyv_zen_int // Copies n dcomplex elements from x into y, conjugating them when conjx requests it.
+(
+	conj_t           conjx, // tested with bli_is_conj() to select the conjugating path
+	dim_t            n, // number of elements to copy
+	dcomplex*  restrict x, inc_t incx, // source vector and its stride, in elements
+	dcomplex*  restrict y, inc_t incy, // destination vector and its stride, in elements
+	cntx_t* restrict cntx // not referenced in this kernel
+)
+{
+	AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_2)
+	// If the vector dimension is zero return early (closing the trace opened above).
+	if (bli_zero_dim1(n))
+	{
+		AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
+		return;
+	}
+
+	dim_t i = 0;
+	dcomplex *x0 = x;
+	dcomplex *y0 = y;
+
+	if (bli_is_conj(conjx))
+	{
+		if (incx == 1 && incy == 1)
+		{
+			const dim_t n_elem_per_reg = 2;
+			__m256d x_vec[8];
+
+			__m256d conj_reg = _mm256_setr_pd(1, -1, 1, -1);
+
+			for (; (i + 15) < n; i += 16)
+			{
+				/* Each 256-bit load brings in 4 doubles, i.e. 2 dcomplex elements. */
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+				x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+				x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg));
+				x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg));
+				x_vec[4] = _mm256_loadu_pd((double *)(x0 + 4 * n_elem_per_reg));
+				x_vec[5] = _mm256_loadu_pd((double *)(x0 + 5 * n_elem_per_reg));
+				x_vec[6] = _mm256_loadu_pd((double *)(x0 + 6 * n_elem_per_reg));
+				x_vec[7] = _mm256_loadu_pd((double *)(x0 + 7 * n_elem_per_reg));
+
+				/* Perform conjugation by multiplying the imaginary
+				   part with -1 and real part with 1*/
+				x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg);
+				x_vec[1] = _mm256_mul_pd(x_vec[1], conj_reg);
+				x_vec[2] = _mm256_mul_pd(x_vec[2], conj_reg);
+				x_vec[3] = _mm256_mul_pd(x_vec[3], conj_reg);
+				x_vec[4] = _mm256_mul_pd(x_vec[4], conj_reg);
+				x_vec[5] = _mm256_mul_pd(x_vec[5], conj_reg);
+				x_vec[6] = _mm256_mul_pd(x_vec[6], conj_reg);
+				x_vec[7] = _mm256_mul_pd(x_vec[7], conj_reg);
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+				_mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]);
+				_mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]);
+				_mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]);
+				_mm256_storeu_pd((double *)(y0 + 4 * n_elem_per_reg), x_vec[4]);
+				_mm256_storeu_pd((double *)(y0 + 5 * n_elem_per_reg), x_vec[5]);
+				_mm256_storeu_pd((double *)(y0 + 6 * n_elem_per_reg), x_vec[6]);
+				_mm256_storeu_pd((double *)(y0 + 7 * n_elem_per_reg), x_vec[7]);
+
+				x0 += 8 * n_elem_per_reg;
+				y0 += 8 * n_elem_per_reg;
+			}
+
+			for (; (i + 7) < n; i += 8)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+				x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+				x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg));
+				x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg));
+
+				x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg);
+				x_vec[1] = _mm256_mul_pd(x_vec[1], conj_reg);
+				x_vec[2] = _mm256_mul_pd(x_vec[2], conj_reg);
+				x_vec[3] = _mm256_mul_pd(x_vec[3], conj_reg);
+
+				x0 += 4 * n_elem_per_reg;
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+				_mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]);
+				_mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]);
+				_mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]);
+
+				y0 += 4 * n_elem_per_reg;
+			}
+
+			for (; (i + 3) < n; i += 4)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+				x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+
+				x0 += 2 * n_elem_per_reg;
+
+				x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg);
+				x_vec[1] = _mm256_mul_pd(x_vec[1], conj_reg);
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+				_mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]);
+
+				y0 += 2 * n_elem_per_reg;
+			}
+
+			for (; (i + 1) < n; i += 2)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+
+				x_vec[0] = _mm256_mul_pd(x_vec[0], conj_reg);
+
+				x0 += n_elem_per_reg;
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+
+				y0 += n_elem_per_reg;
+			}
+
+			// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+			// This avoids a performance penalty caused by false dependencies when
+			// transitioning from AVX to SSE instructions (which may occur as soon
+			// as the n_left cleanup loop below if BLIS is compiled with
+			// -mfpmath=sse).
+			_mm256_zeroupper();
+		}
+		else
+		{
+			/*Since double complex elements are of size 128 bits, vectorization
+			can be done using XMM registers when incx and incy are not 1. This is done
+			in the else condition.*/
+			__m128d conj_reg = _mm_setr_pd(1, -1);
+			__m128d x_vec[4];
+
+			for (; (i + 3) < n; i += 4)
+			{
+				/* Each 128-bit load brings in 2 doubles, i.e. 1 dcomplex element. */
+				x_vec[0] = _mm_loadu_pd((double *)x0);
+				x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+				x_vec[2] = _mm_loadu_pd((double *)(x0 + 2 * incx));
+				x_vec[3] = _mm_loadu_pd((double *)(x0 + 3 * incx));
+
+				x_vec[0] = _mm_mul_pd(x_vec[0], conj_reg);
+				x_vec[1] = _mm_mul_pd(x_vec[1], conj_reg);
+				x_vec[2] = _mm_mul_pd(x_vec[2], conj_reg);
+				x_vec[3] = _mm_mul_pd(x_vec[3], conj_reg);
+
+				_mm_storeu_pd((double *)y0, x_vec[0]);
+				_mm_storeu_pd((double *)(y0 + incy), x_vec[1]);
+				_mm_storeu_pd((double *)(y0 + 2 * incy), x_vec[2]);
+				_mm_storeu_pd((double *)(y0 + 3 * incy), x_vec[3]);
+
+				x0 += 4 * incx;
+				y0 += 4 * incy;
+			}
+
+			for (; (i + 1) < n; i += 2)
+			{
+				x_vec[0] = _mm_loadu_pd((double *)x0);
+				x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+
+				x_vec[0] = _mm_mul_pd(x_vec[0], conj_reg);
+				x_vec[1] = _mm_mul_pd(x_vec[1], conj_reg);
+
+				_mm_storeu_pd((double *)y0, x_vec[0]);
+				_mm_storeu_pd((double *)(y0 + incy), x_vec[1]);
+
+				x0 += 2 * incx;
+				y0 += 2 * incy;
+			}
+		}
+
+		__m128d conj_reg = _mm_setr_pd(1, -1);
+		__m128d x_vec[1];
+
+		for (; i < n; i += 1) // tail shared by both stride cases: one element per iteration
+		{
+			x_vec[0] = _mm_loadu_pd((double *)x0);
+
+			x_vec[0] = _mm_mul_pd(x_vec[0], conj_reg);
+
+			_mm_storeu_pd((double *)y0, x_vec[0]);
+
+			x0 += incx;
+			y0 += incy;
+		}
+	}
+	else
+	{
+		if (incx == 1 && incy == 1)
+		{
+			const dim_t n_elem_per_reg = 2;
+			__m256d x_vec[8];
+
+			for (; (i + 15) < n; i += 16)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+				x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+				x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg));
+				x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg));
+				x_vec[4] = _mm256_loadu_pd((double *)(x0 + 4 * n_elem_per_reg));
+				x_vec[5] = _mm256_loadu_pd((double *)(x0 + 5 * n_elem_per_reg));
+				x_vec[6] = _mm256_loadu_pd((double *)(x0 + 6 * n_elem_per_reg));
+				x_vec[7] = _mm256_loadu_pd((double *)(x0 + 7 * n_elem_per_reg));
+
+				x0 += 8 * n_elem_per_reg;
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+				_mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]);
+				_mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]);
+				_mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]);
+				_mm256_storeu_pd((double *)(y0 + 4 * n_elem_per_reg), x_vec[4]);
+				_mm256_storeu_pd((double *)(y0 + 5 * n_elem_per_reg), x_vec[5]);
+				_mm256_storeu_pd((double *)(y0 + 6 * n_elem_per_reg), x_vec[6]);
+				_mm256_storeu_pd((double *)(y0 + 7 * n_elem_per_reg), x_vec[7]);
+
+				y0 += 8 * n_elem_per_reg;
+			}
+
+			for (; (i + 7) < n; i += 8)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+				x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+				x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg));
+				x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg));
+
+				x0 += 4 * n_elem_per_reg;
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+				_mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]);
+				_mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), x_vec[2]);
+				_mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), x_vec[3]);
+
+				y0 += 4 * n_elem_per_reg;
+			}
+
+			for (; (i + 3) < n; i += 4)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+				x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+
+				x0 += 2 * n_elem_per_reg;
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+				_mm256_storeu_pd((double *)(y0 + n_elem_per_reg), x_vec[1]);
+
+				y0 += 2 * n_elem_per_reg;
+			}
+
+			for (; (i + 1) < n; i += 2)
+			{
+				x_vec[0] = _mm256_loadu_pd((double *)x0);
+
+				x0 += n_elem_per_reg;
+
+				_mm256_storeu_pd((double *)y0, x_vec[0]);
+
+				y0 += n_elem_per_reg;
+			}
+
+			// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+			// This avoids a performance penalty caused by false dependencies when
+			// transitioning from AVX to SSE instructions (which may occur as soon
+			// as the n_left cleanup loop below if BLIS is compiled with
+			// -mfpmath=sse).
+			_mm256_zeroupper();
+		}
+		else
+		{
+			/*Since double complex elements are of size 128 bits, vectorization
+			can be done using XMM registers when incx and incy are not 1. This is done
+			in the else condition.*/
+			__m128d x_vec[4];
+
+			for (; (i + 3) < n; i += 4)
+			{
+				x_vec[0] = _mm_loadu_pd((double *)x0);
+				x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+				x_vec[2] = _mm_loadu_pd((double *)(x0 + 2 * incx));
+				x_vec[3] = _mm_loadu_pd((double *)(x0 + 3 * incx));
+
+				x0 += 4 * incx;
+
+				_mm_storeu_pd((double *)y0, x_vec[0]);
+				_mm_storeu_pd((double *)(y0 + incy), x_vec[1]);
+				_mm_storeu_pd((double *)(y0 + 2 * incy), x_vec[2]);
+				_mm_storeu_pd((double *)(y0 + 3 * incy), x_vec[3]);
+
+				y0 += 4 * incy;
+			}
+
+			for (; (i + 1) < n; i += 2)
+			{
+				x_vec[0] = _mm_loadu_pd((double *)x0);
+				x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+
+				x0 += 2 * incx;
+
+				_mm_storeu_pd((double *)y0, x_vec[0]);
+				_mm_storeu_pd((double *)(y0 + incy), x_vec[1]);
+
+				y0 += 2 * incy;
+			}
+		}
+		__m128d x_vec[1];
+
+		for (; i < n; i += 1) // tail shared by both stride cases: one element per iteration
+		{
+			x_vec[0] = _mm_loadu_pd((double *)x0);
+
+			x0 += incx;
+
+			_mm_storeu_pd((double *)y0, x_vec[0]);
+
+			y0 += incy;
+		}
+	}
+	AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_2)
+}
diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c
index 01022d353a..145b8fe6a5 100644
--- a/kernels/zen/1/bli_dotv_zen_int.c
+++ b/kernels/zen/1/bli_dotv_zen_int.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2016 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -153,8 +153,8 @@ void bli_sdotv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
@@ -274,8 +274,8 @@ void bli_ddotv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c
index a2e999779a..c239612006 100644
--- a/kernels/zen/1/bli_dotv_zen_int10.c
+++ b/kernels/zen/1/bli_dotv_zen_int10.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2016 - 2020, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -222,8 +222,8 @@ void bli_sdotv_zen_int10
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// later, especially if BLIS is compiled with -mfpmath=sse).
+		// transitioning from AVX to SSE instructions (which may occur later,
+		// especially if BLIS is compiled with -mfpmath=sse).
 		_mm256_zeroupper();
 	}
 	else
@@ -434,8 +434,8 @@ void bli_ddotv_zen_int10
 
 		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 		// This avoids a performance penalty caused by false dependencies when
-		// transitioning from from AVX to SSE instructions (which may occur
-		// later, especially if BLIS is compiled with -mfpmath=sse).
+		// transitioning from AVX to SSE instructions (which may occur later,
+		// especially if BLIS is compiled with -mfpmath=sse).
 		_mm256_zeroupper();
 	}
 	else
@@ -711,8 +711,8 @@ void bli_cdotv_zen_int5
         }
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // later, especially if BLIS is compiled with -mfpmath=sse).
+        // transitioning from AVX to SSE instructions (which may occur later,
+        // especially if BLIS is compiled with -mfpmath=sse).
         _mm256_zeroupper();
     }
     else
@@ -1000,8 +1000,8 @@ void bli_zdotv_zen_int5
 
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // later, especially if BLIS is compiled with -mfpmath=sse).
+        // transitioning from AVX to SSE instructions (which may occur later,
+        // especially if BLIS is compiled with -mfpmath=sse).
         _mm256_zeroupper();
     }
     else
diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c
index c210eceff5..a0ddaaf549 100644
--- a/kernels/zen/1/bli_dotxv_zen_int.c
+++ b/kernels/zen/1/bli_dotxv_zen_int.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2016 - 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2016-2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -181,8 +181,8 @@ void bli_sdotxv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
@@ -311,8 +311,8 @@ void bli_ddotxv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
@@ -551,8 +551,8 @@ void bli_zdotxv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
@@ -800,8 +800,8 @@ void bli_cdotxv_zen_int
 
 	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
 	// This avoids a performance penalty caused by false dependencies when
-	// transitioning from from AVX to SSE instructions (which may occur
-	// as soon as the n_left cleanup loop below if BLIS is compiled with
+	// transitioning from AVX to SSE instructions (which may occur as soon
+	// as the n_left cleanup loop below if BLIS is compiled with
 	// -mfpmath=sse).
 	_mm256_zeroupper();
 
diff --git a/kernels/zen/1/bli_norm2_zen_int.c b/kernels/zen/1/bli_norm2_zen_int.c
index 1971b79433..b388dfb754 100644
--- a/kernels/zen/1/bli_norm2_zen_int.c
+++ b/kernels/zen/1/bli_norm2_zen_int.c
@@ -1,955 +1,3025 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2021 - 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#include "immintrin.h"
-#include "blis.h"
-
-// Union data structure to access AVX registers
-// One 256-bit AVX register holds 8 SP elements. 
-typedef union
-{
-    __m256  v;
-    float   f[8] __attribute__( ( aligned( 64 ) ) );
-} v8sf_t;
-
-// Union data structure to access AVX registers
-// One 256-bit AVX register holds 4 DP elements. 
-typedef union
-{
-    __m256d v;
-    double  d[4] __attribute__( ( aligned( 64 ) ) );
-} v4df_t;
-
-// Return a mask which indicates either:
-// v <= t or v >= T
-#define CMP256( v, t, T ) \
-	_mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) );
-
-// Returns true if any of the values in the mask vector is true, 
-// and false, otherwise.
-static inline bool bli_horizontal_or( __m256d a ) { return ! _mm256_testz_pd( a, a ); }
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_dnorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       double*   x, inc_t incx,
-       double* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    double sumsq = 0;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    double  *x_buf = x;
-
-    // Early return if n<=0 or incx=0
-    if ( ( n <= 0) || ( incx == 0 ) )
-    {
-        return;
-    }
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" double elements in vector x.
-        size_t buffer_size = n * sizeof( double );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(double)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    double *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    double sum_sml = 0;
-   // Accumulator for medium values; no scaling required.
-    double sum_med = 0;
-    // Accumulator for big values; using scaling to avoid overflow.
-    double sum_big = 0;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
-    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
-    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
-    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP - 52 ) * 0.5 ) );
-
-    double scale;
-    double abs_chi;
-    bool isbig = false;
-
-    if ( n > 4 )
-    {
-        // Constants used for comparisons.
-        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
-        temp.v = _mm256_set1_pd( -0.0 );
-        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
-        thres_big_vec.v = _mm256_set1_pd( thres_big );
-        v4df_t x0v, x1v, mask_vec0, mask_vec1;
-        zerov.v  = _mm256_setzero_pd();
-
-        // Partial sums used for scaling.
-        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
-        sum_med_vec0.v = _mm256_setzero_pd();
-        sum_big_vec0.v = _mm256_setzero_pd();
-        sum_sml_vec0.v = _mm256_setzero_pd();
-        sum_med_vec1.v = _mm256_setzero_pd();
-        sum_big_vec1.v = _mm256_setzero_pd();
-        sum_sml_vec1.v = _mm256_setzero_pd();
-
-        for (; ( i + 8 ) <= n; i = i + 8)
-        {
-            x0v.v = _mm256_loadu_pd( xt );
-            x1v.v = _mm256_loadu_pd( xt + 4 );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                return;
-            }
-            if ( bli_horizontal_or( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); 
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); 
-                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            xt += 8;
-        }
-
-        for ( ; ( i + 4 ) <= n; i = i + 4 )
-        {
-            x0v.v = _mm256_loadu_pd( xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-            xt += 4;
-        }
-
-        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
-        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
-
-        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
-                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
-        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
-                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
-        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
-                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            abs_chi = bli_fabs( *xt );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-            xt++;
-        }
-    }
-
-    // Early return if there is an Inf.
-    if ( hasInf ) return;
-
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0 )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0 / scale_big;
-        sumsq = sum_big;
-    }
-
-    else if ( sum_sml > 0.0 )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0 )
-        {
-            sum_med = sqrt( sum_med );
-            sum_sml = sqrt( sum_sml ) / scale_sml;
-            double ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0;
-            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0 / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrt( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
-
-// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
-void bli_dznorm2fv_unb_var1_avx2
-    (
-       dim_t    n,
-       dcomplex*   x, inc_t incx,
-       double* norm,
-       cntx_t*  cntx
-    )
-{
-    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
-
-    double sumsq = 0;
-    dim_t i = 0;
-    dim_t n_remainder = 0;
-    dcomplex  *x_buf = x;
-
-    // Early return if n<=0 or incx=0
-    if ( ( n <= 0) || ( incx == 0 ) )
-    {
-        return;
-    }
-
-    // Memory pool declarations for packing vector X.
-    // Initialize mem pool buffer to NULL and size to 0.
-    // "buf" and "size" fields are assigned once memory
-    // is allocated from the pool in bli_membrk_acquire_m().
-    // This will ensure bli_mem_is_alloc() will be passed on
-    // an allocated memory if created or a NULL.
-    mem_t   mem_bufX = {0};
-    rntm_t  rntm;
-
-    // Packing for non-unit strided vector x.
-    if ( incx != 1 )
-    {
-        // In order to get the buffer from pool via rntm access to memory broker
-        //is needed. Following are initializations for rntm.
-        bli_rntm_init_from_global( &rntm );
-        bli_rntm_set_num_threads_only( 1, &rntm );
-        bli_membrk_rntm_set_membrk( &rntm );
-
-        // Calculate the size required for "n" dcomplex elements in vector x.
-        size_t buffer_size = n * sizeof( dcomplex );
-
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dznorm2fv_unb_var1(): get mem pool block\n" );
-        #endif
-
-        // Acquire a Buffer(n*size(dcomplex)) from the memory broker
-        // and save the associated mem_t entry to mem_bufX.
-        bli_membrk_acquire_m
-        (
-            &rntm,
-            buffer_size,
-            BLIS_BUFFER_FOR_B_PANEL,
-            &mem_bufX
-        );
-
-        // Continue packing X if buffer memory is allocated.
-        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
-        {
-            x_buf = bli_mem_buffer( &mem_bufX );
-            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
-            for ( dim_t x_index = 0; x_index < n; x_index++ )
-            {
-                if ( incx > 0 )
-                {
-                    *( x_buf + x_index ) = *( x + ( x_index * incx ) );
-                }
-                else
-                {
-                    *( x_buf + x_index ) =  *( x + ( - ( n - x_index - 1 ) * incx ) );
-                }
-            }
-        }
-    }
-
-    dcomplex *xt = x_buf;
-
-    // Compute the sum of squares on 3 accumulators to avoid overflow
-    // and underflow, depending on the vector element value.
-    // Accumulator for small values; using scaling to avoid underflow.
-    double sum_sml = 0;
-   // Accumulator for medium values; no scaling required.
-    double sum_med = 0;
-    // Accumulator for big values; using scaling to avoid overflow.
-    double sum_big = 0;
-
-    // Constants chosen to minimize roundoff, according to Blue's algorithm.
-    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
-    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
-    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
-    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP - 52 ) * 0.5 ) );
-
-    double scale;
-    double abs_chi;
-    bool isbig = false;
-
-    if ( n > 2 )
-    {
-        // Constants used for comparisons.
-        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
-        temp.v = _mm256_set1_pd( -0.0 );
-        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
-        thres_big_vec.v = _mm256_set1_pd( thres_big );
-        v4df_t x0v, x1v, mask_vec0, mask_vec1;
-        zerov.v  = _mm256_setzero_pd();
-
-        // Partial sums used for scaling.
-        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
-        sum_med_vec0.v = _mm256_setzero_pd();
-        sum_big_vec0.v = _mm256_setzero_pd();
-        sum_sml_vec0.v = _mm256_setzero_pd();
-        sum_med_vec1.v = _mm256_setzero_pd();
-        sum_big_vec1.v = _mm256_setzero_pd();
-        sum_sml_vec1.v = _mm256_setzero_pd();
-
-        for (; ( i + 4 ) <= n; i = i + 4)
-        {
-            x0v.v = _mm256_loadu_pd( (double*) xt );
-            x1v.v = _mm256_loadu_pd( (double*) (xt + 2) );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                return;
-            }
-            if ( bli_horizontal_or( mask_vec1.v ) )
-            {
-                *norm = NAN;
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-            mask_vec1.v = CMP256( x1v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            if ( !bli_horizontal_or( mask_vec1.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or( mask_vec1.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
-                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
-                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
-                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-
-            xt += 4;
-        }
-
-        for ( ; ( i + 2 ) <= n; i = i + 2 )
-        {
-            x0v.v = _mm256_loadu_pd( (double*) xt );
-
-            // Getting the abs of the vector elements.
-            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
-
-            // Check if any of the values is a NaN and if so, return.
-            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
-            if ( bli_horizontal_or( mask_vec0.v ) )
-            {
-                *norm = NAN;
-                return;
-            }
-
-            // Mask vectors which indicate whether
-            // xi<=thres_sml or xi>=thres_big.
-            mask_vec0.v = CMP256( x0v.v, thres_sml_vec.v, thres_big_vec.v );
-
-            if ( !bli_horizontal_or( mask_vec0.v ) )
-            {
-                // Scaling is not necessary; only medium values.
-                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
-            }
-            else
-            {
-                // Mask vector which indicate whether xi > thres_big.
-                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
-
-                if ( bli_horizontal_or( mask_vec0.v ) )
-                {
-                    isbig = true;
-
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Fill sum_big vector using scaling.
-                    temp.v = _mm256_set1_pd( scale_big );
-                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
-                    temp.v = _mm256_set1_pd( -0.0 );
-                }
-                else
-                {
-                    // Mask vector which indicates whether xi > thres_small.
-                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
-                    // Fill sum_med vector without scaling.
-                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
-                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
-
-                    // Accumulate small values only if there have not been any big values so far.
-                    if ( !isbig )
-                    {
-                        // Fill sum_sml vector using scaling.
-                        temp.v = _mm256_set1_pd( scale_sml );
-                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
-                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
-                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
-                        temp.v = _mm256_set1_pd( -0.0 );
-                    }
-                }
-            }
-            xt += 2;
-        }
-
-        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
-        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
-        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
-
-        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
-                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
-        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
-                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
-        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
-                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
-    }
-
-    n_remainder = n - i;
-    bool hasInf = false;
-    if ( ( n_remainder > 0 ) )
-    {
-        // Put first the most likely to happen to avoid evaluations on if statements.
-        for (i = 0; i < n_remainder; i++)
-        {
-            // Get real and imaginary component of the vector element.
-            double chi_r, chi_i;
-            bli_zdgets(*xt, chi_r, chi_i);
-
-            // Start with accumulating the real component of the vector element.
-            abs_chi = bli_fabs( chi_r );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            // Accumulate the imaginary component of the vector element.
-            abs_chi = bli_fabs( chi_i );
-            // If any of the elements is NaN, then return NaN as a result.
-            if ( bli_isnan( abs_chi ) )
-            {
-                *norm = abs_chi;
-                return;
-            }
-            // Else, if any of the elements is an Inf, then return +Inf as a result.
-            if ( bli_isinf( abs_chi ) )
-            {
-                *norm = abs_chi;
-                // Instead of returning immediately, use this flag
-                // to denote that there is an Inf element in the vector.
-                // That is used to avoid cases where there is a NaN which comes
-                // after an Inf.
-                hasInf = true;
-            }
-            // Most likely case: medium values, not over/under-flow.
-            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
-            {
-                sum_med += abs_chi * abs_chi;
-            }
-            // Case where there could be an overflow. Scaling is required.
-            else if ( abs_chi > thres_big )
-            {
-                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
-                isbig = true;
-            }
-            // Case where there could be an underflow. Scaling is required.
-            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
-            {
-                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
-            }
-
-            xt++;
-        }
-    }
-
-    // Early return if there is an Inf.
-    if ( hasInf ) return;
-
-    // Combine accumulators.
-    if ( isbig )
-    {
-        // Combine sum_big and sum_med if sum_med > 0.
-        if ( sum_med > 0.0 )
-        {
-            sum_big += ( sum_med * scale_big ) * scale_big;
-        }
-        scale = 1.0 / scale_big;
-        sumsq = sum_big;
-    }
-
-    else if ( sum_sml > 0.0 )
-    {
-        // Combine sum_med and sum_sml if sum_sml>0.
-        if ( sum_med > 0.0 )
-        {
-            sum_med = sqrt( sum_med );
-            sum_sml = sqrt( sum_sml ) / scale_sml;
-            double ymin, ymax;
-            if ( sum_sml > sum_med )
-            {
-                ymin = sum_med;
-                ymax = sum_sml;
-            }
-            else
-            {
-                ymin = sum_sml;
-                ymax = sum_med;
-            }
-            scale = 1.0;
-            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
-        }
-        else
-        {
-            scale = 1.0 / scale_sml;
-            sumsq = sum_sml;
-        }
-    }
-    else
-    {
-        // If all values are mid-range:
-        scale = 1.0;
-        sumsq = sum_med;
-    }
-
-    *norm = scale * sqrt( sumsq );
-
-    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
-    {
-        #ifdef BLIS_ENABLE_MEM_TRACING
-            printf( "bli_dznorm2fv_unb_var1(): releasing mem pool block\n" );
-        #endif
-        // Return the buffer to pool.
-        bli_membrk_release( &rntm , &mem_bufX );
-    }
-
-    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
-
-    return;
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2021 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "blis.h"
+
+// Union that overlays a 256-bit AVX register with a float array, so that the
+// 8 single-precision (SP) elements it holds can be accessed individually.
+typedef union
+{
+    __m256  v;  // whole-register view, used with the _mm256_*_ps intrinsics
+    float   f[8] __attribute__( ( aligned( 64 ) ) );  // per-element view, 64-byte aligned
+} v8sf_t;
+
+// Union that overlays a 256-bit AVX register with a double array, so that the
+// 4 double-precision (DP) elements it holds can be accessed individually.
+typedef union
+{
+    __m256d v;  // whole-register view, used with the _mm256_*_pd intrinsics
+    double  d[4] __attribute__( ( aligned( 64 ) ) );  // per-element view, 64-byte aligned
+} v4df_t;
+
+// Per-lane mask for "v <= t or v >= T": lanes where the test holds are all ones, others zero.
+// Ordered predicates, so NaN lanes give zero. Expands to an expression; the caller supplies ';'.
+#define CMP256_sf( v, t, T ) \
+	_mm256_or_ps( _mm256_cmp_ps( v, t, _CMP_LE_OS ), _mm256_cmp_ps( v, T, _CMP_GE_OS ) )
+
+#define CMP256_df( v, t, T ) \
+	_mm256_or_pd( _mm256_cmp_pd( v, t, _CMP_LE_OS ), _mm256_cmp_pd( v, T, _CMP_GE_OS ) )
+
+// Returns true if any of the values in the mask vector a is true,
+// and false, otherwise.
+// In more detail, _mm256_testz_ps() performs the bitwise (a AND b) operation and returns:
+//    1 if the sign bit of all bitwise operations is 0,
+//    0 if at least one of the sign bits of each bitwise operation is 1.
+// The sign bit of (a AND a) will be 1 iff the sign bit of a is 1, and 0 otherwise.
+// That means that _mm256_testz_ps(a,a) returns:
+//    1 if the sign bit of all elements in a is 0,
+//    0 if at least one of the sign bits of a is 1.
+// Because of the negation, bli_horizontal_or_sf() returns:
+//    0 if the sign bit of all elements in a is 0,
+//    1 if at least one of the sign bits of a is 1.
+// Since a is the result of a masking operation, bli_horizontal_or_sf() returns:
+//    0 (false) if the mask is false for all elements in a,
+//    1 (true)  if the mask is true for at least one element in a.
+static inline bool bli_horizontal_or_sf( __m256  a ) { return ! _mm256_testz_ps( a, a ); }
+static inline bool bli_horizontal_or_df( __m256d a ) { return ! _mm256_testz_pd( a, a ); }  // same test on the 4 DP lanes
+
+float horizontal_add_sf(__m256 const a) {  // Returns the sum of the 8 floats held in a.
+    __m256 t1 = _mm256_hadd_ps(a, a);  // adjacent pairs summed within each 128-bit lane
+    __m256 t2 = _mm256_hadd_ps(t1,t1);  // every element of a lane now holds that lane's 4-element sum
+    __m128 t3 = _mm256_extractf128_ps(t2,1);  // upper 128-bit lane (sum of elements 4..7)
+    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3);  // element 0 = lower-lane sum + upper-lane sum
+    return _mm_cvtss_f32(t4); // extract element 0 as a scalar float (no conversion takes place)
+}  // NOTE(review): not static and not bli_-prefixed, unlike the helpers above; confirm this is intended.
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_snorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       float*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+    
+    float sumsq = 0.0f;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    float  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        //is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" float elements in vector x.
+        size_t buffer_size = n * sizeof( float );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_snorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(float)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    float *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    float sum_sml = 0.0f;
+    // Accumulator for medium values; no scaling required.
+    float sum_med = 0.0f;
+    // Accumulator for big values; using scaling to avoid overflow.
+    float sum_big = 0.0f;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
+    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
+    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
+    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
+
+    float scale = 1.0f;
+    float abs_chi;
+    bool isbig = false;
+
+    if ( n >= 64 )
+    {
+        // Constants used for comparisons.
+        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
+        temp.v = _mm256_set1_ps( -0.0f );
+        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
+        thres_big_vec.v = _mm256_set1_ps( thres_big );
+        v8sf_t x0v, x1v, x2v, x3v;
+        v8sf_t y0v, y1v, y2v, y3v;
+        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
+        zerov.v  = _mm256_setzero_ps();
+
+        // Partial sums used for scaling.
+        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
+        sum_sml_vec0.v = _mm256_setzero_ps();
+        sum_sml_vec1.v = _mm256_setzero_ps();
+        sum_sml_vec2.v = _mm256_setzero_ps();
+        sum_sml_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
+        sum_med_vec0.v = _mm256_setzero_ps();
+        sum_med_vec1.v = _mm256_setzero_ps();
+        sum_med_vec2.v = _mm256_setzero_ps();
+        sum_med_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
+        sum_big_vec0.v = _mm256_setzero_ps();
+        sum_big_vec1.v = _mm256_setzero_ps();
+        sum_big_vec2.v = _mm256_setzero_ps();
+        sum_big_vec3.v = _mm256_setzero_ps();
+
+        for (; ( i + 32 ) <= n; i = i + 32)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+            x2v.v = _mm256_loadu_ps( xt + 16 );
+            x3v.v = _mm256_loadu_ps( xt + 24 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+                
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {                    
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec3.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 32;
+        }
+
+        for (; ( i + 24 ) <= n; i = i + 24)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+            x2v.v = _mm256_loadu_ps( xt + 16 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            
+            xt += 24;
+        }
+
+        for (; ( i + 16 ) <= n; i = i + 16)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+            x1v.v = _mm256_loadu_ps( xt + 8 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 16;
+        }
+        
+        // This 8-element loop does not seem to improve performance, so it is disabled.
+        #if 0
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_ps( xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 8;
+        }
+        #endif
+
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
+        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
+
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
+        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
+        sum_med = horizontal_add_sf(sum_med_vec0.v);
+
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
+        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
+        sum_big = horizontal_add_sf(sum_big_vec0.v);
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+
+    if ( ( n_remainder > 0 ) )
+    {
+        // Among the magnitude checks below, the most likely case (medium values) is tested first to reduce the number of conditions evaluated.
+        for (i = 0; i < n_remainder; i++)
+        {
+            abs_chi = bli_fabs( *xt );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // This way, a NaN that appears after the Inf is still
+                // detected and takes precedence in the result.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            xt++;
+        }
+    }
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {
+        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+    
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0f )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0f / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0f )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0f )
+        {
+            sum_med = sqrtf( sum_med );
+            sum_sml = sqrtf( sum_sml ) / scale_sml;
+            float ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0f;
+            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0f / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0f;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrtf( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_snorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_scnorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       scomplex*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    float sumsq = 0.0f;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    scomplex  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This ensures that bli_mem_is_alloc() is always given either
+    // an allocated block or a zero-initialized (NULL) entry.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // Acquiring a buffer from the pool via rntm requires access to the
+        // memory broker, so initialize rntm accordingly.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" scomplex elements in vector x.
+        size_t buffer_size = n * sizeof( scomplex );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_scnorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(scomplex)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    scomplex *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    float sum_sml = 0.0f;
+    // Accumulator for medium values; no scaling required.
+    float sum_med = 0.0f;
+    // Accumulator for big values; using scaling to avoid overflow.
+    float sum_big = 0.0f;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const float thres_sml = powf( ( float )FLT_RADIX,    ceilf( ( FLT_MIN_EXP - 1 )  * 0.5f ) );
+    const float thres_big = powf( ( float )FLT_RADIX,   floorf( ( FLT_MAX_EXP - 23)  * 0.5f ) );
+    const float scale_sml = powf( ( float )FLT_RADIX, - floorf( ( FLT_MIN_EXP - 24 ) * 0.5f ) );
+    const float scale_big = powf( ( float )FLT_RADIX,  - ceilf( ( FLT_MAX_EXP + 23 ) * 0.5f ) );
+
+    float scale = 1.0f;
+    float abs_chi;
+    bool isbig = false;
+
+    if ( n >= 64 )
+    {
+        // Constants used for comparisons.
+        v8sf_t temp, thres_sml_vec, thres_big_vec, zerov;
+        temp.v = _mm256_set1_ps( -0.0f );
+        thres_sml_vec.v = _mm256_set1_ps( thres_sml );
+        thres_big_vec.v = _mm256_set1_ps( thres_big );
+        v8sf_t x0v, x1v, x2v, x3v;
+        v8sf_t y0v, y1v, y2v, y3v;
+        v8sf_t mask_vec0, mask_vec1, mask_vec2, mask_vec3;
+        zerov.v  = _mm256_setzero_ps();
+
+        // Partial sums used for scaling.
+        v8sf_t sum_sml_vec0, sum_sml_vec1, sum_sml_vec2, sum_sml_vec3;
+        sum_sml_vec0.v = _mm256_setzero_ps();
+        sum_sml_vec1.v = _mm256_setzero_ps();
+        sum_sml_vec2.v = _mm256_setzero_ps();
+        sum_sml_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_med_vec0, sum_med_vec1, sum_med_vec2, sum_med_vec3;
+        sum_med_vec0.v = _mm256_setzero_ps();
+        sum_med_vec1.v = _mm256_setzero_ps();
+        sum_med_vec2.v = _mm256_setzero_ps();
+        sum_med_vec3.v = _mm256_setzero_ps();
+
+        v8sf_t sum_big_vec0, sum_big_vec1, sum_big_vec2, sum_big_vec3;
+        sum_big_vec0.v = _mm256_setzero_ps();
+        sum_big_vec1.v = _mm256_setzero_ps();
+        sum_big_vec2.v = _mm256_setzero_ps();
+        sum_big_vec3.v = _mm256_setzero_ps();
+
+        for (; ( i + 16 ) <= n; i = i + 16)
+        {
+            x0v.v = _mm256_loadu_ps( (float*) xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
+            x3v.v = _mm256_loadu_ps( (float*) (xt + 12) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+            x3v.v = _mm256_andnot_ps( temp.v, x3v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            mask_vec3.v = _mm256_cmp_ps(x3v.v, x3v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec3.v = CMP256_sf( x3v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+                
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {                    
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_sf( mask_vec3.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec3.v = _mm256_fmadd_ps( x3v.v, x3v.v, sum_med_vec3.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec3.v ) )
+                {
+                    isbig = true;
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                    y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                    sum_big_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_big_vec3.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec3.v = _mm256_cmp_ps( x3v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y3v.v = _mm256_blendv_ps( x3v.v, zerov.v, mask_vec3.v );
+                    sum_med_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_med_vec3.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y3v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec3.v );
+                        y3v.v = _mm256_mul_ps( x3v.v, y3v.v );
+                        sum_sml_vec3.v = _mm256_fmadd_ps( y3v.v, y3v.v, sum_sml_vec3.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 16;
+        }
+
+        for (; ( i + 12 ) <= n; i = i + 12)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+            x2v.v = _mm256_loadu_ps( (float*) (xt + 8) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+            x2v.v = _mm256_andnot_ps( temp.v, x2v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            mask_vec2.v = _mm256_cmp_ps(x2v.v, x2v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec2.v = CMP256_sf( x2v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec2.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec2.v = _mm256_fmadd_ps( x2v.v, x2v.v, sum_med_vec2.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec2.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                    y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                    sum_big_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_big_vec2.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec2.v = _mm256_cmp_ps( x2v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y2v.v = _mm256_blendv_ps( x2v.v, zerov.v, mask_vec2.v );
+                    sum_med_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_med_vec2.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y2v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec2.v );
+                        y2v.v = _mm256_mul_ps( x2v.v, y2v.v );
+                        sum_sml_vec2.v = _mm256_fmadd_ps( y2v.v, y2v.v, sum_sml_vec2.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            
+            xt += 12;
+        }
+        
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+            x1v.v = _mm256_loadu_ps( (float*) (xt + 4) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_ps( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_ps(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_sf( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            if ( !bli_horizontal_or_sf( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_ps( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                    y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                    sum_big_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec1.v = _mm256_cmp_ps( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y1v.v = _mm256_blendv_ps( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y1v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec1.v );
+                        y1v.v = _mm256_mul_ps( x1v.v, y1v.v );
+                        sum_sml_vec1.v = _mm256_fmadd_ps( y1v.v, y1v.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 8;
+        }
+        // This seems to not be improving performance.
+        #if 0
+        for (; ( i + 4 ) <= n; i = i + 4)
+        {
+            x0v.v = _mm256_loadu_ps( (float*)xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_ps( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_ps(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_sf( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_sf( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_ps( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_sf( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_ps( scale_big );
+                    y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                    y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                    sum_big_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_ps( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_ps( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    y0v.v = _mm256_blendv_ps( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_ps( scale_sml );
+                        y0v.v = _mm256_blendv_ps( zerov.v, temp.v, mask_vec0.v );
+                        y0v.v = _mm256_mul_ps( x0v.v, y0v.v );
+                        sum_sml_vec0.v = _mm256_fmadd_ps( y0v.v, y0v.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_ps( -0.0 );
+                    }
+                }
+            }
+            xt += 4;
+        }
+        #endif
+
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_sml_vec2.v = _mm256_add_ps( sum_sml_vec2.v, sum_sml_vec3.v );
+        sum_sml_vec0.v = _mm256_add_ps( sum_sml_vec0.v, sum_sml_vec2.v ); 
+        sum_sml = horizontal_add_sf(sum_sml_vec0.v);
+
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec1.v );
+        sum_med_vec2.v = _mm256_add_ps( sum_med_vec2.v, sum_med_vec3.v );
+        sum_med_vec0.v = _mm256_add_ps( sum_med_vec0.v, sum_med_vec2.v );
+        sum_med = horizontal_add_sf(sum_med_vec0.v);
+
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec1.v );
+        sum_big_vec2.v = _mm256_add_ps( sum_big_vec2.v, sum_big_vec3.v );
+        sum_big_vec0.v = _mm256_add_ps( sum_big_vec0.v, sum_big_vec2.v );
+        sum_big = horizontal_add_sf(sum_big_vec0.v);
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    double chi_r, chi_i;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Check the most likely case first to minimize the number of if-statement evaluations.
+        for (i = 0; i < n_remainder; i++)
+        {
+            // Get real and imaginary component of the vector element.            
+            bli_csgets(*xt, chi_r, chi_i);
+            // Start with accumulating the real component of the vector element.
+            abs_chi = bli_fabs( chi_r );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            // Accumulate the imaginary component of the vector element.
+            abs_chi = bli_fabs( chi_i );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            xt++;
+        }
+    }
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+    
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0f )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0f / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0f )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0f )
+        {
+            sum_med = sqrtf( sum_med );
+            sum_sml = sqrtf( sum_sml ) / scale_sml;
+            float ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0f;
+            sumsq = ymax * ymax * ( 1.0f + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0f / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0f;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrtf( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_scnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Optimized function that computes the Frobenius norm using AVX2 intrinsics.
+void bli_dnorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       double*   x, inc_t incx,
+       double* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    double sumsq = 0;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    double  *x_buf = x;
+
+    // Memory pool declarations for packing vector X.
+    // Initialize mem pool buffer to NULL and size to 0.
+    // "buf" and "size" fields are assigned once memory
+    // is allocated from the pool in bli_membrk_acquire_m().
+    // This will ensure bli_mem_is_alloc() will be passed on
+    // an allocated memory if created or a NULL.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // To get the buffer from the pool via rntm, access to the memory broker
+        // is needed. The following calls initialize rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" double elements in vector x.
+        size_t buffer_size = n * sizeof( double );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dnorm2fv_unb_var1(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(double)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    double *xt = x_buf;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    double sum_sml = 0;
+   // Accumulator for medium values; no scaling required.
+    double sum_med = 0;
+    // Accumulator for big values; using scaling to avoid overflow.
+    double sum_big = 0;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
+    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
+    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
+    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
+
+    double scale;
+    double abs_chi;
+    bool isbig = false;
+
+    if ( n > 4 )
+    {
+        // Constants used for comparisons.
+        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
+        temp.v = _mm256_set1_pd( -0.0 );
+        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
+        thres_big_vec.v = _mm256_set1_pd( thres_big );
+        v4df_t x0v, x1v, mask_vec0, mask_vec1;
+        zerov.v  = _mm256_setzero_pd();
+
+        // Partial sums used for scaling.
+        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
+        sum_med_vec0.v = _mm256_setzero_pd();
+        sum_big_vec0.v = _mm256_setzero_pd();
+        sum_sml_vec0.v = _mm256_setzero_pd();
+        sum_med_vec1.v = _mm256_setzero_pd();
+        sum_big_vec1.v = _mm256_setzero_pd();
+        sum_sml_vec1.v = _mm256_setzero_pd();
+
+        for (; ( i + 8 ) <= n; i = i + 8)
+        {
+            x0v.v = _mm256_loadu_pd( xt );
+            x1v.v = _mm256_loadu_pd( xt + 4 );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v ); 
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v ); 
+                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v ); 
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            xt += 8;
+        }
+
+        for ( ; ( i + 4 ) <= n; i = i + 4 )
+        {
+            x0v.v = _mm256_loadu_pd( xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicate whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi > thres_small.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+            xt += 4;
+        }
+
+        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
+        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
+
+        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
+                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
+        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
+                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
+        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
+                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Put first the most likely to happen to avoid evaluations on if statements.
+        for (i = 0; i < n_remainder; i++)
+        {
+            abs_chi = bli_fabs( *xt );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Instead of returning immediately, use this flag
+                // to denote that there is an Inf element in the vector.
+                // That is used to avoid cases where there is a NaN which comes
+                // after an Inf.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if (  ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+            xt++;
+        }
+    }
+
+    // Early return if there is an Inf.
+    if ( hasInf ) 
+    {        
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_dnorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0 )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0 / scale_big;
+        sumsq = sum_big;
+    }
+
+    else if ( sum_sml > 0.0 )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0 )
+        {
+            sum_med = sqrt( sum_med );
+            sum_sml = sqrt( sum_sml ) / scale_sml;
+            double ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0;
+            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0 / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrt( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dnorm2fv_unb_var1(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
+
+// Computes the 2-norm (Frobenius norm) of the n-element dcomplex vector x with
+// stride incx and stores it in *norm, using AVX2 intrinsics. Real and imaginary
+// parts are squared into small/medium/big accumulators (Blue's algorithm) to
+// avoid underflow and overflow. A NaN component yields NaN, otherwise an Inf
+// component yields +Inf. cntx is not referenced by this kernel.
+void bli_dznorm2fv_unb_var1_avx2
+    (
+       dim_t    n,
+       dcomplex*   x, inc_t incx,
+       double* norm,
+       cntx_t*  cntx
+    )
+{
+    AOCL_DTL_TRACE_ENTRY( AOCL_DTL_LEVEL_TRACE_3 );
+
+    double sumsq = 0;
+    dim_t i = 0;
+    dim_t n_remainder = 0;
+    dcomplex  *x_buf = x;
+
+    // Memory pool declarations for packing vector x.
+    // mem_bufX starts zero-initialized; its "buf" and "size" fields are only
+    // assigned by bli_membrk_acquire_m(), so bli_mem_is_alloc() reports
+    // whether a block was actually obtained from the pool.
+    mem_t   mem_bufX = {0};
+    rntm_t  rntm;
+
+    // Packing for non-unit strided vector x.
+    if ( incx != 1 )
+    {
+        // In order to get the buffer from pool via rntm access to memory broker
+        // is needed. Following are initializations for rntm.
+        bli_rntm_init_from_global( &rntm );
+        bli_rntm_set_num_threads_only( 1, &rntm );
+        bli_membrk_rntm_set_membrk( &rntm );
+
+        // Calculate the size required for "n" dcomplex elements in vector x.
+        size_t buffer_size = n * sizeof( dcomplex );
+
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dznorm2fv_unb_var1_avx2(): get mem pool block\n" );
+        #endif
+
+        // Acquire a Buffer(n*size(dcomplex)) from the memory broker
+        // and save the associated mem_t entry to mem_bufX.
+        bli_membrk_acquire_m
+        (
+            &rntm,
+            buffer_size,
+            BLIS_BUFFER_FOR_B_PANEL,
+            &mem_bufX
+        );
+
+        // Continue packing X if buffer memory is allocated.
+        if ( ( bli_mem_is_alloc( &mem_bufX ) ) )
+        {
+            x_buf = bli_mem_buffer( &mem_bufX );
+            // Pack vector x with non-unit stride to a temp buffer x_buf with unit stride.
+            for ( dim_t x_index = 0; x_index < n; x_index++ )
+            {
+                *( x_buf + x_index ) = *( x + ( x_index * incx ) );
+            }
+        }
+    }
+
+    dcomplex *xt = x_buf;
+    // Element stride for xt: 1 when x is contiguous or was packed; incx when no
+    // packing buffer was obtained, in which case only the scalar loop is used.
+    const inc_t xt_inc = ( ( incx == 1 ) || bli_mem_is_alloc( &mem_bufX ) ) ? 1 : incx;
+
+    // Compute the sum of squares on 3 accumulators to avoid overflow
+    // and underflow, depending on the vector element value.
+    // Accumulator for small values; using scaling to avoid underflow.
+    double sum_sml = 0;
+    // Accumulator for medium values; no scaling required.
+    double sum_med = 0;
+    // Accumulator for big values; using scaling to avoid overflow.
+    double sum_big = 0;
+
+    // Constants chosen to minimize roundoff, according to Blue's algorithm.
+    const double thres_sml = pow( ( double )FLT_RADIX,    ceil( ( DBL_MIN_EXP - 1 )  * 0.5 ) );
+    const double thres_big = pow( ( double )FLT_RADIX,   floor( ( DBL_MAX_EXP - 52)  * 0.5 ) );
+    const double scale_sml = pow( ( double )FLT_RADIX, - floor( ( DBL_MIN_EXP - 53 ) * 0.5 ) );
+    const double scale_big = pow( ( double )FLT_RADIX,  - ceil( ( DBL_MAX_EXP + 52 ) * 0.5 ) );
+
+    double scale;
+    double abs_chi;
+    bool isbig = false;
+
+    if ( ( n > 2 ) && ( xt_inc == 1 ) )
+    {
+        // Constants used for comparisons.
+        v4df_t temp, thres_sml_vec, thres_big_vec, zerov, ymm0, ymm1;
+        temp.v = _mm256_set1_pd( -0.0 );
+        thres_sml_vec.v = _mm256_set1_pd( thres_sml );
+        thres_big_vec.v = _mm256_set1_pd( thres_big );
+        v4df_t x0v, x1v, mask_vec0, mask_vec1;
+        zerov.v  = _mm256_setzero_pd();
+
+        // Partial sums used for scaling.
+        v4df_t sum_med_vec0, sum_big_vec0, sum_sml_vec0, sum_med_vec1, sum_big_vec1, sum_sml_vec1;
+        sum_med_vec0.v = _mm256_setzero_pd();
+        sum_big_vec0.v = _mm256_setzero_pd();
+        sum_sml_vec0.v = _mm256_setzero_pd();
+        sum_med_vec1.v = _mm256_setzero_pd();
+        sum_big_vec1.v = _mm256_setzero_pd();
+        sum_sml_vec1.v = _mm256_setzero_pd();
+
+        for (; ( i + 4 ) <= n; i = i + 4)
+        {
+            x0v.v = _mm256_loadu_pd( (double*) xt );
+            x1v.v = _mm256_loadu_pd( (double*) (xt + 2) );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+            x1v.v = _mm256_andnot_pd( temp.v, x1v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            mask_vec1.v = _mm256_cmp_pd(x1v.v, x1v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            if ( bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+            mask_vec1.v = CMP256_df( x1v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            if ( !bli_horizontal_or_df( mask_vec1.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec1.v = _mm256_fmadd_pd( x1v.v, x1v.v, sum_med_vec1.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec1.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                    ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                    sum_big_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_big_vec1.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec1.v = _mm256_cmp_pd( x1v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm1.v = _mm256_blendv_pd( x1v.v, zerov.v, mask_vec1.v );
+                    sum_med_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_med_vec1.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm1.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec1.v );
+                        ymm1.v = _mm256_mul_pd( x1v.v, ymm1.v );
+                        sum_sml_vec1.v = _mm256_fmadd_pd( ymm1.v, ymm1.v, sum_sml_vec1.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+
+            xt += 4;
+        }
+
+        for ( ; ( i + 2 ) <= n; i = i + 2 )
+        {
+            x0v.v = _mm256_loadu_pd( (double*) xt );
+
+            // Getting the abs of the vector elements.
+            x0v.v = _mm256_andnot_pd( temp.v, x0v.v );
+
+            // Check if any of the values is a NaN and if so, return.
+            mask_vec0.v = _mm256_cmp_pd(x0v.v, x0v.v, _CMP_UNORD_Q);
+            if ( bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                *norm = NAN;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+
+            // Mask vectors which indicate whether
+            // xi<=thres_sml or xi>=thres_big.
+            mask_vec0.v = CMP256_df( x0v.v, thres_sml_vec.v, thres_big_vec.v );
+
+            if ( !bli_horizontal_or_df( mask_vec0.v ) )
+            {
+                // Scaling is not necessary; only medium values.
+                sum_med_vec0.v = _mm256_fmadd_pd( x0v.v, x0v.v, sum_med_vec0.v );
+            }
+            else
+            {
+                // Mask vector which indicates whether xi > thres_big.
+                mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_big_vec.v, _CMP_GT_OQ );
+
+                if ( bli_horizontal_or_df( mask_vec0.v ) )
+                {
+                    isbig = true;
+
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Fill sum_big vector using scaling.
+                    temp.v = _mm256_set1_pd( scale_big );
+                    ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                    ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                    sum_big_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_big_vec0.v );
+                    temp.v = _mm256_set1_pd( -0.0 );
+                }
+                else
+                {
+                    // Mask vector which indicates whether xi < thres_sml.
+                    mask_vec0.v = _mm256_cmp_pd( x0v.v, thres_sml_vec.v, _CMP_LT_OQ );
+                    // Fill sum_med vector without scaling.
+                    ymm0.v = _mm256_blendv_pd( x0v.v, zerov.v, mask_vec0.v );
+                    sum_med_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_med_vec0.v );
+
+                    // Accumulate small values only if there have not been any big values so far.
+                    if ( !isbig )
+                    {
+                        // Fill sum_sml vector using scaling.
+                        temp.v = _mm256_set1_pd( scale_sml );
+                        ymm0.v = _mm256_blendv_pd( zerov.v, temp.v, mask_vec0.v );
+                        ymm0.v = _mm256_mul_pd( x0v.v, ymm0.v );
+                        sum_sml_vec0.v = _mm256_fmadd_pd( ymm0.v, ymm0.v, sum_sml_vec0.v );
+                        temp.v = _mm256_set1_pd( -0.0 );
+                    }
+                }
+            }
+            xt += 2;
+        }
+
+        sum_sml_vec0.v = _mm256_add_pd( sum_sml_vec0.v, sum_sml_vec1.v );
+        sum_med_vec0.v = _mm256_add_pd( sum_med_vec0.v, sum_med_vec1.v );
+        sum_big_vec0.v = _mm256_add_pd( sum_big_vec0.v, sum_big_vec1.v );
+
+        sum_sml += sum_sml_vec0.v[0] + sum_sml_vec0.v[1]
+                + sum_sml_vec0.v[2] + sum_sml_vec0.v[3];
+        sum_med += sum_med_vec0.v[0] + sum_med_vec0.v[1]
+                + sum_med_vec0.v[2] + sum_med_vec0.v[3];
+        sum_big += sum_big_vec0.v[0] + sum_big_vec0.v[1]
+                + sum_big_vec0.v[2] + sum_big_vec0.v[3];
+    }
+
+    n_remainder = n - i;
+    bool hasInf = false;
+    double chi_r, chi_i;
+    if ( ( n_remainder > 0 ) )
+    {
+        // Scalar path: elements left over by the vector loops, or the whole vector.
+        for (i = 0; i < n_remainder; i++)
+        {
+            // Get real and imaginary component of the vector element.
+            bli_zdgets(*xt, chi_r, chi_i);
+
+            // Start with accumulating the real component of the vector element.
+            abs_chi = bli_fabs( chi_r );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Flag the Inf instead of returning immediately, so that a NaN
+                // which comes after this Inf still results in NaN.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            // Accumulate the imaginary component of the vector element.
+            abs_chi = bli_fabs( chi_i );
+            // If any of the elements is NaN, then return NaN as a result.
+            if ( bli_isnan( abs_chi ) )
+            {
+                *norm = abs_chi;
+                if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+                {
+                    #ifdef BLIS_ENABLE_MEM_TRACING
+                        printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+                    #endif
+                    // Return the buffer to pool.
+                    bli_membrk_release( &rntm , &mem_bufX );
+                }
+
+                AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+                return;
+            }
+            // Else, if any of the elements is an Inf, then return +Inf as a result.
+            if ( bli_isinf( abs_chi ) )
+            {
+                *norm = abs_chi;
+                // Flag the Inf instead of returning immediately, so that a NaN
+                // which comes after this Inf still results in NaN.
+                hasInf = true;
+            }
+            // Most likely case: medium values, not over/under-flow.
+            if ( ( abs_chi <= thres_big ) && ( abs_chi >= thres_sml ) )
+            {
+                sum_med += abs_chi * abs_chi;
+            }
+            // Case where there could be an overflow. Scaling is required.
+            else if ( abs_chi > thres_big )
+            {
+                sum_big += ( abs_chi * scale_big ) * ( abs_chi * scale_big );
+                isbig = true;
+            }
+            // Case where there could be an underflow. Scaling is required.
+            else if ( ( !isbig ) && ( abs_chi < thres_sml ) )
+            {
+                sum_sml += ( abs_chi * scale_sml ) * ( abs_chi * scale_sml );
+            }
+
+            xt += xt_inc;
+        }
+    }
+
+    // Early return if there is an Inf.
+    if ( hasInf )
+    {
+        if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+        {
+            #ifdef BLIS_ENABLE_MEM_TRACING
+                printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+            #endif
+            // Return the buffer to pool.
+            bli_membrk_release( &rntm , &mem_bufX );
+        }
+
+        AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+        return;
+    }
+
+    // Combine accumulators.
+    if ( isbig )
+    {
+        // Combine sum_big and sum_med if sum_med > 0.
+        if ( sum_med > 0.0 )
+        {
+            sum_big += ( sum_med * scale_big ) * scale_big;
+        }
+        scale = 1.0 / scale_big;
+        sumsq = sum_big;
+    }
+    else if ( sum_sml > 0.0 )
+    {
+        // Combine sum_med and sum_sml if sum_sml>0.
+        if ( sum_med > 0.0 )
+        {
+            sum_med = sqrt( sum_med );
+            sum_sml = sqrt( sum_sml ) / scale_sml;
+            double ymin, ymax;
+            if ( sum_sml > sum_med )
+            {
+                ymin = sum_med;
+                ymax = sum_sml;
+            }
+            else
+            {
+                ymin = sum_sml;
+                ymax = sum_med;
+            }
+            scale = 1.0;
+            sumsq = ymax * ymax * ( 1.0 + ( ymin / ymax ) * ( ymin / ymax ) );
+        }
+        else
+        {
+            scale = 1.0 / scale_sml;
+            sumsq = sum_sml;
+        }
+    }
+    else
+    {
+        // If all values are mid-range:
+        scale = 1.0;
+        sumsq = sum_med;
+    }
+
+    *norm = scale * sqrt( sumsq );
+
+    if ( ( incx != 1 ) && bli_mem_is_alloc( &mem_bufX ) )
+    {
+        #ifdef BLIS_ENABLE_MEM_TRACING
+            printf( "bli_dznorm2fv_unb_var1_avx2(): releasing mem pool block\n" );
+        #endif
+        // Return the buffer to pool.
+        bli_membrk_release( &rntm , &mem_bufX );
+    }
+
+    AOCL_DTL_TRACE_EXIT( AOCL_DTL_LEVEL_TRACE_3 );
+
+    return;
+}
diff --git a/kernels/zen/1/bli_scal2v_zen_int.c b/kernels/zen/1/bli_scal2v_zen_int.c
new file mode 100644
index 0000000000..1c91138cf0
--- /dev/null
+++ b/kernels/zen/1/bli_scal2v_zen_int.c
@@ -0,0 +1,569 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include <immintrin.h>
+
+/*  This kernel performs  y := alpha * conjx(x)
+
+    alpha   = a + i(b)
+    X       = x + i(y)
+
+    The computation is performed as follows:
+
+    Step 1:
+    -------
+
+    alpha_real_register = broadcast(alpha_real)
+    alpha_imag_register = broadcast(alpha_imag)
+
+    x_register = load(x)
+
+    Step 2:
+    -------
+
+    temp_storage_1 = x_register * alpha_real_register
+    temp_storage_2 = x_register * alpha_imag_register
+
+    temp_storage_1 = ax, ay
+    temp_storage_2 = bx, by
+
+    Step 3:
+    -------
+
+    In cases when X does NOT have to be conjugated,
+
+        swap_adjacent_elements(temp_storage_2)
+        temp_storage_2 = by, bx
+
+    In case X has to be conjugated,
+
+        swap_adjacent_elements(temp_storage_1)
+        temp_storage_1 = ay, ax
+
+    Step 4:
+    -------
+
+    In case there is no need to conjugate X the
+    computation performed will be,
+
+        result_compute = ax - by, ay + bx
+
+    In case X has to be conjugated the computation
+    performed will be,
+
+        result_compute =  bx - ay, by + ax
+
+    In case X has to be conjugated,
+
+        swap_adjacent_elements(result_computed)
+
+    Store the result to Y
+
+    Exception
+    ----------
+
+    1. When the vector dimension is zero return early
+
+
+    Compute reduction
+    ------------------
+
+    1. When alpha is zero (i.e. both real and imaginary parts are 0),
+       the compute reduces to setting the Y vector to zero using
+       setv.
+    2. When alpha is one (i.e. real part is 1 and imaginary part is 0),
+       the compute reduces to copying the X vector to the Y vector
+       using copyv.
+
+    Undefined
+    ----------
+
+    1. When incx or incy is passed as zero or less than zero,
+       the behaviour is not defined. In this kernel, we return
+       without performing any computation.
+*/
+
+void bli_zscal2v_zen_int
+     (
+       conj_t           conjx,               // selects the no-conjugate or conjugate path for x
+       dim_t            n,                   // number of dcomplex elements to process
+       dcomplex*  restrict alpha,            // scalar multiplier
+       dcomplex*  restrict x, inc_t incx,    // input vector and its stride (in dcomplex elements)
+       dcomplex*  restrict y, inc_t incy,    // output vector and its stride (in dcomplex elements)
+       cntx_t* restrict cntx                 // forwarded to setv/copyv; queried when NULL in the alpha == 0 path
+     )
+{
+    // Computes y := alpha * conjx(x) for double-complex vectors x and y.
+    // If the vector dimension is zero, return early.
+    // When incx or incy is passed as zero or less than zero,
+    // the behaviour is not defined, so return early.
+    if (bli_zero_dim1(n)|| incx <= 0 || incy <=0)
+        return;
+
+    if (PASTEMAC(z, eq0)(*alpha))
+    {
+        /* If alpha is zero, y is set to zero with the setv kernel; x is not read. */
+        dcomplex *zero = PASTEMAC(z, 0);
+
+        if(cntx == NULL) cntx = bli_gks_query_cntx(); // the kernel lookup below needs a context
+
+        /* Query the context for the setv kernel function pointer. */
+        const num_t dt = PASTEMAC(z, type);
+
+        PASTECH(z, setv_ker_ft)
+        setv_p = bli_cntx_get_l1v_ker_dt(dt, BLIS_SETV_KER, cntx);
+
+        setv_p
+        (
+            BLIS_NO_CONJUGATE,
+            n,
+            zero,
+            y, incy,
+            cntx
+        );
+
+        return;
+    }
+    else if (PASTEMAC(z, eq1)(*alpha))
+    {
+        /* If alpha is one, use copyv (conjx is forwarded to it). */
+        bli_zcopyv_zen_int
+        (
+            conjx,
+            n,
+            x, incx,
+            y, incy,
+            cntx
+        );
+
+        return;
+    }
+
+    dim_t i; // element counter; initialised by the first loop of whichever path is taken
+    dcomplex *x0 = x;
+    dcomplex *y0 = y;
+
+    double real = (*alpha).real; // "a" in the per-lane comments below
+    double imag = (*alpha).imag; // "b" in the per-lane comments below
+
+    if (bli_is_noconj(conjx))
+    {
+        if (incx == 1 && incy == 1)
+        {
+            __m256d temp[8], alpha_real, alpha_imag, x_vec[4];
+
+            alpha_real = _mm256_set1_pd(real);
+            alpha_imag = _mm256_set1_pd(imag);
+
+            const dim_t n_elem_per_reg = 2; // dcomplex elements per 256-bit register
+
+            for (i = 0; (i + 7) < n; i += 8) // 8 elements (4 registers) per iteration
+            {
+                x_vec[0] = _mm256_loadu_pd((double *)x0);
+                x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+                x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg));
+                x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg));
+
+                temp[0] = _mm256_mul_pd(x_vec[0], alpha_real); // ax, ay for each element
+                temp[1] = _mm256_mul_pd(x_vec[0], alpha_imag); // bx, by for each element
+                temp[2] = _mm256_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm256_mul_pd(x_vec[1], alpha_imag);
+                temp[4] = _mm256_mul_pd(x_vec[2], alpha_real);
+                temp[5] = _mm256_mul_pd(x_vec[2], alpha_imag);
+                temp[6] = _mm256_mul_pd(x_vec[3], alpha_real);
+                temp[7] = _mm256_mul_pd(x_vec[3], alpha_imag);
+
+                temp[1] = _mm256_permute_pd(temp[1], 0b0101); // swap within each pair: by, bx
+                temp[3] = _mm256_permute_pd(temp[3], 0b0101);
+                temp[5] = _mm256_permute_pd(temp[5], 0b0101);
+                temp[7] = _mm256_permute_pd(temp[7], 0b0101);
+
+                temp[0] = _mm256_addsub_pd(temp[0], temp[1]); // ax - by, ay + bx
+                temp[2] = _mm256_addsub_pd(temp[2], temp[3]);
+                temp[4] = _mm256_addsub_pd(temp[4], temp[5]);
+                temp[6] = _mm256_addsub_pd(temp[6], temp[7]);
+
+                _mm256_storeu_pd((double *)y0, temp[0]);
+                _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), temp[2]);
+                _mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), temp[4]);
+                _mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), temp[6]);
+
+                x0 += 8;
+                y0 += 8;
+            }
+
+            for (; (i + 3) < n; i += 4) // 4 elements (2 registers) per iteration
+            {
+                x_vec[0] = _mm256_loadu_pd((double *)x0);
+                x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+
+                temp[0] = _mm256_mul_pd(x_vec[0], alpha_real);
+                temp[1] = _mm256_mul_pd(x_vec[0], alpha_imag);
+                temp[2] = _mm256_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm256_mul_pd(x_vec[1], alpha_imag);
+
+                temp[1] = _mm256_permute_pd(temp[1], 0b0101);
+                temp[3] = _mm256_permute_pd(temp[3], 0b0101);
+
+                temp[0] = _mm256_addsub_pd(temp[0], temp[1]);
+                temp[2] = _mm256_addsub_pd(temp[2], temp[3]);
+
+                _mm256_storeu_pd((double *)y0, temp[0]);
+                _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), temp[2]);
+
+                x0 += 4;
+                y0 += 4;
+            }
+
+            for (; (i + 1) < n; i += 2) // 2 elements (1 register) per iteration
+            {
+                x_vec[0] = _mm256_loadu_pd((double *)x0);
+
+                temp[0] = _mm256_mul_pd(x_vec[0], alpha_real);
+                temp[1] = _mm256_mul_pd(x_vec[0], alpha_imag);
+
+                temp[1] = _mm256_permute_pd(temp[1], 0b0101);
+
+                temp[0] = _mm256_addsub_pd(temp[0], temp[1]);
+
+                _mm256_storeu_pd((double *)y0, temp[0]);
+
+                x0 += 2;
+                y0 += 2;
+            }
+            _mm256_zeroupper(); // done with 256-bit registers; the remainder loop below uses 128-bit ones
+        }
+        /* This else condition handles the computation when
+           incx != 1 or incy != 1 for no conjugate X cases */
+        else
+        {
+            /* One dcomplex element is two doubles (128 bits), so
+              even with non-unit strides each element can be
+              loaded and processed in one SSE register */
+            __m128d temp[8], alpha_real, alpha_imag, x_vec[4];
+
+            alpha_real = _mm_set1_pd(real);
+            alpha_imag = _mm_set1_pd(imag);
+
+            for (i = 0; (i + 3) < n; i += 4) // 4 strided elements per iteration
+            {
+                x_vec[0] = _mm_loadu_pd((double *)x0);
+                x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+                x_vec[2] = _mm_loadu_pd((double *)(x0 + 2 * incx));
+                x_vec[3] = _mm_loadu_pd((double *)(x0 + 3 * incx));
+
+                temp[0] = _mm_mul_pd(x_vec[0], alpha_real); // ax, ay
+                temp[1] = _mm_mul_pd(x_vec[0], alpha_imag); // bx, by
+                temp[2] = _mm_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm_mul_pd(x_vec[1], alpha_imag);
+                temp[4] = _mm_mul_pd(x_vec[2], alpha_real);
+                temp[5] = _mm_mul_pd(x_vec[2], alpha_imag);
+                temp[6] = _mm_mul_pd(x_vec[3], alpha_real);
+                temp[7] = _mm_mul_pd(x_vec[3], alpha_imag);
+
+                temp[1] = _mm_permute_pd(temp[1], 0b01); // by, bx
+                temp[3] = _mm_permute_pd(temp[3], 0b01);
+                temp[5] = _mm_permute_pd(temp[5], 0b01);
+                temp[7] = _mm_permute_pd(temp[7], 0b01);
+
+                temp[0] = _mm_addsub_pd(temp[0], temp[1]); // ax - by, ay + bx
+                temp[2] = _mm_addsub_pd(temp[2], temp[3]);
+                temp[4] = _mm_addsub_pd(temp[4], temp[5]);
+                temp[6] = _mm_addsub_pd(temp[6], temp[7]);
+
+                _mm_storeu_pd((double *)y0, temp[0]);
+                _mm_storeu_pd((double *)(y0 + incy), temp[2]);
+                _mm_storeu_pd((double *)(y0 + 2 * incy), temp[4]);
+                _mm_storeu_pd((double *)(y0 + 3 * incy), temp[6]);
+
+                x0 += 4 * incx;
+                y0 += 4 * incy;
+            }
+
+            for (; (i + 1) < n; i += 2) // 2 strided elements per iteration
+            {
+                x_vec[0] = _mm_loadu_pd((double *)x0);
+                x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+
+                temp[0] = _mm_mul_pd(x_vec[0], alpha_real);
+                temp[1] = _mm_mul_pd(x_vec[0], alpha_imag);
+                temp[2] = _mm_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm_mul_pd(x_vec[1], alpha_imag);
+
+                temp[1] = _mm_permute_pd(temp[1], 0b01);
+                temp[3] = _mm_permute_pd(temp[3], 0b01);
+
+                temp[0] = _mm_addsub_pd(temp[0], temp[1]);
+                temp[2] = _mm_addsub_pd(temp[2], temp[3]);
+
+                _mm_storeu_pd((double *)y0, temp[0]);
+                _mm_storeu_pd((double *)(y0 + incy), temp[2]);
+
+                x0 += 2 * incx;
+                y0 += 2 * incy;
+            }
+        }
+
+        /* Remainder loop shared by both stride cases above:
+            processes the leftover elements one dcomplex
+            (128 bits) at a time using SSE instructions */
+        __m128d temp[2], alpha_real, alpha_imag, x_vec[1];
+
+        alpha_real = _mm_set1_pd(real);
+        alpha_imag = _mm_set1_pd(imag);
+
+        for (; i < n; i++)
+        {
+            x_vec[0] = _mm_loadu_pd((double *)x0);
+
+            temp[0] = _mm_mul_pd(x_vec[0], alpha_real);
+            temp[1] = _mm_mul_pd(x_vec[0], alpha_imag);
+
+            temp[1] = _mm_permute_pd(temp[1], 0b01);
+
+            temp[0] = _mm_addsub_pd(temp[0], temp[1]);
+
+            _mm_storeu_pd((double *)y0, temp[0]);
+
+            x0 += incx;
+            y0 += incy;
+        }
+    }
+    /* This else condition handles the computation
+        for conjugate X cases, i.e. y := alpha * conj(x) */
+    else
+    {
+        if (incx == 1 && incy == 1)
+        {
+            __m256d temp[8], alpha_real, alpha_imag, x_vec[4];
+
+            alpha_real = _mm256_set1_pd(real);
+            alpha_imag = _mm256_set1_pd(imag);
+
+            const dim_t n_elem_per_reg = 2; // dcomplex elements per 256-bit register
+
+            for (i = 0; (i + 7) < n; i += 8) // 8 elements (4 registers) per iteration
+            {
+                x_vec[0] = _mm256_loadu_pd((double *)x0);
+                x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+                x_vec[2] = _mm256_loadu_pd((double *)(x0 + 2 * n_elem_per_reg));
+                x_vec[3] = _mm256_loadu_pd((double *)(x0 + 3 * n_elem_per_reg));
+
+                temp[0] = _mm256_mul_pd(x_vec[0], alpha_real); // ax, ay for each element
+                temp[1] = _mm256_mul_pd(x_vec[0], alpha_imag); // bx, by for each element
+                temp[2] = _mm256_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm256_mul_pd(x_vec[1], alpha_imag);
+                temp[4] = _mm256_mul_pd(x_vec[2], alpha_real);
+                temp[5] = _mm256_mul_pd(x_vec[2], alpha_imag);
+                temp[6] = _mm256_mul_pd(x_vec[3], alpha_real);
+                temp[7] = _mm256_mul_pd(x_vec[3], alpha_imag);
+
+                temp[0] = _mm256_permute_pd(temp[0], 0b0101); // swap within each pair: ay, ax
+                temp[2] = _mm256_permute_pd(temp[2], 0b0101);
+                temp[4] = _mm256_permute_pd(temp[4], 0b0101);
+                temp[6] = _mm256_permute_pd(temp[6], 0b0101);
+
+                temp[0] = _mm256_addsub_pd(temp[1], temp[0]); // bx - ay, by + ax
+                temp[2] = _mm256_addsub_pd(temp[3], temp[2]);
+                temp[4] = _mm256_addsub_pd(temp[5], temp[4]);
+                temp[6] = _mm256_addsub_pd(temp[7], temp[6]);
+
+                temp[0] = _mm256_permute_pd(temp[0], 0b0101); // swap back: ax + by (real), bx - ay (imag)
+                temp[2] = _mm256_permute_pd(temp[2], 0b0101);
+                temp[4] = _mm256_permute_pd(temp[4], 0b0101);
+                temp[6] = _mm256_permute_pd(temp[6], 0b0101);
+
+                _mm256_storeu_pd((double *)y0, temp[0]);
+                _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), temp[2]);
+                _mm256_storeu_pd((double *)(y0 + 2 * n_elem_per_reg), temp[4]);
+                _mm256_storeu_pd((double *)(y0 + 3 * n_elem_per_reg), temp[6]);
+
+                x0 += 8;
+                y0 += 8;
+            }
+
+            for (; (i + 3) < n; i += 4) // 4 elements (2 registers) per iteration
+            {
+                x_vec[0] = _mm256_loadu_pd((double *)x0);
+                x_vec[1] = _mm256_loadu_pd((double *)(x0 + n_elem_per_reg));
+
+                temp[0] = _mm256_mul_pd(x_vec[0], alpha_real);
+                temp[1] = _mm256_mul_pd(x_vec[0], alpha_imag);
+                temp[2] = _mm256_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm256_mul_pd(x_vec[1], alpha_imag);
+
+                temp[0] = _mm256_permute_pd(temp[0], 0b0101);
+                temp[2] = _mm256_permute_pd(temp[2], 0b0101);
+
+                temp[0] = _mm256_addsub_pd(temp[1], temp[0]);
+                temp[2] = _mm256_addsub_pd(temp[3], temp[2]);
+
+                temp[0] = _mm256_permute_pd(temp[0], 0b0101);
+                temp[2] = _mm256_permute_pd(temp[2], 0b0101);
+
+                _mm256_storeu_pd((double *)y0, temp[0]);
+                _mm256_storeu_pd((double *)(y0 + n_elem_per_reg), temp[2]);
+
+                x0 += 4;
+                y0 += 4;
+            }
+
+            for (; (i + 1) < n; i += 2) // 2 elements (1 register) per iteration
+            {
+                x_vec[0] = _mm256_loadu_pd((double *)x0);
+
+                temp[0] = _mm256_mul_pd(x_vec[0], alpha_real);
+                temp[1] = _mm256_mul_pd(x_vec[0], alpha_imag);
+
+                temp[0] = _mm256_permute_pd(temp[0], 0b0101);
+
+                temp[0] = _mm256_addsub_pd(temp[1], temp[0]);
+
+                temp[0] = _mm256_permute_pd(temp[0], 0b0101);
+
+                _mm256_storeu_pd((double *)y0, temp[0]);
+
+                x0 += 2;
+                y0 += 2;
+            }
+
+            _mm256_zeroupper(); // done with 256-bit registers; the remainder loop below uses 128-bit ones
+        }
+        /* This else condition handles the computation when
+           incx != 1 or incy != 1 for conjugate X cases */
+        else
+        {
+            /* One dcomplex element is two doubles (128 bits), so
+            even with non-unit strides each element can be
+            loaded and processed in one SSE register */
+            __m128d temp[8], alpha_real, alpha_imag, x_vec[4];
+
+            alpha_real = _mm_set1_pd(real);
+            alpha_imag = _mm_set1_pd(imag);
+
+            for (i = 0; (i + 3) < n; i += 4) // 4 strided elements per iteration
+            {
+                x_vec[0] = _mm_loadu_pd((double *)x0);
+                x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+                x_vec[2] = _mm_loadu_pd((double *)(x0 + 2 * incx));
+                x_vec[3] = _mm_loadu_pd((double *)(x0 + 3 * incx));
+
+                temp[0] = _mm_mul_pd(x_vec[0], alpha_real); // ax, ay
+                temp[1] = _mm_mul_pd(x_vec[0], alpha_imag); // bx, by
+                temp[2] = _mm_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm_mul_pd(x_vec[1], alpha_imag);
+                temp[4] = _mm_mul_pd(x_vec[2], alpha_real);
+                temp[5] = _mm_mul_pd(x_vec[2], alpha_imag);
+                temp[6] = _mm_mul_pd(x_vec[3], alpha_real);
+                temp[7] = _mm_mul_pd(x_vec[3], alpha_imag);
+
+                temp[0] = _mm_permute_pd(temp[0], 0b01); // ay, ax
+                temp[2] = _mm_permute_pd(temp[2], 0b01);
+                temp[4] = _mm_permute_pd(temp[4], 0b01);
+                temp[6] = _mm_permute_pd(temp[6], 0b01);
+
+                temp[0] = _mm_addsub_pd(temp[1], temp[0]); // bx - ay, by + ax
+                temp[2] = _mm_addsub_pd(temp[3], temp[2]);
+                temp[4] = _mm_addsub_pd(temp[5], temp[4]);
+                temp[6] = _mm_addsub_pd(temp[7], temp[6]);
+
+                temp[0] = _mm_permute_pd(temp[0], 0b01); // ax + by (real), bx - ay (imag)
+                temp[2] = _mm_permute_pd(temp[2], 0b01);
+                temp[4] = _mm_permute_pd(temp[4], 0b01);
+                temp[6] = _mm_permute_pd(temp[6], 0b01);
+
+                _mm_storeu_pd((double *)y0, temp[0]);
+                _mm_storeu_pd((double *)(y0 + incy), temp[2]);
+                _mm_storeu_pd((double *)(y0 + 2 * incy), temp[4]);
+                _mm_storeu_pd((double *)(y0 + 3 * incy), temp[6]);
+
+                x0 += 4 * incx;
+                y0 += 4 * incy;
+            }
+
+            for (; (i + 1) < n; i += 2) // 2 strided elements per iteration
+            {
+                x_vec[0] = _mm_loadu_pd((double *)x0);
+                x_vec[1] = _mm_loadu_pd((double *)(x0 + incx));
+
+                temp[0] = _mm_mul_pd(x_vec[0], alpha_real);
+                temp[1] = _mm_mul_pd(x_vec[0], alpha_imag);
+                temp[2] = _mm_mul_pd(x_vec[1], alpha_real);
+                temp[3] = _mm_mul_pd(x_vec[1], alpha_imag);
+
+                temp[0] = _mm_permute_pd(temp[0], 0b01);
+                temp[2] = _mm_permute_pd(temp[2], 0b01);
+
+                temp[0] = _mm_addsub_pd(temp[1], temp[0]);
+                temp[2] = _mm_addsub_pd(temp[3], temp[2]);
+
+                temp[0] = _mm_permute_pd(temp[0], 0b01);
+                temp[2] = _mm_permute_pd(temp[2], 0b01);
+
+                _mm_storeu_pd((double *)y0, temp[0]);
+                _mm_storeu_pd((double *)(y0 + incy), temp[2]);
+
+                x0 += 2 * incx;
+                y0 += 2 * incy;
+            }
+        }
+
+        /* Remainder loop shared by both stride cases above: leftover
+          elements are processed one dcomplex (128 bits) at a time */
+        __m128d temp[2], alpha_real, alpha_imag, x_vec[1];
+
+        alpha_real = _mm_set1_pd(real);
+        alpha_imag = _mm_set1_pd(imag);
+
+        for (; i < n; ++i)
+        {
+            x_vec[0] = _mm_loadu_pd((double *)x0);
+
+            temp[0] = _mm_mul_pd(x_vec[0], alpha_real);
+            temp[1] = _mm_mul_pd(x_vec[0], alpha_imag);
+
+            temp[0] = _mm_permute_pd(temp[0], 0b01);
+
+            temp[0] = _mm_addsub_pd(temp[1], temp[0]);
+
+            temp[0] = _mm_permute_pd(temp[0], 0b01);
+
+            _mm_storeu_pd((double *)y0, temp[0]);
+
+            x0 += incx;
+            y0 += incy;
+        }
+    }
+}
diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c
index 7146e86879..2d96d756c1 100644
--- a/kernels/zen/1/bli_scalv_zen_int10.c
+++ b/kernels/zen/1/bli_scalv_zen_int10.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2017 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2017 - 2023, Advanced Micro Devices, Inc. All rights reserved.
    Copyright (C) 2018, The University of Texas at Austin
 
    Redistribution and use in source and binary forms, with or without
@@ -578,3 +578,420 @@ void bli_dscalv_zen_int10
 	}
 }
 
+void bli_zdscalv_zen_int10
+     (
+       conj_t           conjalpha,           // not referenced in this kernel
+       dim_t            n,                   // number of dcomplex elements in x
+       dcomplex* restrict alpha,             // scalar; only its real part is read
+       dcomplex* restrict x, inc_t incx,     // vector scaled in place and its stride (in dcomplex elements)
+       cntx_t* restrict cntx                 // not referenced in this kernel
+     )
+{
+	dim_t i = 0;
+	const dim_t n_elem_per_reg = 4;    // number of doubles per 256-bit register (two dcomplex)
+
+	double* restrict x0 = (double*) x; // x viewed as interleaved real/imaginary doubles
+
+	/*
+		This kernel scales x in place by a real scalar: only
+		the real part of alpha is used, its imaginary part is
+		ignored. alpha is passed as double complex to adhere
+		to the function pointer definition in BLIS.
+	*/
+	const double alphac = (*alpha).real;
+
+	if ( incx == 1 )
+	{
+		__m256d alphav;
+		__m256d xv[15];
+
+		alphav = _mm256_broadcast_sd( &alphac );
+
+		for ( ; ( i + 29 ) < n; i += 30 ) // 30 dcomplex = 60 doubles = 15 registers per iteration
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4 * n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( x0 + 5 * n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( x0 + 6 * n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( x0 + 7 * n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( x0 + 8 * n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( x0 + 9 * n_elem_per_reg );
+			xv[10] = _mm256_loadu_pd( x0 + 10 * n_elem_per_reg );
+			xv[11] = _mm256_loadu_pd( x0 + 11 * n_elem_per_reg );
+			xv[12] = _mm256_loadu_pd( x0 + 12 * n_elem_per_reg );
+			xv[13] = _mm256_loadu_pd( x0 + 13 * n_elem_per_reg );
+			xv[14] = _mm256_loadu_pd( x0 + 14 * n_elem_per_reg );			
+
+			xv[0] = _mm256_mul_pd( alphav, xv[0] );
+			xv[1] = _mm256_mul_pd( alphav, xv[1] );
+			xv[2] = _mm256_mul_pd( alphav, xv[2] );
+			xv[3] = _mm256_mul_pd( alphav, xv[3] );
+			xv[4] = _mm256_mul_pd( alphav, xv[4] );
+			xv[5] = _mm256_mul_pd( alphav, xv[5] );
+			xv[6] = _mm256_mul_pd( alphav, xv[6] );
+			xv[7] = _mm256_mul_pd( alphav, xv[7] );
+			xv[8] = _mm256_mul_pd( alphav, xv[8] );
+			xv[9] = _mm256_mul_pd( alphav, xv[9] );
+			xv[10] = _mm256_mul_pd( alphav, xv[10] );
+			xv[11] = _mm256_mul_pd( alphav, xv[11] );
+			xv[12] = _mm256_mul_pd( alphav, xv[12] );
+			xv[13] = _mm256_mul_pd( alphav, xv[13] );
+			xv[14] = _mm256_mul_pd( alphav, xv[14] );
+			
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] );
+			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), xv[4] );
+			_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), xv[5] );
+			_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), xv[6] );
+			_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), xv[7] );
+			_mm256_storeu_pd( (x0 + 8*n_elem_per_reg), xv[8] );
+			_mm256_storeu_pd( (x0 + 9*n_elem_per_reg), xv[9] );
+			_mm256_storeu_pd( (x0 + 10*n_elem_per_reg), xv[10] );
+			_mm256_storeu_pd( (x0 + 11*n_elem_per_reg), xv[11] );
+			_mm256_storeu_pd( (x0 + 12*n_elem_per_reg), xv[12] );
+			_mm256_storeu_pd( (x0 + 13*n_elem_per_reg), xv[13] );
+			_mm256_storeu_pd( (x0 + 14*n_elem_per_reg), xv[14] );
+
+			x0 += 15 * n_elem_per_reg;
+		}
+
+		for ( ; ( i + 23 ) < n; i += 24 ) // 24 dcomplex = 12 registers per iteration
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4 * n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( x0 + 5 * n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( x0 + 6 * n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( x0 + 7 * n_elem_per_reg );
+			xv[8] = _mm256_loadu_pd( x0 + 8 * n_elem_per_reg );
+			xv[9] = _mm256_loadu_pd( x0 + 9 * n_elem_per_reg );
+			xv[10] = _mm256_loadu_pd( x0 + 10 * n_elem_per_reg );
+			xv[11] = _mm256_loadu_pd( x0 + 11 * n_elem_per_reg );
+
+			xv[0] = _mm256_mul_pd( alphav, xv[0] );
+			xv[1] = _mm256_mul_pd( alphav, xv[1] );
+			xv[2] = _mm256_mul_pd( alphav, xv[2] );
+			xv[3] = _mm256_mul_pd( alphav, xv[3] );
+			xv[4] = _mm256_mul_pd( alphav, xv[4] );
+			xv[5] = _mm256_mul_pd( alphav, xv[5] );
+			xv[6] = _mm256_mul_pd( alphav, xv[6] );
+			xv[7] = _mm256_mul_pd( alphav, xv[7] );
+			xv[8] = _mm256_mul_pd( alphav, xv[8] );
+			xv[9] = _mm256_mul_pd( alphav, xv[9] );
+			xv[10] = _mm256_mul_pd( alphav, xv[10] );
+			xv[11] = _mm256_mul_pd( alphav, xv[11] );
+			
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] );
+			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), xv[4] );
+			_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), xv[5] );
+			_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), xv[6] );
+			_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), xv[7] );
+			_mm256_storeu_pd( (x0 + 8*n_elem_per_reg), xv[8] );
+			_mm256_storeu_pd( (x0 + 9*n_elem_per_reg), xv[9] );
+			_mm256_storeu_pd( (x0 + 10*n_elem_per_reg), xv[10] );
+			_mm256_storeu_pd( (x0 + 11*n_elem_per_reg), xv[11] );
+
+			x0 += 12 * n_elem_per_reg;
+		}
+
+		for ( ; ( i + 15 ) < n; i += 16 ) // 16 dcomplex = 8 registers per iteration
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg );
+			xv[4] = _mm256_loadu_pd( x0 + 4 * n_elem_per_reg );
+			xv[5] = _mm256_loadu_pd( x0 + 5 * n_elem_per_reg );
+			xv[6] = _mm256_loadu_pd( x0 + 6 * n_elem_per_reg );
+			xv[7] = _mm256_loadu_pd( x0 + 7 * n_elem_per_reg );
+
+			xv[0] = _mm256_mul_pd( alphav, xv[0] );
+			xv[1] = _mm256_mul_pd( alphav, xv[1] );
+			xv[2] = _mm256_mul_pd( alphav, xv[2] );
+			xv[3] = _mm256_mul_pd( alphav, xv[3] );
+			xv[4] = _mm256_mul_pd( alphav, xv[4] );
+			xv[5] = _mm256_mul_pd( alphav, xv[5] );
+			xv[6] = _mm256_mul_pd( alphav, xv[6] );
+			xv[7] = _mm256_mul_pd( alphav, xv[7] );
+			
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] );
+			_mm256_storeu_pd( (x0 + 4*n_elem_per_reg), xv[4] );
+			_mm256_storeu_pd( (x0 + 5*n_elem_per_reg), xv[5] );
+			_mm256_storeu_pd( (x0 + 6*n_elem_per_reg), xv[6] );
+			_mm256_storeu_pd( (x0 + 7*n_elem_per_reg), xv[7] );
+
+			x0 += 8 * n_elem_per_reg;
+		}
+
+		for ( ; ( i + 7 ) < n; i += 8 ) // 8 dcomplex = 4 registers per iteration
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg );
+			xv[2] = _mm256_loadu_pd( x0 + 2 * n_elem_per_reg );
+			xv[3] = _mm256_loadu_pd( x0 + 3 * n_elem_per_reg );
+
+			xv[0] = _mm256_mul_pd( alphav, xv[0] );
+			xv[1] = _mm256_mul_pd( alphav, xv[1] );
+			xv[2] = _mm256_mul_pd( alphav, xv[2] );
+			xv[3] = _mm256_mul_pd( alphav, xv[3] );
+			
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] );
+			_mm256_storeu_pd( (x0 + 2*n_elem_per_reg), xv[2] );
+			_mm256_storeu_pd( (x0 + 3*n_elem_per_reg), xv[3] );
+
+			x0 += 4 * n_elem_per_reg;
+		}
+
+		for ( ; ( i + 3 ) < n; i += 4 ) // 4 dcomplex = 2 registers per iteration
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg );
+			xv[1] = _mm256_loadu_pd( x0 + 1 * n_elem_per_reg );
+
+			xv[0] = _mm256_mul_pd( alphav, xv[0] );
+			xv[1] = _mm256_mul_pd( alphav, xv[1] );
+			
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] );
+			_mm256_storeu_pd( (x0 + 1*n_elem_per_reg), xv[1] );
+
+			x0 += 2 * n_elem_per_reg;
+		}
+
+		for ( ; ( i + 1 ) < n; i += 2 ) // 2 dcomplex = 1 register per iteration
+		{
+			xv[0] = _mm256_loadu_pd( x0 + 0 * n_elem_per_reg );
+
+			xv[0] = _mm256_mul_pd( alphav, xv[0] );
+
+			_mm256_storeu_pd( (x0 + 0*n_elem_per_reg), xv[0] );
+
+			x0 += 1 * n_elem_per_reg;
+		}
+
+		for ( ; i < n; i++ ) // scalar cleanup for the last leftover element
+		{
+			( *x0 )			*= alphac;
+			( *( x0 + 1 ) ) *= alphac;
+
+			x0 += 2 * incx;
+		}
+
+		// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+		// This avoids a performance penalty caused by false dependencies when
+		// transitioning from AVX to SSE instructions (which may occur in
+		// code executed after this kernel returns, especially if BLIS is
+		// compiled with -mfpmath=sse).
+		_mm256_zeroupper();
+	}
+	else // non-unit stride: scalar code, one dcomplex (real and imaginary part) per iteration
+	{
+		for ( ; i < n; ++i )
+		{
+			( *x0 )			*= alphac;
+			( *( x0 + 1 ) ) *= alphac;
+
+			x0 += 2 * incx;
+		}
+	}
+}
+
+void bli_zscalv_zen_int
+	(
+	conj_t           conjalpha,           // whether alpha is conjugated before scaling
+	dim_t            n,                   // number of dcomplex elements in x
+	dcomplex* restrict alpha,             // complex scalar multiplier
+	dcomplex* restrict x, inc_t incx,     // vector scaled in place and its stride (in dcomplex elements)
+	cntx_t* restrict cntx                 // only forwarded to setv in the alpha == 0 path
+	)
+{
+	// Computes x := conjalpha(alpha) * x in place for a double-complex vector.
+	// If the vector dimension is zero, or if alpha is unit, return early.
+	if (bli_zero_dim1(n) || PASTEMAC(z, eq1)(*alpha))
+		return;
+
+	if (PASTEMAC(z, eq0)(*alpha))
+	{
+		// Expert interface of setv is invoked when alpha is zero
+		dcomplex *zero = PASTEMAC(z, 0);
+
+		/* When alpha is zero all the elements in x are set to zero */
+		PASTEMAC2(z, setv, BLIS_TAPI_EX_SUF)
+		(
+			BLIS_NO_CONJUGATE,
+			n,
+			zero,
+			x, incx,
+			cntx,
+			NULL
+		);
+
+		return;
+	}
+
+	dim_t i = 0;
+	dcomplex alpha_conj;
+	double *x0 = (double *)x; // x viewed as interleaved real/imaginary doubles
+
+	// Copies alpha into alpha_conj, conjugating it when conjalpha requests it
+	PASTEMAC(z, copycjs)(conjalpha, *alpha, alpha_conj)
+
+	double real = alpha_conj.real;
+	double imag = alpha_conj.imag;
+
+	/* When incx is 1 and n >= 2, two elements fit in one 256-bit register */
+	if (incx == 1 && n >= 2)
+	{
+		dim_t const n_elem_per_reg = 4; // doubles per 256-bit register (two dcomplex)
+
+		__m256d alpha_real_ymm, alpha_imag_ymm;
+
+		alpha_real_ymm = _mm256_broadcast_sd(&real);
+		alpha_imag_ymm = _mm256_broadcast_sd(&imag);
+
+		__m256d x_vec_ymm[4], temp_ymm[8];
+
+		/*  Code logic
+
+			Consider,
+			x1= a1 + ib1, x2 = a2 + ib2
+			alpha = p + iq
+
+			Vector values
+			x_vec_ymm = a1, b1, a2, b2
+			alpha_real_ymm = p, p, p, p
+			alpha_imag_ymm = q, q, q, q
+
+			Computation
+
+			Products with the real part of alpha
+			temp_1 = x_vec_ymm * alpha_real_ymm = a1p, b1p, a2p, b2p
+
+			Products with the imaginary part of alpha
+			temp_2 = x_vec_ymm * alpha_imag_ymm = a1q, b1q, a2q, b2q
+
+			permute temp_2 to get
+
+			b1q, a1q, b2q, a2q
+
+			addsub temp_1 and temp_2 to get the final result
+			a1p - b1q, b1p + a1q, a2p - b2q, b2p + a2q and then store
+		*/
+
+		for (; (i + 7) < n; i += 8) // 8 dcomplex (4 registers) per iteration
+		{
+			x_vec_ymm[0] = _mm256_loadu_pd(x0);
+			x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
+			x_vec_ymm[2] = _mm256_loadu_pd(x0 + 2 * n_elem_per_reg);
+			x_vec_ymm[3] = _mm256_loadu_pd(x0 + 3 * n_elem_per_reg);
+
+			temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_real_ymm);
+			temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm);
+			temp_ymm[2] = _mm256_mul_pd(x_vec_ymm[1], alpha_real_ymm);
+			temp_ymm[3] = _mm256_mul_pd(x_vec_ymm[1], alpha_imag_ymm);
+			temp_ymm[4] = _mm256_mul_pd(x_vec_ymm[2], alpha_real_ymm);
+			temp_ymm[5] = _mm256_mul_pd(x_vec_ymm[2], alpha_imag_ymm);
+			temp_ymm[6] = _mm256_mul_pd(x_vec_ymm[3], alpha_real_ymm);
+			temp_ymm[7] = _mm256_mul_pd(x_vec_ymm[3], alpha_imag_ymm);
+
+			temp_ymm[1] = _mm256_permute_pd(temp_ymm[1], 0b0101);
+			temp_ymm[3] = _mm256_permute_pd(temp_ymm[3], 0b0101);
+			temp_ymm[5] = _mm256_permute_pd(temp_ymm[5], 0b0101);
+			temp_ymm[7] = _mm256_permute_pd(temp_ymm[7], 0b0101);
+
+			/*
+				a[i+63:i] := b[i+63:i] - c[i+63:i] for even (0-based) lanes, i.e. real parts
+				a[i+63:i] := b[i+63:i] + c[i+63:i] for odd (0-based) lanes, i.e. imaginary parts
+			*/
+			temp_ymm[0] = _mm256_addsub_pd(temp_ymm[0], temp_ymm[1]);
+			temp_ymm[2] = _mm256_addsub_pd(temp_ymm[2], temp_ymm[3]);
+			temp_ymm[4] = _mm256_addsub_pd(temp_ymm[4], temp_ymm[5]);
+			temp_ymm[6] = _mm256_addsub_pd(temp_ymm[6], temp_ymm[7]);
+
+			_mm256_storeu_pd(x0, temp_ymm[0]);
+			_mm256_storeu_pd(x0 + n_elem_per_reg, temp_ymm[2]);
+			_mm256_storeu_pd(x0 + 2 * n_elem_per_reg, temp_ymm[4]);
+			_mm256_storeu_pd(x0 + 3 * n_elem_per_reg, temp_ymm[6]);
+
+			x0 += 4 * n_elem_per_reg;
+		}
+
+		for (; (i + 3) < n; i += 4) // 4 dcomplex (2 registers) per iteration
+		{
+			x_vec_ymm[0] = _mm256_loadu_pd(x0);
+			x_vec_ymm[1] = _mm256_loadu_pd(x0 + n_elem_per_reg);
+
+			temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_real_ymm);
+			temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm);
+			temp_ymm[2] = _mm256_mul_pd(x_vec_ymm[1], alpha_real_ymm);
+			temp_ymm[3] = _mm256_mul_pd(x_vec_ymm[1], alpha_imag_ymm);
+
+			temp_ymm[1] = _mm256_permute_pd(temp_ymm[1], 0b0101);
+			temp_ymm[3] = _mm256_permute_pd(temp_ymm[3], 0b0101);
+
+			temp_ymm[0] = _mm256_addsub_pd(temp_ymm[0], temp_ymm[1]);
+			temp_ymm[2] = _mm256_addsub_pd(temp_ymm[2], temp_ymm[3]);
+
+			_mm256_storeu_pd(x0, temp_ymm[0]);
+			_mm256_storeu_pd(x0 + n_elem_per_reg, temp_ymm[2]);
+
+			x0 += 2 * n_elem_per_reg;
+		}
+
+		for (; (i + 1) < n; i += 2) // 2 dcomplex (1 register) per iteration
+		{
+			x_vec_ymm[0] = _mm256_loadu_pd(x0);
+
+			temp_ymm[0] = _mm256_mul_pd(x_vec_ymm[0], alpha_real_ymm);
+			temp_ymm[1] = _mm256_mul_pd(x_vec_ymm[0], alpha_imag_ymm);
+
+			temp_ymm[1] = _mm256_permute_pd(temp_ymm[1], 0b0101);
+
+			temp_ymm[0] = _mm256_addsub_pd(temp_ymm[0], temp_ymm[1]);
+
+			_mm256_storeu_pd(x0, temp_ymm[0]);
+
+			x0 += n_elem_per_reg;
+		}
+	}
+
+	// Issue vzeroupper instruction to clear upper lanes of ymm registers.
+	// This avoids a performance penalty caused by false dependencies when
+	// transitioning from AVX to SSE instructions (which may occur later,
+	// especially if BLIS is compiled with -mfpmath=sse). Runs on every path.
+	_mm256_zeroupper();
+
+	/* Remaining elements (all of them when incx != 1 or n < 2) are handled one
+	dcomplex at a time; the *_ymm names below hold 128-bit __m128d values */
+	__m128d temp_ymm[2], alpha_real_ymm, alpha_imag_ymm, x_vec_ymm;
+
+	alpha_real_ymm = _mm_set1_pd(real);
+	alpha_imag_ymm = _mm_set1_pd(imag);
+
+	for (; i < n; i++)
+	{
+		x_vec_ymm = _mm_loadu_pd(x0);
+
+		temp_ymm[0] = _mm_mul_pd(x_vec_ymm, alpha_real_ymm);
+		temp_ymm[1] = _mm_mul_pd(x_vec_ymm, alpha_imag_ymm);
+
+		temp_ymm[1] = _mm_permute_pd(temp_ymm[1], 0b01);
+
+		temp_ymm[0] = _mm_addsub_pd(temp_ymm[0], temp_ymm[1]);
+
+		_mm_storeu_pd(x0, temp_ymm[0]);
+
+		x0 += 2 * incx;
+	}
+}
diff --git a/kernels/zen/1f/CMakeLists.txt b/kernels/zen/1f/CMakeLists.txt
index 3a77f69ef1..5da0c9e7b0 100644
--- a/kernels/zen/1f/CMakeLists.txt
+++ b/kernels/zen/1f/CMakeLists.txt
@@ -1,12 +1,16 @@
 ##Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}"
-     PRIVATE
+add_library(zen_1f
+     OBJECT
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_8.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxf_zen_int_8.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_5.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_4.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyf_zen_int_6.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpy2v_zen_int.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c           
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotxaxpyf_zen_int_8.c
     )
+target_compile_options(zen_1f PRIVATE /arch:AVX2)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen_1f PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/zen/1f/bli_axpy2v_zen_int.c b/kernels/zen/1f/bli_axpy2v_zen_int.c
index cba0141376..ba92066a43 100644
--- a/kernels/zen/1f/bli_axpy2v_zen_int.c
+++ b/kernels/zen/1f/bli_axpy2v_zen_int.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2018, The University of Texas at Austin
-   Copyright (C) 2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -539,8 +539,8 @@ void bli_zaxpy2v_zen_int
 
         // Issue vzeroupper instruction to clear upper lanes of ymm registers.
         // This avoids a performance penalty caused by false dependencies when
-        // transitioning from from AVX to SSE instructions (which may occur
-        // as soon as the n_left cleanup loop below if BLIS is compiled with
+        // transitioning from AVX to SSE instructions (which may occur as soon
+        // as the n_left cleanup loop below if BLIS is compiled with
         // -mfpmath=sse).
         _mm256_zeroupper();
 
@@ -718,4 +718,4 @@ void bli_zaxpy2v_zen_int
         }
     }
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
-}
\ No newline at end of file
+}
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c
index bb24e6c52f..36d94712aa 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_4.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -304,106 +304,154 @@ void bli_zaxpyf_zen_int_4
        cntx_t* restrict cntx
      )
 {
-    inc_t fuse_fac = 4;
-    inc_t i;
-
-    v4df_t ymm0, ymm1, ymm2, ymm3;
-    v4df_t ymm4, ymm5, ymm6, ymm7;
-    v4df_t ymm8, ymm10;
-    v4df_t ymm12, ymm13;
-
-    double* ap[4];
-    double* y0 = (double*)y;
-
-    dcomplex            chi0;
-    dcomplex            chi1;
-    dcomplex            chi2;
-    dcomplex            chi3;
-
-    dim_t setPlusOne = 1;
+    dim_t fuse_fac = 4;
 
-    if ( bli_is_conj(conja) )
-    {
-        setPlusOne = -1;
-    }
     // If either dimension is zero, or if alpha is zero, return early.
     if ( bli_zero_dim2( m, b_n ) || bli_zeq0( *alpha ) ) return;
 
     // If b_n is not equal to the fusing factor, then perform the entire
     // operation as a loop over axpyv.
-    if ( b_n != fuse_fac )
+    if (b_n != fuse_fac)
     {
-        zaxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DCOMPLEX, BLIS_AXPYV_KER, cntx );
+        __m128d x_vec, alpha_real, alpha_imag, temp[2];
 
-        for ( i = 0; i < b_n; ++i )
+        alpha_real = _mm_set1_pd(((*alpha).real));
+        alpha_imag = _mm_set1_pd(((*alpha).imag));
+
+        for (dim_t i = 0; i < b_n; ++i)
         {
-            dcomplex* a1   = a + (0  )*inca + (i  )*lda;
-            dcomplex* chi1 = x + (i  )*incx;
-            dcomplex* y1   = y + (0  )*incy;
-            dcomplex  alpha_chi1;
+            dcomplex *a1 = a + (0) * inca + (i)*lda;
+            dcomplex *chi1 = x + (i)*incx;
+            dcomplex *y1 = y + (0) * incy;
+            dcomplex alpha_chi1;
 
-            bli_zcopycjs( conjx, *chi1, alpha_chi1 );
-            bli_zscals( *alpha, alpha_chi1 );
+            // Vectorization of scaling X by alpha
+            x_vec = _mm_loadu_pd((double *)chi1);
 
-            f
+            if (bli_is_conj(conjx))
+            {
+                __m128d identity;
+
+                identity = _mm_setr_pd(1, -1);
+
+                x_vec = _mm_mul_pd(x_vec, identity);
+            }
+
+            temp[0] = _mm_mul_pd(x_vec, alpha_real);
+            temp[1] = _mm_mul_pd(x_vec, alpha_imag);
+
+            temp[1] = _mm_permute_pd(temp[1], 0b01);
+
+            temp[0] = _mm_addsub_pd(temp[0], temp[1]);
+
+            _mm_storeu_pd((double *)&alpha_chi1, temp[0]);
+
+            bli_zaxpyv_zen_int5
             (
-              conja,
-              m,
-              &alpha_chi1,
-              a1, inca,
-              y1, incy,
-              cntx
+                conja,
+                m,
+                &alpha_chi1,
+                a1, inca,
+                y1, incy,
+                cntx
             );
         }
 
         return;
     }
 
+    // A prefetch distance used inside the main loop
+    const dim_t distance = 32;
 
-    // At this point, we know that b_n is exactly equal to the fusing factor.
-    if(bli_is_noconj(conjx))
+    dcomplex chi0 = *(x + 0 * incx);
+    dcomplex chi1 = *(x + 1 * incx);
+    dcomplex chi2 = *(x + 2 * incx);
+    dcomplex chi3 = *(x + 3 * incx);
+
+    /* Alpha scaling of X can be vectorized
+       irrespective of incx, and is skipped
+       when alpha is 1 */
+    __m128d x_vec[8], alpha_real, alpha_imag, temp[8];
+
+    x_vec[0] = _mm_loadu_pd((double *)&chi0);
+    x_vec[1] = _mm_loadu_pd((double *)&chi1);
+    x_vec[2] = _mm_loadu_pd((double *)&chi2);
+    x_vec[3] = _mm_loadu_pd((double *)&chi3);
+
+    if (bli_is_conj(conjx))
     {
-        chi0 = *( x + 0*incx );
-        chi1 = *( x + 1*incx );
-        chi2 = *( x + 2*incx );
-        chi3 = *( x + 3*incx );
+        __m128d identity;
+
+        identity = _mm_setr_pd(1, -1);
+
+        x_vec[0] = _mm_mul_pd(x_vec[0], identity);
+        x_vec[1] = _mm_mul_pd(x_vec[1], identity);
+        x_vec[2] = _mm_mul_pd(x_vec[2], identity);
+        x_vec[3] = _mm_mul_pd(x_vec[3], identity);
+    }
+
+    if (!(bli_zeq1(*alpha)))
+    {
+        alpha_real = _mm_set1_pd(((*alpha).real));
+        alpha_imag = _mm_set1_pd(((*alpha).imag));
+
+        temp[0] = _mm_mul_pd(x_vec[0], alpha_real);
+        temp[1] = _mm_mul_pd(x_vec[0], alpha_imag);
+        temp[2] = _mm_mul_pd(x_vec[1], alpha_real);
+        temp[3] = _mm_mul_pd(x_vec[1], alpha_imag);
+        temp[4] = _mm_mul_pd(x_vec[2], alpha_real);
+        temp[5] = _mm_mul_pd(x_vec[2], alpha_imag);
+        temp[6] = _mm_mul_pd(x_vec[3], alpha_real);
+        temp[7] = _mm_mul_pd(x_vec[3], alpha_imag);
+
+        temp[1] = _mm_permute_pd(temp[1], 0b01);
+        temp[3] = _mm_permute_pd(temp[3], 0b01);
+        temp[5] = _mm_permute_pd(temp[5], 0b01);
+        temp[7] = _mm_permute_pd(temp[7], 0b01);
+
+        temp[0] = _mm_addsub_pd(temp[0], temp[1]);
+        temp[2] = _mm_addsub_pd(temp[2], temp[3]);
+        temp[4] = _mm_addsub_pd(temp[4], temp[5]);
+        temp[6] = _mm_addsub_pd(temp[6], temp[7]);
+
+        _mm_storeu_pd((double *)&chi0, temp[0]);
+        _mm_storeu_pd((double *)&chi1, temp[2]);
+        _mm_storeu_pd((double *)&chi2, temp[4]);
+        _mm_storeu_pd((double *)&chi3, temp[6]);
     }
     else
     {
-        dcomplex *pchi0 = x + 0*incx ;
-        dcomplex *pchi1 = x + 1*incx ;
-        dcomplex *pchi2 = x + 2*incx ;
-        dcomplex *pchi3 = x + 3*incx ;
-
-        bli_zcopycjs( conjx, *pchi0, chi0 );
-        bli_zcopycjs( conjx, *pchi1, chi1 );
-        bli_zcopycjs( conjx, *pchi2, chi2 );
-        bli_zcopycjs( conjx, *pchi3, chi3 );
+        _mm_storeu_pd((double *)&chi0, x_vec[0]);
+        _mm_storeu_pd((double *)&chi1, x_vec[1]);
+        _mm_storeu_pd((double *)&chi2, x_vec[2]);
+        _mm_storeu_pd((double *)&chi3, x_vec[3]);
     }
 
-    // Scale each chi scalar by alpha.
-    bli_zscals( *alpha, chi0 );
-    bli_zscals( *alpha, chi1 );
-    bli_zscals( *alpha, chi2 );
-    bli_zscals( *alpha, chi3 );
+    dim_t i = 0;
 
-    lda *= 2;
-    incx *= 2;
-    incy *= 2;
-    inca *= 2;
+    double *a_ptr[4];
+    double *y0 = (double *)y;
 
-    ap[0] = (double*)a;
-    ap[1] = (double*)a + lda;
-    ap[2] = ap[1] + lda;
-    ap[3] = ap[2] + lda;
+    a_ptr[0] = (double *)a;
+    a_ptr[1] = (double *)a + 2 * lda;
+    a_ptr[2] = a_ptr[1] + 2 * lda;
+    a_ptr[3] = a_ptr[2] + 2 * lda;
 
-    if( inca == 2 && incy == 2 )
+
+    // Prefetch the start of each column of A with the T1 hint (nominally L2 and higher, not L1).
+    // These elements are needed by both the AVX2 and the SSE code paths below.
+    _mm_prefetch(a_ptr[0], _MM_HINT_T1);
+    _mm_prefetch(a_ptr[1], _MM_HINT_T1);
+    _mm_prefetch(a_ptr[2], _MM_HINT_T1);
+    _mm_prefetch(a_ptr[3], _MM_HINT_T1);
+
+    if (inca == 1 && incy == 1)
     {
-        inc_t n1 = m >> 1; // Divide by 2
-        inc_t n2 = m & 1; // % 2
 
-        ymm12.v = _mm256_setzero_pd();
-        ymm13.v = _mm256_setzero_pd();
+        v4df_t ymm0, ymm1, ymm2, ymm3;
+        v4df_t ymm4, ymm5, ymm6, ymm7;
+        v4df_t ymm8, ymm10;
+        v4df_t ymm12, ymm13, ymm14, ymm15;
 
         // broadcast real & imag parts of 4 elements of x
         ymm0.v = _mm256_broadcast_sd(&chi0.real); // real part of x0
@@ -415,114 +463,358 @@ void bli_zaxpyf_zen_int_4
         ymm6.v = _mm256_broadcast_sd(&chi3.real); // real part of x3
         ymm7.v = _mm256_broadcast_sd(&chi3.imag); // imag part of x3
 
-
-        for(i = 0; i < n1; i++)
+        if (bli_is_noconj(conja))
         {
-            //load first two columns of A
-            ymm8.v  = _mm256_loadu_pd(ap[0] + 0); // 2 complex values form a0
-            ymm10.v = _mm256_loadu_pd(ap[1] + 0); // 2 complex values form a0
 
-            ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
-            ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
+            for (; (i + 3) < m; i += 4)
+            {
+                // load first two columns of A
+                ymm8.v = _mm256_loadu_pd(a_ptr[0]);  // 2 complex values from a0
+                ymm10.v = _mm256_loadu_pd(a_ptr[1]); // 2 complex values from a0
+                // load 3rd and 4th columns of A
+                ymm14.v = _mm256_loadu_pd(a_ptr[2]);
+                ymm15.v = _mm256_loadu_pd(a_ptr[3]);
 
-            ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
-            ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
+                // Multiply the loaded columns of A by X
+                ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
+                ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
 
-            //load 3rd and 4th columns of A
-            ymm8.v = _mm256_loadu_pd(ap[2] + 0);
-            ymm10.v = _mm256_loadu_pd(ap[3] + 0);
+                ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
 
-            ymm12.v = _mm256_fmadd_pd(ymm8.v, ymm4.v, ymm12.v);
-            ymm13.v = _mm256_fmadd_pd(ymm8.v, ymm5.v, ymm13.v);
+                _mm_prefetch(a_ptr[0] + distance, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[1] + distance, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[2] + distance, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[3] + distance, _MM_HINT_T1);
 
-            ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm6.v, ymm12.v);
-            ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm7.v, ymm13.v);
+                ymm12.v = _mm256_fmadd_pd(ymm14.v, ymm4.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm14.v, ymm5.v, ymm13.v);
 
-            //load Y vector
-            ymm10.v = _mm256_loadu_pd(y0 + 0);
+                _mm_prefetch(y0 + distance, _MM_HINT_T1);
 
-            if(bli_is_noconj(conja))
+                ymm12.v = _mm256_fmadd_pd(ymm15.v, ymm6.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm15.v, ymm7.v, ymm13.v);
+
+                // load Y vector
+                ymm10.v = _mm256_loadu_pd(y0);
+
+                // Permute and combine the real and imaginary partial products
+                ymm13.v = _mm256_permute_pd(ymm13.v, 5);
+                ymm8.v = _mm256_addsub_pd(ymm12.v, ymm13.v);
+
+                ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
+
+                _mm256_storeu_pd((double *)(y0), ymm12.v);
+
+                // load first two columns of A
+                ymm8.v = _mm256_loadu_pd(a_ptr[0] + 4);  // 2 complex values from a0
+                ymm10.v = _mm256_loadu_pd(a_ptr[1] + 4); // 2 complex values from a0
+                // load 3rd and 4th columns of A
+                ymm14.v = _mm256_loadu_pd(a_ptr[2] + 4);
+                ymm15.v = _mm256_loadu_pd(a_ptr[3] + 4);
+
+                ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
+                ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
+
+                ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
+
+                _mm_prefetch(a_ptr[0] + distance * 2, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[1] + distance * 2, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[2] + distance * 2, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[3] + distance * 2, _MM_HINT_T1);
+
+                ymm12.v = _mm256_fmadd_pd(ymm14.v, ymm4.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm14.v, ymm5.v, ymm13.v);
+
+                _mm_prefetch(y0 + distance * 2, _MM_HINT_T1);
+
+                ymm12.v = _mm256_fmadd_pd(ymm15.v, ymm6.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm15.v, ymm7.v, ymm13.v);
+
+                // load Y vector
+                ymm10.v = _mm256_loadu_pd(y0 + 4);
+
+                ymm13.v = _mm256_permute_pd(ymm13.v, 5);
+                ymm8.v = _mm256_addsub_pd(ymm12.v, ymm13.v);
+
+                ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
+
+                _mm256_storeu_pd((double *)(y0 + 4), ymm12.v);
+
+                y0 += 8;
+                a_ptr[0] += 8;
+                a_ptr[1] += 8;
+                a_ptr[2] += 8;
+                a_ptr[3] += 8;
+            }
+
+            for (; (i + 1) < m; i += 2)
             {
+                // load first two columns of A
+                ymm8.v = _mm256_loadu_pd(a_ptr[0]);  // 2 complex values from a0
+                ymm10.v = _mm256_loadu_pd(a_ptr[1]); // 2 complex values from a0
+                // load 3rd and 4th columns of A
+                ymm14.v = _mm256_loadu_pd(a_ptr[2]);
+                ymm15.v = _mm256_loadu_pd(a_ptr[3]);
+
+                ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
+                ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
+
+                ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
+
+                ymm12.v = _mm256_fmadd_pd(ymm14.v, ymm4.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm14.v, ymm5.v, ymm13.v);
+
+                ymm12.v = _mm256_fmadd_pd(ymm15.v, ymm6.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm15.v, ymm7.v, ymm13.v);
+
+                // load Y vector
+                ymm10.v = _mm256_loadu_pd(y0);
+
                 ymm13.v = _mm256_permute_pd(ymm13.v, 5);
                 ymm8.v = _mm256_addsub_pd(ymm12.v, ymm13.v);
+
+                ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
+
+                _mm256_storeu_pd((double *)(y0), ymm12.v);
+
+                y0 += 4;
+                a_ptr[0] += 4;
+                a_ptr[1] += 4;
+                a_ptr[2] += 4;
+                a_ptr[3] += 4;
             }
-            else
+        }
+        else
+        {
+
+            for (; (i + 3) < m; i += 4)
             {
+                // load first two columns of A
+                ymm8.v = _mm256_loadu_pd(a_ptr[0]);  // 2 complex values from a0
+                ymm10.v = _mm256_loadu_pd(a_ptr[1]); // 2 complex values from a0
+                // load 3rd and 4th columns of A
+                ymm14.v = _mm256_loadu_pd(a_ptr[2]);
+                ymm15.v = _mm256_loadu_pd(a_ptr[3]);
+
+                ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
+                ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
+
+                ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
+
+                _mm_prefetch(a_ptr[0] + distance, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[1] + distance, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[2] + distance, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[3] + distance, _MM_HINT_T1);
+
+                ymm12.v = _mm256_fmadd_pd(ymm14.v, ymm4.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm14.v, ymm5.v, ymm13.v);
+
+                _mm_prefetch(y0 + distance, _MM_HINT_T1);
+
+                ymm12.v = _mm256_fmadd_pd(ymm15.v, ymm6.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm15.v, ymm7.v, ymm13.v);
+
+                // load Y vector
+                ymm10.v = _mm256_loadu_pd(y0);
+
+                ymm12.v = _mm256_permute_pd(ymm12.v, 5);
+                ymm8.v = _mm256_addsub_pd(ymm13.v, ymm12.v);
+                ymm8.v = _mm256_permute_pd(ymm8.v, 5);
+
+                ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
+
+                _mm256_storeu_pd((double *)(y0), ymm12.v);
+
+                ymm8.v = _mm256_loadu_pd(a_ptr[0] + 4);  // 2 complex values from a0
+                ymm10.v = _mm256_loadu_pd(a_ptr[1] + 4); // 2 complex values from a0
+                // load 3rd and 4th columns of A
+                ymm14.v = _mm256_loadu_pd(a_ptr[2] + 4);
+                ymm15.v = _mm256_loadu_pd(a_ptr[3] + 4);
+
+                ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
+                ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
+
+                ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
+
+                _mm_prefetch(a_ptr[0] + distance * 2, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[1] + distance * 2, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[2] + distance * 2, _MM_HINT_T1);
+                _mm_prefetch(a_ptr[3] + distance * 2, _MM_HINT_T1);
+
+                ymm12.v = _mm256_fmadd_pd(ymm14.v, ymm4.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm14.v, ymm5.v, ymm13.v);
+
+                _mm_prefetch(y0 + distance * 2, _MM_HINT_T1);
+
+                ymm12.v = _mm256_fmadd_pd(ymm15.v, ymm6.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm15.v, ymm7.v, ymm13.v);
+
+                // load Y vector
+                ymm10.v = _mm256_loadu_pd(y0 + 4);
+
                 ymm12.v = _mm256_permute_pd(ymm12.v, 5);
                 ymm8.v = _mm256_addsub_pd(ymm13.v, ymm12.v);
                 ymm8.v = _mm256_permute_pd(ymm8.v, 5);
+
+                ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
+
+                _mm256_storeu_pd((double *)(y0 + 4), ymm12.v);
+
+                y0 += 8;
+                a_ptr[0] += 8;
+                a_ptr[1] += 8;
+                a_ptr[2] += 8;
+                a_ptr[3] += 8;
             }
 
-            ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
+            for (; (i + 1) < m; i += 2)
+            {
+                // load first two columns of A
+                ymm8.v = _mm256_loadu_pd(a_ptr[0]);  // 2 complex values from a0
+                ymm10.v = _mm256_loadu_pd(a_ptr[1]); // 2 complex values from a0
+                // load 3rd and 4th columns of A
+                ymm14.v = _mm256_loadu_pd(a_ptr[2]);
+                ymm15.v = _mm256_loadu_pd(a_ptr[3]);
 
-            _mm256_storeu_pd((double*)(y0), ymm12.v);
+                ymm12.v = _mm256_mul_pd(ymm8.v, ymm0.v);
+                ymm13.v = _mm256_mul_pd(ymm8.v, ymm1.v);
 
-            y0 += 4;
-            ap[0] += 4;
-            ap[1] += 4;
-            ap[2] += 4;
-            ap[3] += 4;
-        }
+                ymm12.v = _mm256_fmadd_pd(ymm10.v, ymm2.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm10.v, ymm3.v, ymm13.v);
 
-        // If there are leftover iterations, perform them with scalar code.
+                //_mm_prefetch(y0, _MM_HINT_T1);
 
-        for ( i = 0; (i + 0) < n2 ; ++i )
-        {
-            dcomplex       y0c = *(dcomplex*)y0;
+                ymm12.v = _mm256_fmadd_pd(ymm14.v, ymm4.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm14.v, ymm5.v, ymm13.v);
 
-            const dcomplex a0c = *(dcomplex*)ap[0];
-            const dcomplex a1c = *(dcomplex*)ap[1];
-            const dcomplex a2c = *(dcomplex*)ap[2];
-            const dcomplex a3c = *(dcomplex*)ap[3];
+                ymm12.v = _mm256_fmadd_pd(ymm15.v, ymm6.v, ymm12.v);
+                ymm13.v = _mm256_fmadd_pd(ymm15.v, ymm7.v, ymm13.v);
 
-            y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne;
-            y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne;
-            y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne;
-            y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne;
+                // load Y vector
+                ymm10.v = _mm256_loadu_pd(y0);
 
-            y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne;
-            y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne;
-            y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne;
-            y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne;
+                ymm12.v = _mm256_permute_pd(ymm12.v, 5);
+                ymm8.v = _mm256_addsub_pd(ymm13.v, ymm12.v);
+                ymm8.v = _mm256_permute_pd(ymm8.v, 5);
 
-            *(dcomplex*)y0 = y0c;
+                ymm12.v = _mm256_add_pd(ymm8.v, ymm10.v);
 
-            ap[0] += 2;
-            ap[1] += 2;
-            ap[2] += 2;
-            ap[3] += 2;
-            y0 += 2;
+                _mm256_storeu_pd((double *)(y0), ymm12.v);
+
+                y0 += 4;
+                a_ptr[0] += 4;
+                a_ptr[1] += 4;
+                a_ptr[2] += 4;
+                a_ptr[3] += 4;
+            }
         }
-    //PASTEMAC(c,fprintm)(stdout, "Y after A*x in axpyf",m, 1, (scomplex*)y, 1, 1, "%4.1f", "");
+    }
+
+    // Issue vzeroupper instruction to clear upper lanes of ymm registers.
+    // This avoids a performance penalty caused by false dependencies when
+    // transitioning from AVX to SSE instructions (which may occur later,
+    // especially if BLIS is compiled with -mfpmath=sse).
+    _mm256_zeroupper();
+
+    __m128d a_vec[4], y_vec, inter[2];
+
+    // broadcast real & imag parts of 4 elements of x
+    x_vec[0] = _mm_set1_pd(chi0.real); // real part of x0
+    x_vec[1] = _mm_set1_pd(chi0.imag); // imag part of x0
+    x_vec[2] = _mm_set1_pd(chi1.real); // real part of x1
+    x_vec[3] = _mm_set1_pd(chi1.imag); // imag part of x1
+    x_vec[4] = _mm_set1_pd(chi2.real); // real part of x2
+    x_vec[5] = _mm_set1_pd(chi2.imag); // imag part of x2
+    x_vec[6] = _mm_set1_pd(chi3.real); // real part of x3
+    x_vec[7] = _mm_set1_pd(chi3.imag); // imag part of x3
+
+    if (bli_is_noconj(conja))
+    {
+        for (; i < m; i++)
+        {
+            // load one complex element from each of the four columns of A
+            a_vec[0] = _mm_loadu_pd(a_ptr[0]); // 2 complex values from a0
+            a_vec[1] = _mm_loadu_pd(a_ptr[1]); // 2 complex values from a0
+            a_vec[2] = _mm_loadu_pd(a_ptr[2]); // 2 complex values from a0
+            a_vec[3] = _mm_loadu_pd(a_ptr[3]); // 2 complex values from a0
+
+            inter[0] = _mm_mul_pd(a_vec[0], x_vec[0]);
+            inter[1] = _mm_mul_pd(a_vec[0], x_vec[1]);
+
+            inter[0] = _mm_fmadd_pd(a_vec[1], x_vec[2], inter[0]);
+            inter[1] = _mm_fmadd_pd(a_vec[1], x_vec[3], inter[1]);
+
+            //_mm_prefetch(y0, _MM_HINT_T1);
+
+            inter[0] = _mm_fmadd_pd(a_vec[2], x_vec[4], inter[0]);
+            inter[1] = _mm_fmadd_pd(a_vec[2], x_vec[5], inter[1]);
+
+            inter[0] = _mm_fmadd_pd(a_vec[3], x_vec[6], inter[0]);
+            inter[1] = _mm_fmadd_pd(a_vec[3], x_vec[7], inter[1]);
+
+            inter[1] = _mm_permute_pd(inter[1], 0b01);
+            inter[0] = _mm_addsub_pd(inter[0], inter[1]);
+
+            // load Y vector
+            y_vec = _mm_loadu_pd(y0);
+
+            y_vec = _mm_add_pd(y_vec, inter[0]);
 
+            _mm_storeu_pd((double *)(y0), y_vec);
+
+            y0 += 2 * incy;
+            a_ptr[0] += 2 * inca;
+            a_ptr[1] += 2 * inca;
+            a_ptr[2] += 2 * inca;
+            a_ptr[3] += 2 * inca;
+        }
     }
     else
     {
-        for (i = 0 ; (i + 0) < m ; ++i )
+        for (; i < m; i++)
         {
-            dcomplex       y0c = *(dcomplex*)y0;
-            const dcomplex a0c = *(dcomplex*)ap[0];
-            const dcomplex a1c = *(dcomplex*)ap[1];
-            const dcomplex a2c = *(dcomplex*)ap[2];
-            const dcomplex a3c = *(dcomplex*)ap[3];
+            // load one complex element from each of the first two columns of A
+            a_vec[0] = _mm_loadu_pd(a_ptr[0]); // 2 complex values from a0
+            a_vec[1] = _mm_loadu_pd(a_ptr[1]); // 2 complex values from a0
+                                               // load 3rd and 4th columns of A
+            a_vec[2] = _mm_loadu_pd(a_ptr[2]); // 2 complex values from a0
+            a_vec[3] = _mm_loadu_pd(a_ptr[3]); // 2 complex values from a0
 
-            y0c.real += chi0.real * a0c.real - chi0.imag * a0c.imag * setPlusOne;
-            y0c.real += chi1.real * a1c.real - chi1.imag * a1c.imag * setPlusOne;
-            y0c.real += chi2.real * a2c.real - chi2.imag * a2c.imag * setPlusOne;
-            y0c.real += chi3.real * a3c.real - chi3.imag * a3c.imag * setPlusOne;
+            inter[0] = _mm_mul_pd(a_vec[0], x_vec[0]);
+            inter[1] = _mm_mul_pd(a_vec[0], x_vec[1]);
 
-            y0c.imag += chi0.imag * a0c.real + chi0.real * a0c.imag * setPlusOne;
-            y0c.imag += chi1.imag * a1c.real + chi1.real * a1c.imag * setPlusOne;
-            y0c.imag += chi2.imag * a2c.real + chi2.real * a2c.imag * setPlusOne;
-            y0c.imag += chi3.imag * a3c.real + chi3.real * a3c.imag * setPlusOne;
+            inter[0] = _mm_fmadd_pd(a_vec[1], x_vec[2], inter[0]);
+            inter[1] = _mm_fmadd_pd(a_vec[1], x_vec[3], inter[1]);
 
-            *(dcomplex*)y0 = y0c;
+            // load Y vector
+            y_vec = _mm_loadu_pd(y0);
 
-            ap[0] += inca;
-            ap[1] += inca;
-            ap[2] += inca;
-            ap[3] += inca;
-            y0 += incy;
+            inter[0] = _mm_fmadd_pd(a_vec[2], x_vec[4], inter[0]);
+            inter[1] = _mm_fmadd_pd(a_vec[2], x_vec[5], inter[1]);
+
+            inter[0] = _mm_fmadd_pd(a_vec[3], x_vec[6], inter[0]);
+            inter[1] = _mm_fmadd_pd(a_vec[3], x_vec[7], inter[1]);
+
+            inter[0] = _mm_permute_pd(inter[0], 0b01);
+            inter[0] = _mm_addsub_pd(inter[1], inter[0]);
+            inter[0] = _mm_permute_pd(inter[0], 0b01);
+
+            y_vec = _mm_add_pd(y_vec, inter[0]);
+
+            _mm_storeu_pd((double *)(y0), y_vec);
+
+            y0 += 2 * incy;
+            a_ptr[0] += 2 * inca;
+            a_ptr[1] += 2 * inca;
+            a_ptr[2] += 2 * inca;
+            a_ptr[3] += 2 * inca;
         }
     }
+
+    // vzeroupper is added by the compiler at the end of the kernel
 }
diff --git a/kernels/zen/1f/bli_axpyf_zen_int_6.c b/kernels/zen/1f/bli_axpyf_zen_int_6.c
index cf7dbd1732..6da5d99e6d 100644
--- a/kernels/zen/1f/bli_axpyf_zen_int_6.c
+++ b/kernels/zen/1f/bli_axpyf_zen_int_6.c
@@ -1,235 +1,235 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "immintrin.h"
-#include "blis.h"
-
-/* Union data structure to access AVX registers
-   One 256-bit AVX register holds 8 SP elements. */
-typedef union
-{
-    __m256  v;
-    float   f[8] __attribute__((aligned(64)));
-} v8sf_t;
-
-/* Union data structure to access AVX registers
-*  One 256-bit AVX register holds 4 DP elements. */
-typedef union
-{
-    __m256d v;
-    __m128d xmm[2];
-    double  d[4] __attribute__((aligned(64)));
-} v4df_t;
-
-typedef union
-{
-    __m128d v;
-    double  d[2] __attribute__((aligned(64)));
-} v2df_t;
-
-
-void bli_saxpyf_zen_int_6
-     (
-       conj_t           conja,
-       conj_t           conjx,
-       dim_t            m,
-       dim_t            b_n,
-       float* restrict alpha,
-       float* restrict a, inc_t inca, inc_t lda,
-       float* restrict x, inc_t incx,
-       float* restrict y, inc_t incy,
-       cntx_t* restrict cntx
-     )
-{
-    const dim_t      fuse_fac       = 6;
-    const dim_t      n_elem_per_reg = 8;
-
-    dim_t            i;
-
-    float* restrict a0;
-    float* restrict y0;
-
-    v8sf_t           chi0v, chi1v, chi2v, chi3v;
-    v8sf_t           chi4v,chi5v;
-
-    v8sf_t           a00v, a01v;
-
-    v8sf_t           y0v;
-
-    float           chi0, chi1, chi2, chi3;
-    float           chi4,chi5;
-
-    // If either dimension is zero, or if alpha is zero, return early.
-    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
-
-    // If b_n is not equal to the fusing factor, then perform the entire
-    // operation as a loop over axpyv.
-    if ( b_n != fuse_fac )
-    {
-        saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
-
-        for ( i = 0; i < b_n; ++i )
-        {
-            float* a1   = a + (0  )*inca + (i  )*lda;
-            float* chi1 = x + (i  )*incx;
-            float* y1   = y + (0  )*incy;
-            float  alpha_chi1;
-
-            bli_scopycjs( conjx, *chi1, alpha_chi1 );
-            bli_sscals( *alpha, alpha_chi1 );
-
-            f
-            (
-              conja,
-              m,
-              &alpha_chi1,
-              a1, inca,
-              y1, incy,
-              cntx
-            );
-        }
-
-        return;
-    }
-
-    // At this point, we know that b_n is exactly equal to the fusing factor.
-    a0   = a + 0*lda;
-    y0   = y;
-
-    // Scale each chi scalar by alpha.
-    chi0 = *( x + 0*incx )*(*alpha);
-    chi1 = *( x + 1*incx )*(*alpha);
-    chi2 = *( x + 2*incx )*(*alpha);
-    chi3 = *( x + 3*incx )*(*alpha);
-    chi4 = *( x + 4*incx )*(*alpha);
-    chi5 = *( x + 5*incx )*(*alpha);
-
-    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
-    chi0v.v = _mm256_broadcast_ss( &chi0 );
-    chi1v.v = _mm256_broadcast_ss( &chi1 );
-    chi2v.v = _mm256_broadcast_ss( &chi2 );
-    chi3v.v = _mm256_broadcast_ss( &chi3 );
-    chi4v.v = _mm256_broadcast_ss( &chi4 );
-    chi5v.v = _mm256_broadcast_ss( &chi5 );
-
-    // If there are vectorized iterations, perform them with vector
-    // instructions.
-    if ( inca == 1 && incy == 1 )
-    {
-        for( i=0; (i + 7) < m; i += 8 )
-        {
-            // Load the input values.
-            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
-
-            //Col_0
-            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
-            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );  // perform : y += alpha * x;
-
-            //Col_1
-            a01v.v = _mm256_loadu_ps( a0 + 1*lda );
-            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
-
-            //Col_2
-            a00v.v = _mm256_loadu_ps( a0 + 2*lda );
-            y0v.v = _mm256_fmadd_ps( a00v.v, chi2v.v, y0v.v );
-
-            //Col_3
-            a01v.v = _mm256_loadu_ps( a0 + 3*lda );
-            y0v.v = _mm256_fmadd_ps( a01v.v, chi3v.v, y0v.v );
-
-            //Col_4
-            a00v.v = _mm256_loadu_ps( a0 + 4*lda );
-            y0v.v = _mm256_fmadd_ps( a00v.v, chi4v.v, y0v.v );
-
-            //Col_5
-            a01v.v = _mm256_loadu_ps( a0 + 5*lda );
-            y0v.v = _mm256_fmadd_ps( a01v.v, chi5v.v, y0v.v );
-
-            // Store the output.
-            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
-
-            y0 += n_elem_per_reg;
-            a0 += n_elem_per_reg;
-        }
-        // If there are leftover iterations, perform them with scalar code.
-        for ( ; (i + 0) < m ; ++i )
-        {
-            float       y0c = *y0;
-
-            const float a0c = *a0;
-            const float a1c = *(a0+ 1*lda);
-            const float a2c = *(a0+ 2*lda);
-            const float a3c = *(a0+ 3*lda);
-            const float a4c = *(a0+ 4*lda);
-            const float a5c = *(a0+ 5*lda);
-
-            y0c += chi0 * a0c;
-            y0c += chi1 * a1c;
-            y0c += chi2 * a2c;
-            y0c += chi3 * a3c;
-            y0c += chi4 * a4c;
-            y0c += chi5 * a5c;
-
-            *y0 = y0c;
-
-            a0 += 1;
-            y0 += 1;
-        }
-    }
-    else
-    {
-        for ( i = 0; (i + 0) < m ; ++i )
-        {
-            float       y0c = *y0;
-            const float a0c = *a0;
-            const float a1c = *(a0+ 1*lda);
-            const float a2c = *(a0+ 2*lda);
-            const float a3c = *(a0+ 3*lda);
-            const float a4c = *(a0+ 4*lda);
-            const float a5c = *(a0+ 5*lda);
-
-            y0c += chi0 * a0c;
-            y0c += chi1 * a1c;
-            y0c += chi2 * a2c;
-            y0c += chi3 * a3c;
-            y0c += chi4 * a4c;
-            y0c += chi5 * a5c;
-
-            *y0 = y0c;
-
-            a0 += inca;
-            y0 += incy;
-        }
-    }
-}
\ No newline at end of file
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/* Union giving two views of one 256-bit AVX register:
+   the whole register (v) or its 8 single-precision elements (f). */
+typedef union
+{
+    __m256  v;                                  // full-register view, passed to the _mm256_*_ps intrinsics
+    float   f[8] __attribute__((aligned(64)));  // element view of the same storage; declared 64-byte aligned
+} v8sf_t;
+
+/* Union giving three views of one 256-bit AVX register: the whole register (v),
+*  two 128-bit halves (xmm) or its 4 double-precision elements (d). */
+typedef union
+{
+    __m256d v;                                  // full-register view
+    __m128d xmm[2];                             // the same storage seen as two 128-bit vectors
+    double  d[4] __attribute__((aligned(64)));  // element view of the same storage; declared 64-byte aligned
+} v4df_t;
+
+typedef union // two views of one 128-bit register holding 2 double-precision elements
+{
+    __m128d v;                                  // full-register view
+    double  d[2] __attribute__((aligned(64)));  // element view of the same storage; declared 64-byte aligned
+} v2df_t;
+
+
+void bli_saxpyf_zen_int_6 // single-precision fused axpyf: y := y + alpha * A * x, with A of size m x b_n
+     (
+       conj_t           conja,  // only forwarded to the axpyv kernel in the b_n != 6 fallback
+       conj_t           conjx,  // only used (via bli_scopycjs) in the b_n != 6 fallback
+       dim_t            m,      // number of elements of y to update (rows of A)
+       dim_t            b_n,    // number of columns of A / elements of x; the fused path needs exactly 6
+       float* restrict alpha,   // scalar applied to every element of x before accumulation
+       float* restrict a, inc_t inca, inc_t lda,  // A: inca = stride within a column, lda = stride between columns
+       float* restrict x, inc_t incx,             // x: b_n elements read with stride incx
+       float* restrict y, inc_t incy,             // y: m elements with stride incy, updated in place
+       cntx_t* restrict cntx    // queried for the float axpyv kernel in the fallback and passed on to it
+     )
+{
+    const dim_t      fuse_fac       = 6;  // number of columns of A consumed by the fused path
+    const dim_t      n_elem_per_reg = 8;  // floats per 256-bit register
+
+    dim_t            i;
+
+    float* restrict a0;  // walks down the rows of column 0; other columns are reached as a0 + k*lda
+    float* restrict y0;  // walks through y
+
+    v8sf_t           chi0v, chi1v, chi2v, chi3v;
+    v8sf_t           chi4v,chi5v;
+
+    v8sf_t           a00v, a01v;
+
+    v8sf_t           y0v;
+
+    float           chi0, chi1, chi2, chi3;
+    float           chi4,chi5;
+
+    // Nothing to do if either dimension is zero or alpha is zero: y is left untouched.
+    if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return;
+
+    // If b_n is not equal to the fusing factor, fall back to one axpyv call
+    // per column: y += (alpha * x[i]) * A(:,i).
+    if ( b_n != fuse_fac )
+    {
+        saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx );
+
+        for ( i = 0; i < b_n; ++i )
+        {
+            float* a1   = a + (0  )*inca + (i  )*lda;  // start of column i
+            float* chi1 = x + (i  )*incx;               // i-th element of x
+            float* y1   = y + (0  )*incy;
+            float  alpha_chi1;
+
+            bli_scopycjs( conjx, *chi1, alpha_chi1 );  // alpha_chi1 = x[i], honouring conjx
+            bli_sscals( *alpha, alpha_chi1 );           // alpha_chi1 *= alpha
+
+            f
+            (
+              conja,
+              m,
+              &alpha_chi1,
+              a1, inca,
+              y1, incy,
+              cntx
+            );
+        }
+
+        return;
+    }
+
+    // At this point, we know that b_n is exactly equal to the fusing factor.
+    a0   = a + 0*lda;
+    y0   = y;
+
+    // Pre-scale each of the six x elements by alpha (chi_k = alpha * x[k]).
+    chi0 = *( x + 0*incx )*(*alpha);
+    chi1 = *( x + 1*incx )*(*alpha);
+    chi2 = *( x + 2*incx )*(*alpha);
+    chi3 = *( x + 3*incx )*(*alpha);
+    chi4 = *( x + 4*incx )*(*alpha);
+    chi5 = *( x + 5*incx )*(*alpha);
+
+    // Broadcast the (alpha*chi?) scalars to all elements of vector registers.
+    chi0v.v = _mm256_broadcast_ss( &chi0 );
+    chi1v.v = _mm256_broadcast_ss( &chi1 );
+    chi2v.v = _mm256_broadcast_ss( &chi2 );
+    chi3v.v = _mm256_broadcast_ss( &chi3 );
+    chi4v.v = _mm256_broadcast_ss( &chi4 );
+    chi5v.v = _mm256_broadcast_ss( &chi5 );
+
+    // Unit-stride case: update 8 rows of y per iteration with 256-bit FMA
+    // instructions, then finish the remaining rows with scalar code.
+    if ( inca == 1 && incy == 1 )
+    {
+        for( i=0; (i + 7) < m; i += 8 )
+        {
+            // Load 8 consecutive elements of y.
+            y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg );
+
+            //Col_0
+            a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg );
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v );  // y += A(:,0) * (alpha * x[0])
+
+            //Col_1
+            a01v.v = _mm256_loadu_ps( a0 + 1*lda );
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v );
+
+            //Col_2
+            a00v.v = _mm256_loadu_ps( a0 + 2*lda );
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi2v.v, y0v.v );
+
+            //Col_3
+            a01v.v = _mm256_loadu_ps( a0 + 3*lda );
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi3v.v, y0v.v );
+
+            //Col_4
+            a00v.v = _mm256_loadu_ps( a0 + 4*lda );
+            y0v.v = _mm256_fmadd_ps( a00v.v, chi4v.v, y0v.v );
+
+            //Col_5
+            a01v.v = _mm256_loadu_ps( a0 + 5*lda );
+            y0v.v = _mm256_fmadd_ps( a01v.v, chi5v.v, y0v.v );
+
+            // Store the 8 updated elements back to y.
+            _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v );
+
+            y0 += n_elem_per_reg;
+            a0 += n_elem_per_reg;
+        }
+        // Fewer than 8 rows remain: handle them one at a time with scalar code.
+        for ( ; (i + 0) < m ; ++i )
+        {
+            float       y0c = *y0;
+
+            const float a0c = *a0;
+            const float a1c = *(a0+ 1*lda);
+            const float a2c = *(a0+ 2*lda);
+            const float a3c = *(a0+ 3*lda);
+            const float a4c = *(a0+ 4*lda);
+            const float a5c = *(a0+ 5*lda);
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+            y0c += chi5 * a5c;
+
+            *y0 = y0c;
+
+            a0 += 1;
+            y0 += 1;
+        }
+    }
+    else // inca != 1 or incy != 1: scalar loop over all m rows using the given strides
+    {
+        for ( i = 0; (i + 0) < m ; ++i )
+        {
+            float       y0c = *y0;
+            const float a0c = *a0;
+            const float a1c = *(a0+ 1*lda);
+            const float a2c = *(a0+ 2*lda);
+            const float a3c = *(a0+ 3*lda);
+            const float a4c = *(a0+ 4*lda);
+            const float a5c = *(a0+ 5*lda);
+
+            y0c += chi0 * a0c;
+            y0c += chi1 * a1c;
+            y0c += chi2 * a2c;
+            y0c += chi3 * a3c;
+            y0c += chi4 * a4c;
+            y0c += chi5 * a5c;
+
+            *y0 = y0c;
+
+            a0 += inca;
+            y0 += incy;
+        }
+    }
+}
diff --git a/kernels/zen/1f/bli_dotxaxpyf_zen_int_8.c b/kernels/zen/1f/bli_dotxaxpyf_zen_int_8.c
index 1be9975ecf..ba92d493ea 100644
--- a/kernels/zen/1f/bli_dotxaxpyf_zen_int_8.c
+++ b/kernels/zen/1f/bli_dotxaxpyf_zen_int_8.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2021-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -473,7 +473,7 @@ void bli_ddotxaxpyf_zen_int_8
 	/* A is m x n.                  */
 	/* y = beta * y + alpha * A^T w; */
 	/* z =        z + alpha * A   x; */
-	if ( ( bli_cpuid_is_avx_supported() == TRUE ) && 
+	if ( ( bli_cpuid_is_avx2fma3_supported() == TRUE ) &&
 	     (inca == 1) && (incw == 1) && (incx == 1)
 	     && (incy == 1) && (incz == 1) && (b_n == 8) )
 	{
@@ -775,7 +775,7 @@ void bli_zdotxaxpyf_zen_int_8
 	//
 	// y = beta * y + alpha * A^T w;
 	// z =        z + alpha * A   x;
-	if ( ( bli_cpuid_is_avx_supported() == TRUE ) &&
+	if ( ( bli_cpuid_is_avx2fma3_supported() == TRUE ) &&
 	     ( inca == 1 ) && ( incw == 1 ) && ( incx == 1 )
 	     && ( incy == 1 ) && ( incz == 1 ) && ( b_n == 4 ) )
 	{
@@ -1182,7 +1182,7 @@ void bli_cdotxaxpyf_zen_int_8
 	//
 	// y = beta * y + alpha * A^T w;
 	// z =        z + alpha * A   x;
-	if ( ( bli_cpuid_is_avx_supported() == TRUE ) &&
+	if ( ( bli_cpuid_is_avx2fma3_supported() == TRUE ) &&
 	     ( inca == 1 ) && ( incw == 1 ) && ( incx == 1 )
 	     && ( incy == 1 ) && ( incz == 1 ) && ( b_n == 4 ) )
 	{
@@ -1558,4 +1558,4 @@ void bli_cdotxaxpyf_zen_int_8
 			 cntx
 			);
 	}
-}
\ No newline at end of file
+}
diff --git a/kernels/zen/2/CMakeLists.txt b/kernels/zen/2/CMakeLists.txt
index 85ad4bfd5a..c9c9220609 100644
--- a/kernels/zen/2/CMakeLists.txt
+++ b/kernels/zen/2/CMakeLists.txt
@@ -1,14 +1,19 @@
 ##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}"
-     PRIVATE
+add_library(zen_2
+     OBJECT
 	 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_ref.c
 	 ${CMAKE_CURRENT_SOURCE_DIR}/bli_her2_zen_int_4.c
 	 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemv_zen_int_4.c
+     ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c
      )
-
+target_compile_options(zen_2 PRIVATE /arch:AVX2)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen_2 PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
+# For any other TARGET_ARCH, it would fail to configure.
 # Select AMD specific sources for AMD configurations.
-if(${TARGET_ARCH} STREQUAL zen OR 
+#[=[if(${TARGET_ARCH} STREQUAL zen OR 
 ${TARGET_ARCH} STREQUAL zen2 OR 
 ${TARGET_ARCH} STREQUAL zen3 OR 
 ${TARGET_ARCH} STREQUAL zen4 OR
@@ -17,4 +22,4 @@ ${TARGET_ARCH} STREQUAL amdzen)
      PRIVATE
      ${CMAKE_CURRENT_SOURCE_DIR}/bli_her_zen_int_amd.c
      )
-endif()
\ No newline at end of file
+endif()]=]
diff --git a/kernels/zen/2/bli_gemv_zen_int_4.c b/kernels/zen/2/bli_gemv_zen_int_4.c
index a4bdfb4499..6970a7f62a 100644
--- a/kernels/zen/2/bli_gemv_zen_int_4.c
+++ b/kernels/zen/2/bli_gemv_zen_int_4.c
@@ -565,7 +565,7 @@ void bli_multi_sgemv_4x2
     // Calculate the total number of multithreaded iteration
     total_iteration = b_n / b_fuse;
 
-#pragma omp parallel for num_threads(n_threads)
+    _Pragma( "omp parallel for num_threads(n_threads)" )
     for (dim_t j = 0; j < total_iteration; j++)
     {
         float *A1 = a + (b_fuse * j) * lda;
diff --git a/kernels/zen/2/bli_her_zen_int_amd.c b/kernels/zen/2/bli_her_zen_int_amd.c
index ee259b7e3e..be7bbab3bc 100644
--- a/kernels/zen/2/bli_her_zen_int_amd.c
+++ b/kernels/zen/2/bli_her_zen_int_amd.c
@@ -1125,4 +1125,4 @@ void bli_zher_zen_int_var2
             cc->imag += interI;
         }
     }
-}
\ No newline at end of file
+}
diff --git a/kernels/zen/3/CMakeLists.txt b/kernels/zen/3/CMakeLists.txt
index d90e4e3902..97a067bb64 100644
--- a/kernels/zen/3/CMakeLists.txt
+++ b/kernels/zen/3/CMakeLists.txt
@@ -1,11 +1,14 @@
-##Copyright (C) 2020-2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}"
-     PRIVATE
+add_library(zen_3
+     OBJECT
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemm_small.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_ref_k1.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_ref_k1.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_avx2_k1.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_avx2_k1.c
     )
-
+target_compile_options(zen_3 PRIVATE /arch:AVX2)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen_3 PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
 add_subdirectory(sup)
diff --git a/kernels/zen/3/bli_dgemm_ref_k1.c b/kernels/zen/3/bli_dgemm_avx2_k1.c
similarity index 99%
rename from kernels/zen/3/bli_dgemm_ref_k1.c
rename to kernels/zen/3/bli_dgemm_avx2_k1.c
index 03a2b789bb..b225fdad1a 100644
--- a/kernels/zen/3/bli_dgemm_ref_k1.c
+++ b/kernels/zen/3/bli_dgemm_avx2_k1.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -40,7 +40,7 @@
 #define D_MR  8
 #define D_NR  6
 
-void bli_dgemm_ref_k1_nn
+void bli_dgemm_8x6_avx2_k1_nn
      (
     dim_t  m,
     dim_t  n,
@@ -71,6 +71,12 @@ void bli_dgemm_ref_k1_nn
     __m256d ymm12, ymm13, ymm14, ymm15;
     __m128d xmm5;
 
+    // gcc12 emits an uninitialized-variable warning for these registers;
+    // they are set to zero up front to avoid it.
+    ymm0 = _mm256_setzero_pd();
+    ymm1 = _mm256_setzero_pd();
+    ymm2 = _mm256_setzero_pd();
+
     /* Form C = alpha*A*B + beta*c */
     for(dim_t j = 0;j < (n-D_NR+1);j=j+D_NR)
     {
diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c
index 22bb48f737..477c710471 100644
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2017-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -61,36 +61,6 @@ static err_t bli_sgemm_small
        cntl_t* cntl
      );
 
-err_t bli_dgemm_small
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
-     );
-err_t bli_zgemm_small
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
-     );
-err_t bli_zgemm_small_At
-     (
-       obj_t*  alpha,
-       obj_t*  a,
-       obj_t*  b,
-       obj_t*  beta,
-       obj_t*  c,
-       cntx_t* cntx,
-       cntl_t* cntl
-     );
 static err_t bli_sgemm_small_atbn
      (
        obj_t*  alpha,
@@ -134,9 +104,9 @@ err_t bli_gemm_small
     AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
     return BLIS_NOT_YET_IMPLEMENTED;
 #else
-    // This function is invoked on all architectures including ‘generic’.
-    // Non-AVX platforms will use the kernels derived from the context.
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    // This function is invoked on all architectures including 'generic'.
+    // Non-AVX2+FMA3 platforms will use the kernels derived from the context.
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         return BLIS_NOT_YET_IMPLEMENTED;
     }
@@ -165,25 +135,25 @@ err_t bli_gemm_small
 #ifndef BLIS_ENABLE_MULTITHREADING
             // bli_dgemm_small_At is called directly from blas interface for
             // sizes within thresholds.
-            // Avoinding calling of bli_dgemm_small_At from gemm_front
+            // Avoiding calling of bli_dgemm_small_At from gemm_front
             // and directing to native implementation.
             return BLIS_NOT_YET_IMPLEMENTED;
 #else
             return bli_dgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
 #endif
         }
-    if(dt == BLIS_DCOMPLEX)
-    {
+        if(dt == BLIS_DCOMPLEX)
+        {
 #ifndef BLIS_ENABLE_MULTITHREADING
             // bli_zgemm_small_At is called directly from blas interface for
             // sizes within thresholds.
-            // Avoinding calling of bli_zgemm_small_At from gemm_front
+            // Avoiding calling of bli_zgemm_small_At from gemm_front
             // and directing to native implementation.
             return BLIS_NOT_YET_IMPLEMENTED;
 #else
-        return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
+            return bli_zgemm_small_At(alpha, a, b, beta, c, cntx, cntl);
 #endif
-    }
+        }
 
         if (bli_obj_has_notrans( b ))
         {
@@ -390,8 +360,8 @@ static err_t bli_sgemm_small
 
             // This is the part of the pack and compute optimization.
             // During the first column iteration, we store the accessed A matrix into
-            // contiguous static memory. This helps to keep te A matrix in Cache and
-            // aviods the TLB misses.
+            // contiguous static memory. This helps to keep the A matrix in Cache and
+            // avoids the TLB misses.
             if (required_packing_A)
             {
                 col_idx = 0;
@@ -1778,14 +1748,14 @@ static err_t bli_sgemm_small
     {
         AOCL_DTL_TRACE_EXIT_ERR(
             AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
             );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
 
 };
 
-/*static*/ err_t bli_dgemm_small
+err_t bli_dgemm_small
      (
        obj_t*  alpha,
        obj_t*  a,
@@ -1797,7 +1767,7 @@ static err_t bli_sgemm_small
      )
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         return BLIS_NOT_YET_IMPLEMENTED;
     }
@@ -1806,7 +1776,7 @@ static err_t bli_sgemm_small
     gint_t K = bli_obj_width( a );  // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
     gint_t L = M * N;
 
-    /* if (N<3) //Implemenation assumes that N is atleast 3. VK */
+    /* if (N<3) //Implementation assumes that N is at least 3. VK */
     /*  { */
     /*      AOCL_DTL_TRACE_EXIT_ERR( */
     /*          AOCL_DTL_LEVEL_INFO, */
@@ -1827,7 +1797,7 @@ static err_t bli_sgemm_small
         double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C
 
         double *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        double *tA_packed; // temprorary pointer to hold packed A memory pointer
+        double *tA_packed; // temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         guint_t col_idx_start; //starting index after A matrix is packed.
@@ -1938,8 +1908,8 @@ static err_t bli_sgemm_small
 
             // This is the part of the pack and compute optimization.
             // During the first column iteration, we store the accessed A matrix into
-            // contiguous static memory. This helps to keep te A matrix in Cache and
-            // aviods the TLB misses.
+            // contiguous static memory. This helps to keep the A matrix in Cache and
+            // avoids the TLB misses.
             if (required_packing_A)
             {
                 col_idx = 0;
@@ -3369,7 +3339,7 @@ static err_t bli_sgemm_small
     {
         AOCL_DTL_TRACE_EXIT_ERR(
             AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
             );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
@@ -3846,7 +3816,7 @@ static err_t bli_sgemm_small_atbn
     {
         AOCL_DTL_TRACE_EXIT_ERR(
             AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
             );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
@@ -4286,7 +4256,7 @@ static err_t bli_dgemm_small_atbn
     {
         AOCL_DTL_TRACE_EXIT_ERR(
             AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for small gemm."
+            "Invalid dimensions for small gemm."
             );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
@@ -4305,7 +4275,7 @@ err_t bli_dgemm_small_At
 {
 
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         return BLIS_NOT_YET_IMPLEMENTED;
     }
@@ -4314,7 +4284,7 @@ err_t bli_dgemm_small_At
     gint_t K = bli_obj_width_after_trans( a );  // number of columns of OP(A), will be updated if OP(A) is Transpose(A) .
 
 
-    if (N<3) //Implemenation assumes that N is atleast 3.
+    if (N<3) //Implementation assumes that N is at least 3.
     {
         AOCL_DTL_TRACE_EXIT_ERR(
             AOCL_DTL_LEVEL_INFO,
@@ -4341,7 +4311,7 @@ err_t bli_dgemm_small_At
         double *C = bli_obj_buffer_at_off(c); // pointer to elements of Matrix C
 
         double *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        double *tA_packed; // temprorary pointer to hold packed A memory pointer
+        double *tA_packed; // temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         dim_t tb_inc_row = 1; // row stride of matrix B
@@ -5748,7 +5718,7 @@ err_t bli_dgemm_small_At
     {
         AOCL_DTL_TRACE_EXIT_ERR(
             AOCL_DTL_LEVEL_INFO,
-            "Invalid dimesions for dgemm_small_At."
+            "Invalid dimensions for dgemm_small_At."
             );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
@@ -5798,7 +5768,7 @@ err_t bli_zgemm_small
      )
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         return BLIS_NOT_YET_IMPLEMENTED;
     }
@@ -5822,7 +5792,7 @@ err_t bli_zgemm_small
         dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C
 
         dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        dcomplex *tA_packed; //temprorary pointer to hold packed A memory pointer
+        dcomplex *tA_packed; //temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         guint_t col_idx_start; //starting index after A matrix is packed.
@@ -5834,6 +5804,10 @@ err_t bli_zgemm_small
         __m256d ymm16, ymm17, ymm18, ymm19, ymm20, ymm21;
         __m256d ymm0, ymm1, ymm2, ymm3;
 
+        // gcc12 emits an uninitialized-variable warning for this register;
+        // it is set to zero up front to avoid it.
+        ymm0 = _mm256_setzero_pd();
+
         gint_t n_remainder; // If the N is non multiple of 3.(N%3)
         gint_t m_remainder; // If the M is non multiple of 4.(M%4)
 
@@ -5933,8 +5907,8 @@ err_t bli_zgemm_small
             /**
              * This is the part of the pack and compute optimization.
              * During the first column iteration, we store the accessed A
-             * matrix into contiguous static memory. This helps to keep te A
-             * matrix in Cache and aviods the TLB misses.
+             * matrix into contiguous static memory. This helps to keep the A
+             * matrix in Cache and avoids the TLB misses.
              */
             if (required_packing_A)
             {
@@ -9730,7 +9704,7 @@ err_t bli_zgemm_small
     {
         AOCL_DTL_TRACE_EXIT_ERR(
                 AOCL_DTL_LEVEL_INFO,
-                "Invalid dimesions for small gemm."
+                "Invalid dimensions for small gemm."
                 );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
@@ -9748,7 +9722,7 @@ err_t bli_zgemm_small_At
      )
 {
     AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_INFO);
-    if (bli_cpuid_is_avx_supported() == FALSE)
+    if (bli_cpuid_is_avx2fma3_supported() == FALSE)
     {
         return BLIS_NOT_YET_IMPLEMENTED;
     }
@@ -9759,7 +9733,7 @@ err_t bli_zgemm_small_At
     gint_t N = bli_obj_width( c );  // number of columns of Matrix C
     gint_t K = bli_obj_width_after_trans( a );  // number of columns of OP(A)
 
-    if (N<3) //Implemenation assumes that N is atleast 3.
+    if (N<3) //Implementation assumes that N is at least 3.
     {
         AOCL_DTL_TRACE_EXIT_ERR(
                 AOCL_DTL_LEVEL_INFO,
@@ -9779,7 +9753,7 @@ err_t bli_zgemm_small_At
         dcomplex *C = bli_obj_buffer_at_off(c); //pointer to elements of Matrix C
 
         dcomplex *tA = A, *tB = B, *tC = C;//, *tA_pack;
-        dcomplex *tA_packed; // temprorary pointer to hold packed A memory pointer
+        dcomplex *tA_packed; // temporary pointer to hold packed A memory pointer
         guint_t row_idx_packed; //packed A memory row index
         guint_t lda_packed; //lda of packed A
         dim_t tb_inc_row = 1; // row stride of matrix B
@@ -9806,6 +9780,10 @@ err_t bli_zgemm_small_At
         __m256d ymm16, ymm17, ymm18, ymm19, ymm20, ymm21;
         __m256d ymm0, ymm1, ymm2, ymm3;
 
+        //gcc12 throws an uninitialized-variable warning;
+        //to avoid it, ymm0 is set to zero.
+        ymm0 = _mm256_setzero_pd();
+
         gint_t n_remainder; // If the N is non multiple of 3.(N%3)
         gint_t m_remainder; // If the M is non multiple of 16.(M%16)
 
@@ -13428,7 +13406,7 @@ err_t bli_zgemm_small_At
     {
         AOCL_DTL_TRACE_EXIT_ERR(
                 AOCL_DTL_LEVEL_INFO,
-                "Invalid dimesions for dgemm_small_At."
+                "Invalid dimensions for dgemm_small_At."
                 );
         return BLIS_NONCONFORMAL_DIMENSIONS;
     }
diff --git a/kernels/zen/3/bli_trsm_small.c b/kernels/zen/3/bli_trsm_small.c
index f5f7f37c6f..d08dbb2279 100644
--- a/kernels/zen/3/bli_trsm_small.c
+++ b/kernels/zen/3/bli_trsm_small.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -35,6 +35,7 @@
 #include "blis.h"
 #ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
 #include "immintrin.h"
+#include "bli_trsm_small_ref.h"
 
 #define BLIS_ENABLE_PREFETCH_IN_TRSM_SMALL
 
@@ -107,18 +108,6 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
     cntl_t* cntl
 );
 
-//AX = B; A is lower triangular; transpose;
-//double precision; non-unit diagonal
-BLIS_INLINE err_t dtrsm_AltXB_ref
-(
-    double *A,
-    double *B,
-    dim_t M,
-    dim_t N,
-    dim_t lda,
-    dim_t ldb,
-    bool is_unitdiag
-);
 /*
  * ZTRSM kernel declaration
  */
@@ -248,41 +237,6 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
 #define DIAG_ELE_EVAL_OPS(a,b) (a / b)
 #endif
 
-/*
- * Reference implementations
- * ToDo: We can combine all these reference implementation
-         into a macro
-*/
-//A'X = B;  A is upper triangular; transpose;
-//non-unitDiagonal double precision
-BLIS_INLINE err_t dtrsm_AutXB_ref
-(
-    double *A,
-    double *B,
-    dim_t M,
-    dim_t N,
-    dim_t lda,
-    dim_t ldb,
-    bool unitDiagonal
-)
-{
-    dim_t i, j, k;
-    for (k = 0; k < M; k++)
-    {
-        double lkk_inv = 1.0;
-        if(!unitDiagonal) lkk_inv = DIAG_ELE_INV_OPS(lkk_inv,A[k+k*lda]);
-        for (j = 0; j < N; j++)
-        {
-            B[k + j*ldb] = DIAG_ELE_EVAL_OPS(B[k + j*ldb] , lkk_inv);
-            for (i = k+1; i < M; i++)
-            {
-                B[i + j*ldb] -= A[i*lda + k] * B[k + j*ldb];
-            }
-        }
-    }// k -loop
-    return BLIS_SUCCESS;
-}// end of function
-
 /*
  * Reference implementations
  * ToDo: We can combine all these reference implementation
@@ -318,37 +272,6 @@ BLIS_INLINE err_t strsm_AutXB_ref
     return BLIS_SUCCESS;
 }// end of function
 
-/* TRSM scalar code for the case AX = alpha * B
- * A is upper-triangular, non-unit-diagonal
- * Dimensions:  A: mxm   X: mxn B:mxn
- */
-BLIS_INLINE err_t dtrsm_AuXB_ref
-(
-    double *A,
-    double *B,
-    dim_t M,
-    dim_t N,
-    dim_t lda,
-    dim_t ldb,
-    bool is_unitdiag
-)
-{
-    dim_t i, j, k;
-    for (k = M-1; k >= 0; k--)
-    {
-        double lkk_inv = 1.0;
-        if(!is_unitdiag) lkk_inv = DIAG_ELE_INV_OPS(lkk_inv,A[k+k*lda]);
-        for (j = N -1; j >= 0; j--)
-        {
-            B[k + j*ldb] = DIAG_ELE_EVAL_OPS(B[k + j*ldb],lkk_inv);
-            for (i = k-1; i >=0; i--)
-            {
-                B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb];
-            }
-        }
-    }// k -loop
-    return BLIS_SUCCESS;
-}// end of function
 
 /* TRSM scalar code for the case AX = alpha * B
  * A is upper-triangular, non-unit-diagonal
@@ -382,37 +305,6 @@ BLIS_INLINE err_t strsm_AuXB_ref
     return BLIS_SUCCESS;
 }// end of function
 
-/* TRSM scalar code for the case AX = alpha * B
- * A is lower-triangular, non-unit-diagonal, no transpose
- * Dimensions:  A: mxm   X: mxn B:mxn
- */
-BLIS_INLINE err_t dtrsm_AlXB_ref
-(
-    double *A,
-    double *B,
-    dim_t M,
-    dim_t N,
-    dim_t lda,
-    dim_t ldb,
-    bool is_unitdiag
-)
-{
-    dim_t i, j, k;
-    for (k = 0; k < M; k++)
-    {
-        double lkk_inv = 1.0;
-        if(!is_unitdiag) lkk_inv = DIAG_ELE_INV_OPS(lkk_inv,A[k+k*lda]);
-        for (j = 0; j < N; j++)
-        {
-            B[k + j*ldb] = DIAG_ELE_EVAL_OPS(B[k + j*ldb],lkk_inv);
-            for (i = k+1; i < M; i++)
-            {
-                B[i + j*ldb] -= A[i + k*lda] * B[k + j*ldb];
-            }
-        }
-    }// k -loop
-    return BLIS_SUCCESS;
-}// end of function
 
 /* TRSM scalar code for the case AX = alpha * B
  * A is lower-triangular, non-unit-diagonal, no transpose
@@ -446,38 +338,6 @@ BLIS_INLINE err_t strsm_AlXB_ref
     return BLIS_SUCCESS;
 }// end of function
 
-/* TRSM scalar code for the case AX = alpha * B
- * A is lower-triangular, non-unit-diagonal, transpose
- * Dimensions:  A: mxm   X: mxn B:mxn
- */
-BLIS_INLINE err_t dtrsm_AltXB_ref
-(
-    double *A,
-    double *B,
-    dim_t M,
-    dim_t N,
-    dim_t lda,
-    dim_t ldb,
-    bool is_unitdiag
-)
-{
-    dim_t i, j, k;
-    for (k = M-1; k >= 0; k--)
-    {
-        double lkk_inv = 1.0;
-        if(!is_unitdiag) lkk_inv = DIAG_ELE_INV_OPS(lkk_inv,A[k+k*lda]);
-        for (j = N -1; j >= 0; j--)
-        {
-            B[k + j*ldb] = DIAG_ELE_EVAL_OPS(B[k + j*ldb],lkk_inv);
-            for (i = k-1; i >=0; i--)
-            {
-                B[i + j*ldb] -= A[i*lda + k] * B[k + j*ldb];
-            }
-        }
-    }// k -loop
-    return BLIS_SUCCESS;
-}// end of function
-
 /* TRSM scalar code for the case AX = alpha * B
  * A is lower-triangular, non-unit-diagonal, transpose
  * Dimensions:  A: mxm   X: mxn B:mxn
@@ -1439,8 +1299,8 @@ BLIS_INLINE err_t dtrsm_XAltB_ref
 
 /*
    Load b11 of size 6x8 and multiply with alpha
-   Add the GEMM output and perform inregister transose of b11
-   to peform DTRSM operation for left cases.
+   Add the GEMM output and perform in register transpose of b11
+   to perform DTRSM operation for left cases.
 */
 #define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal) \
         ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal));\
@@ -4684,8 +4544,8 @@ BLIS_INLINE err_t dtrsm_XAltB_ref
 
 /*
    Load b11 of size 6x8 and multiply with alpha
-   Add the GEMM output and perform inregister transose of b11
-   to peform DTRSM operation for left cases.
+   Add the GEMM output and perform in register transpose of b11
+   to perform DTRSM operation for left cases.
 */
 #define BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal) \
         ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));\
@@ -5119,7 +4979,7 @@ BLIS_INLINE void bli_dtrsm_small_pack
 }
 /*
     Pack diagonal elements of A block (8 or 6) into an array
-    a. This helps in utilze cache line efficiently in TRSM operation
+    a. This helps to utilize cache line efficiently in TRSM operation
     b. store ones when input is unit diagonal
 */
 BLIS_INLINE void dtrsm_small_pack_diag_element
@@ -5244,7 +5104,8 @@ err_t bli_trsm_small
     obj_t*  a,
     obj_t*  b,
     cntx_t* cntx,
-    cntl_t* cntl
+    cntl_t* cntl,
+    bool is_parallel
 )
 {
     err_t err;
@@ -5263,25 +5124,19 @@ err_t bli_trsm_small
    {
         case BLIS_DOUBLE:
         {
-            bool nt = bli_thread_get_is_parallel();
-            if((nt == 0) && (m > 1000 || n > 1000)) {
-                return BLIS_NOT_YET_IMPLEMENTED;
-            }
             break;
         }
         case BLIS_FLOAT:
         case BLIS_SCOMPLEX:
         {
-            bool nt = bli_thread_get_is_parallel();
-            if((nt == 0) && (m > 1000 || n > 1000)) {
+            if((!is_parallel) && (m > 1000 || n > 1000)) {
                return BLIS_NOT_YET_IMPLEMENTED;
             }
             break;
         }
         case BLIS_DCOMPLEX:
         {
-            bool nt = bli_thread_get_is_parallel();
-            if((nt == 0) && (m > 500 || n > 500)) {
+            if((!is_parallel) && (m > 500 || n > 500)) {
                 return BLIS_NOT_YET_IMPLEMENTED;
             }
             break;
@@ -5347,7 +5202,8 @@ err_t bli_trsm_small_mt
     obj_t*  a,
     obj_t*  b,
     cntx_t* cntx,
-    cntl_t* cntl
+    cntl_t* cntl,
+    bool    is_parallel
 )
 {
     gint_t m = bli_obj_length( b ); // number of rows of matrix b
@@ -5397,65 +5253,92 @@ err_t bli_trsm_small_mt
     {
         // Query the thread's id from OpenMP.
         const dim_t tid = omp_get_thread_num();
+        const dim_t nt_real = omp_get_num_threads();
 
-        obj_t      b_t;
-        dim_t start; // Each thread start Index
-        dim_t end;   // Each thread end Index
-        thrinfo_t thread;
+        if(nt_real != n_threads)
+        {
+            if(tid == 0)
+            {
+                status = bli_trsm_small
+                (
+                  side,
+                  alpha,
+                  a,
+                  b,
+                  cntx,
+                  cntl,
+                  is_parallel
+                ); // only tid 0 writes status here; keep the fallback's result so a failure reaches the caller
+            }
+        }
+        else
+        {
+            obj_t     b_t;
+            dim_t     start; // Each thread start Index
+            dim_t     end;   // Each thread end Index
+            thrinfo_t thread;
 
-        thread.n_way    = n_threads;
-        thread.work_id  = tid;
-        thread.ocomm_id = tid;
+            thread.n_way    = n_threads;
+            thread.work_id  = tid;
+            thread.ocomm_id = tid;
 
 
-        // Compute start and end indexes of matrix partitioning for each thread
-        if ( bli_is_right( side ) )
-        {
-            bli_thread_range_sub (  &thread,
+            // Compute start and end indexes of matrix partitioning for each thread
+            if ( bli_is_right( side ) )
+            {
+                bli_thread_range_sub 
+                (  
+                  &thread,
                   m,
                   d_mr,// Need to decide based on type
                   FALSE,
                   &start,
                   &end
-                   );
-            // For each thread acquire matrix block on which they operate
-            // Data-based parallelism
+                );
+                // For each thread acquire matrix block on which they operate
+                // Data-based parallelism
 
-            bli_acquire_mpart_mdim(BLIS_FWD, BLIS_SUBPART1, start, end-start, b, &b_t);
+                bli_acquire_mpart_mdim(BLIS_FWD, BLIS_SUBPART1, start, end-start, b, &b_t);
+            }
+            else
+            {
+                bli_thread_range_sub
+                ( 
+                  &thread,
+                  n,
+                  d_nr,// Need to decide based on type
+                  FALSE,
+                  &start,
+                  &end
+                );
+                // For each thread acquire matrix block on which they operate
+                // Data-based parallelism
+
+                bli_acquire_mpart_ndim(BLIS_FWD, BLIS_SUBPART1, start, end-start, b, &b_t);
+            }
+
+            // Parallelism is only across m-dimension/n-dimension - therefore matrix a is common to
+            // all threads
+            err_t status_l = BLIS_SUCCESS;
+
+            status_l = bli_trsm_small
+                        (
+                          side,
+                          alpha,
+                          a,
+                          &b_t,
+                          NULL,
+                          NULL,
+                          is_parallel
+                        );
+            // To capture the error populated from any of the threads
+            if ( status_l != BLIS_SUCCESS )
+            {
+                _Pragma("omp critical")
+                status = (status != BLIS_NOT_YET_IMPLEMENTED) ? status_l : status;
+            }
         }
-        else
-        {
-            bli_thread_range_sub (  &thread,
-                   n,
-                   d_nr,// Need to decide based on type
-                   FALSE,
-                   &start,
-                   &end
-                    );
-            // For each thread acquire matrix block on which they operate
-            // Data-based parallelism
-
-            bli_acquire_mpart_ndim(BLIS_FWD, BLIS_SUBPART1, start, end-start, b, &b_t);
-        }
-
-        // Parallelism is only across m-dimension/n-dimension - therefore matrix a is common to
-        // all threads
-        err_t status_l = BLIS_SUCCESS;
-
-        status_l = bli_trsm_small
-                   (
-		     side,
-                     alpha,
-                     a,
-                     &b_t,
-                     NULL,
-                     NULL
-                   );
-	// To capture the error populated from any of the threads
-        _Pragma( "omp critical" )
-	status = (status != BLIS_NOT_YET_IMPLEMENTED)?status_l:status;
     }
-
     return status;
 }// End of function
 #endif
@@ -7771,8 +7654,8 @@ BLIS_INLINE err_t ztrsm_AuXB_ref
 
 /*
  * Load b11 of size 3x4 and multiply with alpha
- * Add the GEMM output and perform inregister transose of b11
- * to peform ZTRSM operation for left cases.
+ * Add the GEMM output and perform in register transpose of b11
+ * to perform ZTRSM operation for left cases.
  */
 #define BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal) {\
     ymm16 = _mm256_broadcast_pd(( __m128d const *)(&AlphaVal));\
@@ -8305,6 +8188,16 @@ BLIS_INLINE void ztrsm_small_pack_diag_element
     dim_t size
 )
 {
+    if ( is_unitdiag )
+    {
+        dcomplex ones = {1.0, 0.0};
+        for( dim_t i = 0; i < size; i++)
+        {
+            d11_pack[i].real = ones.real;
+            d11_pack[i].imag = ones.imag;
+        }
+        return;
+    }
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
     // If Preinversion is enabled, inverse the diaganol
     // elements from A and pack into diagonal buffer.
@@ -8382,7 +8275,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
     bool transa = bli_obj_has_trans(a);
     dim_t cs_a, rs_a;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -8402,7 +8295,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
     bool is_unitdiag = bli_obj_has_unit_diag(a);
 
     double AlphaVal = *(double *)AlphaObj->buffer;    //value of Alpha
-    double* restrict L = a->buffer;      //pointer to matrix A
+    double* restrict L = bli_obj_buffer_at_off(a);      //pointer to matrix A
     double *B =  bli_obj_buffer_at_off(b);       //pointer to matrix B
 
     double *a01, *a11, *b10, *b11;   //pointers for GEMM and TRSM blocks
@@ -8447,6 +8340,13 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
 
     xmm5 = _mm_setzero_pd();
 
+    //gcc12 throws an uninitialized-variable warning;
+    //to avoid it, these variables are set to zero.
+    ymm5 = _mm256_setzero_pd();
+    ymm0 = _mm256_setzero_pd();
+
+
+
     /*
     Performs solving TRSM for 6 rows at a time from  0 to n/6 in steps of d_nr
     a. Load and pack A (a01 block), the size of packing 6x6 to 6x (n-6)
@@ -8479,7 +8379,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
 
@@ -8512,8 +8412,8 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
             BLIS_DTRSM_SMALL_GEMM_6nx8m(a01,b10,cs_b,p_lda,k_iter)
@@ -8521,7 +8421,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAltB_XAuB
             /*
             Load b11 of size 8x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_DTRSM_SMALL_6x8(AlphaVal,b11,cs_b)
@@ -10784,7 +10684,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
     dim_t cs_a, rs_a;
     dim_t d_mr = 8,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -10805,7 +10705,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
     bool is_unitdiag = bli_obj_has_unit_diag(a);
 
     double AlphaVal = *(double *)AlphaObj->buffer;    //value of Alpha
-    double* restrict L = a->buffer;      //pointer to matrix A
+    double* restrict L = bli_obj_buffer_at_off(a);      //pointer to matrix A
     double *B =  bli_obj_buffer_at_off(b);       //pointer to matrix B
 
     double *a01, *a11, *b10, *b11;   //pointers for GEMM and TRSM blocks
@@ -10850,6 +10750,12 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
 
     xmm5 = _mm_setzero_pd();
 
+    //gcc12 throws an uninitialized-variable warning;
+    //to avoid it, these variables are set to zero.
+    ymm0 = _mm256_setzero_pd();
+    ymm6 = _mm256_setzero_pd();
+
+
     /*
     Performs solving TRSM for 6 rows at a time from  0 to n/6 in steps of d_nr
     a. Load and pack A (a01 block), the size of packing 6x6 to 6x (n-6)
@@ -10882,7 +10788,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -10914,8 +10820,8 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
 
@@ -10924,7 +10830,7 @@ BLIS_INLINE  err_t bli_dtrsm_small_XAutB_XAlB
             /*
             Load b11 of size 8x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_DTRSM_SMALL_6x8(AlphaVal,b11,cs_b)
@@ -13104,7 +13010,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 8,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -13121,7 +13027,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
     dim_t k_iter;                         //number of times GEMM to be performed
 
     double AlphaVal = *(double *)AlphaObj->buffer;    //value of alpha
-    double *L =  a->buffer;               //pointer to  matrix A
+    double *L =  bli_obj_buffer_at_off(a);               //pointer to  matrix A
     double *B =  bli_obj_buffer_at_off(b);       //pointer to matrix B
 
     //pointers that point to blocks for GEMM and TRSM
@@ -13140,6 +13046,14 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
 
     __m128d xmm5;
 
+    //gcc12 throws an uninitialized-variable warning;
+    //to avoid it, these variables are set to zero.
+    ymm0 = _mm256_setzero_pd();
+    ymm1 = _mm256_setzero_pd();
+    ymm2 = _mm256_setzero_pd();
+    ymm3 = _mm256_setzero_pd();
+
+
     xmm5 = _mm_setzero_pd();
 
     gint_t required_packing_A = 1;
@@ -13173,7 +13087,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
     }
 
     /*
-        Performs solving TRSM for 8 colmns at a time from  0 to m/d_mr in steps of d_mr
+        Performs solving TRSM for 8 columns at a time from 0 to m/d_mr in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-d_mr)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -13195,15 +13109,15 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
             /*
               Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 8x(m-8) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_dtrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
 
                /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -13220,7 +13134,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
             c. This loop GEMM+TRSM loops operates with 8x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         for(j = (n - d_nr); (j + 1) > 0; j -= d_nr)
         {
@@ -13234,16 +13148,16 @@ BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-                Peform GEMM between a10 and b01 blocks
-                For first itteration there will be no GEMM operation
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
                 where k_iter are zero
             */
             BLIS_DTRSM_SMALL_GEMM_8mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal)
 
@@ -15109,7 +15023,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 8,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -15126,7 +15040,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
     dim_t k_iter;     //number of times GEMM to be performed
 
     double AlphaVal = *(double *)AlphaObj->buffer;    //value of alpha
-    double *L =  a->buffer;       //pointer to  matrix A
+    double *L =  bli_obj_buffer_at_off(a);       //pointer to  matrix A
     double *B =  bli_obj_buffer_at_off(b);       //pointer to matrix B
 
     double *a10, *a11, *b01, *b11;    //pointers that point to blocks for GEMM and TRSM
@@ -15144,6 +15058,14 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
 
     __m128d xmm5;
 
+    //gcc12 throws an uninitialized-variable warning;
+    //to avoid it, these variables are set to zero.
+    ymm0 = _mm256_setzero_pd();
+    ymm1 = _mm256_setzero_pd();
+    ymm2 = _mm256_setzero_pd();
+    ymm3 = _mm256_setzero_pd();
+
+
     xmm5 = _mm_setzero_pd();
 
     gint_t required_packing_A = 1;
@@ -15177,7 +15099,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
     }
 
     /*
-        Performs solving TRSM for 8 colmns at a time from  0 to m/8 in steps of d_mr
+        Performs solving TRSM for 8 columns at a time from 0 to m/8 in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-8)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -15193,17 +15115,17 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
         if(transa)
         {
             /*
-              Load, tranpose and pack current A block (a10) into packed buffer memory D_A_pack
+              Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 8x(m-8) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_dtrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
 
             /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             dtrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -15220,7 +15142,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
             c. This loop GEMM+TRSM loops operates with 8x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         dim_t temp = n - d_nr + 1;
         for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -15236,16 +15158,16 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-              Peform GEMM between a10 and b01 blocks
-              For first itteration there will be no GEMM operation
+              Perform GEMM between a10 and b01 blocks
+              For first iteration there will be no GEMM operation
               where k_iter are zero
             */
             BLIS_DTRSM_SMALL_GEMM_8mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_DTRSM_SMALL_NREG_TRANSPOSE_6x8(b11,cs_b,AlphaVal)
 
@@ -17147,7 +17069,7 @@ BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB
 
 /*
     Pack diagonal elements of A block (16 or 6) into an array
-    a. This helps in utilze cache line efficiently in TRSM operation
+    a. This helps to utilize cache line efficiently in TRSM operation
     b. store ones when input is unit diagonal
 */
 BLIS_INLINE void strsm_small_pack_diag_element
@@ -17708,7 +17630,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
     dim_t cs_a, rs_a;
     dim_t d_mr = 16,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -17729,8 +17651,8 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
     bool is_unitdiag = bli_obj_has_unit_diag(a);
 
     float AlphaVal = *(float *)AlphaObj->buffer;    //value of Alpha
-    float* restrict L = a->buffer;      //pointer to matrix A
-    float* restrict B = b->buffer;      //pointer to matrix B
+    float* restrict L = bli_obj_buffer_at_off(a);      //pointer to matrix A
+    float* restrict B = bli_obj_buffer_at_off(b);      //pointer to matrix B
 
     float *a01, *a11, *b10, *b11;   //pointers for GEMM and TRSM blocks
 
@@ -17774,6 +17696,10 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
 
     xmm5 = _mm_setzero_ps();
 
+    //gcc12 throws an uninitialized-variable warning;
+    //to avoid it, ymm0 is set to zero.
+    ymm0 = _mm256_setzero_ps();
+
     /*
     Performs solving TRSM for 6 rows at a time from  0 to n/6 in steps of d_nr
     a. Load and pack A (a01 block), the size of packing 6x6 to 6x (n-6)
@@ -17804,7 +17730,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             strsm_small_pack_diag_element('R',is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -17836,8 +17762,8 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_6nx16m(a01,b10,cs_b,p_lda,k_iter)
@@ -17845,7 +17771,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAutB_XAlB
             /*
             Load b11 of size 16x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_STRSM_SMALL_6x16(AlphaVal,b11,cs_b)
@@ -21373,7 +21299,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
     bool transa = bli_obj_has_trans(a);
     dim_t cs_a, rs_a;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -21394,8 +21320,8 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
     bool is_unitdiag = bli_obj_has_unit_diag(a);
 
     float AlphaVal = *(float *)AlphaObj->buffer;    //value of Alpha
-    float* restrict L = a->buffer;      //pointer to matrix A
-    float* restrict B = b->buffer;      //pointer to matrix B
+    float* restrict L = bli_obj_buffer_at_off(a);      //pointer to matrix A
+    float* restrict B = bli_obj_buffer_at_off(b);      //pointer to matrix B
 
     float *a01, *a11, *b10, *b11;   //pointers for GEMM and TRSM blocks
 
@@ -21438,6 +21364,10 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
     __m128 xmm5;
 
     xmm5 = _mm_setzero_ps();
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, this variable is set to zero.
+    ymm0 = _mm256_setzero_ps();
+
 
     /*
     Performs solving TRSM for 6 rows at a time from  0 to n/6 in steps of d_nr
@@ -21471,7 +21401,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
 
             /*
                Pack 6 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
 
@@ -21504,8 +21434,8 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-            Peform GEMM between a01 and b10 blocks
-            For first itteration there will be no GEMM operation
+            Perform GEMM between a01 and b10 blocks
+            For first iteration there will be no GEMM operation
             where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_6nx16m(a01,b10,cs_b,p_lda,k_iter)
@@ -21513,7 +21443,7 @@ BLIS_INLINE  err_t bli_strsm_small_XAltB_XAuB
             /*
             Load b11 of size 16x6 and multiply with alpha
             Add the GEMM output to b11
-            and peform TRSM operation.
+            and perform TRSM operation.
             */
 
             BLIS_PRE_STRSM_SMALL_6x16(AlphaVal,b11,cs_b)
@@ -25216,7 +25146,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 16,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -25233,8 +25163,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
     dim_t k_iter;     //number of times GEMM to be performed
 
     float AlphaVal = *(float *)AlphaObj->buffer;    //value of alpha
-    float *L =  a->buffer;       //pointer to  matrix A
-    float *B =  b->buffer;       //pointer to matrix B
+    float *L =  bli_obj_buffer_at_off(a);       //pointer to  matrix A
+    float *B =  bli_obj_buffer_at_off(b);       //pointer to matrix B
 
     float *a10, *a11, *b01, *b11;    //pointers that point to blocks for GEMM and TRSM
 
@@ -25251,6 +25181,20 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
     __m256 ymm16, ymm17, ymm18, ymm19;
     __m256 ymm20,ymm21,ymm22;
 
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, these variables are set to zero.
+    ymm0 = _mm256_setzero_ps();
+    ymm1 = _mm256_setzero_ps();
+    ymm2 = _mm256_setzero_ps();
+    ymm3 = _mm256_setzero_ps();
+    ymm17 = _mm256_setzero_ps();
+    ymm18 = _mm256_setzero_ps();
+    ymm19 = _mm256_setzero_ps();
+    ymm20 = _mm256_setzero_ps();
+    ymm21 = _mm256_setzero_ps();
+    ymm22 = _mm256_setzero_ps();
+
+
     gint_t required_packing_A = 1;
     mem_t local_mem_buf_A_s = {0};
     float *D_A_pack = NULL;
@@ -25282,7 +25226,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
     }
 
     /*
-        Performs solving TRSM for 16 colmns at a time from  0 to m/16 in steps of d_mr
+        Performs solving TRSM for 16 columns at a time from  0 to m/16 in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 16x6 to 16x (m-16)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -25298,17 +25242,17 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
         if(transa)
         {
             /*
-              Load, tranpose and pack current A block (a10) into packed buffer memory D_A_pack
+              Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 16x(m-16) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_strsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
 
             /*
                Pack 16 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -25325,7 +25269,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
             c. This loop GEMM+TRSM loops operates with 16x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         dim_t temp = n - d_nr + 1;
         for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -25341,16 +25285,16 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-              Peform GEMM between a10 and b01 blocks
-              For first itteration there will be no GEMM operation
+              Perform GEMM between a10 and b01 blocks
+              For first iteration there will be no GEMM operation
               where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_16mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x16 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal)
 
@@ -25876,8 +25820,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
             /*
                Load b11 of size 6x16 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
             ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -26458,7 +26402,7 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
             a11 += rs_a;
 
-            // N-register tranpose and store
+            // N-register transpose and store
 
             ymm0 = _mm256_unpacklo_ps(ymm10, ymm11);
             ymm1 = _mm256_unpacklo_ps(ymm17, ymm18);
@@ -26555,8 +26499,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
                 /*
                 Load b11 of size 6x16 and multiply with alpha
-                Add the GEMM output and perform inregister transose of b11
-                to peform TRSM operation.
+                Add the GEMM output and perform in register transpose of b11
+                to perform TRSM operation.
                 */
                 ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
                 ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -26640,8 +26584,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
                 /*
                 Load b11 of size 6x16 and multiply with alpha
-                Add the GEMM output and perform inregister transose of b11
-                to peform TRSM operation.
+                Add the GEMM output and perform in register transpose of b11
+                to perform TRSM operation.
                 */
                 ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
                 ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -26723,8 +26667,8 @@ BLIS_INLINE err_t bli_strsm_small_AutXB_AlXB
 
                 /*
                 Load b11 of size 6x16 and multiply with alpha
-                Add the GEMM output and perform inregister transose of b11
-                to peform TRSM operation.
+                Add the GEMM output and perform in register transpose of b11
+                to perform TRSM operation.
                 */
                 ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
                 ymm0 = _mm256_broadcast_ss((float const *)(&zero));
@@ -29582,7 +29526,7 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 16,d_nr = 6;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -29599,8 +29543,8 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
     dim_t k_iter;                         //number of times GEMM to be performed
 
     float AlphaVal = *(float *)AlphaObj->buffer;    //value of alpha
-    float *L =  a->buffer;               //pointer to  matrix A
-    float *B =  b->buffer;               //pointer to matrix B
+    float *L =  bli_obj_buffer_at_off(a);               //pointer to  matrix A
+    float *B =  bli_obj_buffer_at_off(b);               //pointer to matrix B
 
     //pointers that point to blocks for GEMM and TRSM
     float *a10, *a11, *b01, *b11;
@@ -29617,6 +29561,20 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
     __m256 ymm16, ymm17, ymm18, ymm19;
     __m256 ymm20, ymm21, ymm22;
 
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, these variables are set to zero.
+    ymm0 = _mm256_setzero_ps();
+    ymm1 = _mm256_setzero_ps();
+    ymm2 = _mm256_setzero_ps();
+    ymm17 = _mm256_setzero_ps();
+    ymm18 = _mm256_setzero_ps();
+    ymm19 = _mm256_setzero_ps();
+    ymm20 = _mm256_setzero_ps();
+    ymm21 = _mm256_setzero_ps();
+    ymm22 = _mm256_setzero_ps();
+
+
+
     gint_t required_packing_A = 1;
     mem_t local_mem_buf_A_s = {0};
     float *D_A_pack = NULL;
@@ -29670,15 +29628,15 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
             /*
               Load, transpose and pack current A block (a10) into packed buffer memory D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 16x(m-16) which is the maximum GEMM alone block size in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 16x(m-16) which is the maximum GEMM alone block size in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_strsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
 
                /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             strsm_small_pack_diag_element('L',is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -29709,16 +29667,16 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
             BLIS_SET_S_YMM_REG_ZEROS
 
             /*
-                Peform GEMM between a10 and b01 blocks
-                For first itteration there will be no GEMM operation
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
                 where k_iter are zero
             */
             BLIS_STRSM_SMALL_GEMM_16mx6n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x16 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_STRSM_SMALL_NREG_TRANSPOSE_6x16(b11,cs_b,AlphaVal)
 
@@ -31829,8 +31787,8 @@ BLIS_INLINE err_t bli_strsm_small_AltXB_AuXB
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             ymm16 = _mm256_broadcast_ss((float const *)(&AlphaVal));
 
@@ -33723,7 +33681,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -33740,8 +33698,8 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
     dim_t k_iter;     //number of times GEMM to be performed
 
     dcomplex AlphaVal = *(dcomplex *)AlphaObj->buffer;    //value of alpha
-    dcomplex *L =  a->buffer;       //pointer to  matrix A
-    dcomplex *B =  b->buffer;       //pointer to matrix B
+    dcomplex *L =  bli_obj_buffer_at_off(a);       //pointer to  matrix A
+    dcomplex *B =  bli_obj_buffer_at_off(b);       //pointer to matrix B
 
     dcomplex *a10, *a11, *b01, *b11;    //pointers that point to blocks for GEMM and TRSM
 
@@ -33760,6 +33718,15 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
     xmm4 = _mm_setzero_pd();
     xmm5 = _mm_setzero_pd();
 
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, these variables are set to zero.
+    ymm0 = _mm256_setzero_pd();
+    ymm1 = _mm256_setzero_pd();
+    ymm2 = _mm256_setzero_pd();
+    ymm10 = _mm256_setzero_pd();
+    ymm11 = _mm256_setzero_pd();
+
+
     gint_t required_packing_A = 1;
     mem_t local_mem_buf_A_s = {0};
     dcomplex *D_A_pack = NULL;
@@ -33791,7 +33758,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
     }
 
     /*
-        Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+        Performs solving TRSM for 4 columns at a time from  0 to m/4 in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -33807,19 +33774,19 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
         if(transa)
         {
             /*
-              Load, tranpose and pack current A block (a10) into packed buffer memory
-          D_A_pack
+              Load, transpose and pack current A block (a10) into packed buffer memory
+              D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 4x(m-4) which is the maximum GEMM alone block size
-         in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 4x(m-4) which is the maximum GEMM alone block size
+                 in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_ztrsm_small_pack('L', i, 1, a10, cs_a, D_A_pack, p_lda,d_mr);
 
             /*
                Pack 4 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -33835,7 +33802,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
             c. This loop GEMM+TRSM loops operates with 4x3 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         dim_t temp = n - d_nr + 1;
         for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -33851,16 +33818,16 @@ BLIS_INLINE err_t bli_ztrsm_small_AutXB_AlXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-              Peform GEMM between a10 and b01 blocks
-              For first itteration there will be no GEMM operation
+              Perform GEMM between a10 and b01 blocks
+              For first iteration there will be no GEMM operation
               where k_iter are zero
             */
             BLIS_ZTRSM_SMALL_GEMM_4mx3n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 3x4 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal)
         /*
@@ -34956,7 +34923,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -34973,8 +34940,8 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
     dim_t k_iter;                         //number of times GEMM to be performed
 
     dcomplex AlphaVal = *(dcomplex *)AlphaObj->buffer;    //value of alpha
-    dcomplex *L =  a->buffer;               //pointer to  matrix A
-    dcomplex *B =  b->buffer;               //pointer to matrix B
+    dcomplex *L =  bli_obj_buffer_at_off(a);               //pointer to  matrix A
+    dcomplex *B =  bli_obj_buffer_at_off(b);               //pointer to matrix B
 
     //pointers that point to blocks for GEMM and TRSM
     dcomplex *a10, *a11, *b01, *b11;
@@ -34994,6 +34961,14 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
     xmm4 = _mm_setzero_pd();
     xmm5 = _mm_setzero_pd();
 
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, these variables are set to zero.
+    ymm0 = _mm256_setzero_pd();
+    ymm1 = _mm256_setzero_pd();
+    ymm2 = _mm256_setzero_pd();
+    ymm10 = _mm256_setzero_pd();
+    ymm11 = _mm256_setzero_pd();
+
     gint_t required_packing_A = 1;
     mem_t local_mem_buf_A_s = {0};
     dcomplex *D_A_pack = NULL;
@@ -35025,7 +35000,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
     }
 
     /*
-        Performs solving TRSM for 4 colmns at a time from  0 to m/d_mr in steps of d_mr
+        Performs solving TRSM for 4 columns at a time from  0 to m/d_mr in steps of d_mr
         a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-d_mr)
            First there will be no GEMM and no packing of a10 because it is only TRSM
         b. Using packed a10 block and b01 block perform GEMM operation
@@ -35046,18 +35021,18 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
         {
             /*
               Load, transpose and pack current A block (a10) into packed buffer memory
-          D_A_pack
+              D_A_pack
               a. This a10 block is used in GEMM portion only and this
-                 a10 block size will be increasing by d_mr for every next itteration
-                 untill it reaches 4x(m-4) which is the maximum GEMM alone block size
-         in A
+                 a10 block size will be increasing by d_mr for every next iteration
+                 until it reaches 4x(m-4) which is the maximum GEMM alone block size
+                 in A
               b. This packed buffer is reused to calculate all n rows of B matrix
             */
             bli_ztrsm_small_pack('L', (m-i-d_mr), 1, a10, cs_a, D_A_pack,p_lda,d_mr);
 
                /*
                Pack 8 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM operation
+               a. This helps to utilize cache line efficiently in TRSM operation
                b. store ones when input is unit diagonal
             */
             ztrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -35074,7 +35049,7 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
             c. This loop GEMM+TRSM loops operates with 8x6 block size
                along n dimension for every d_nr rows of b01 where
                packed A buffer is reused in computing all n rows of B.
-            d. Same approch is used in remaining fringe cases.
+            d. Same approach is used in remaining fringe cases.
         */
         for(j = (n - d_nr); (j + 1) > 0; j -= d_nr)
         {
@@ -35088,16 +35063,16 @@ BLIS_INLINE err_t bli_ztrsm_small_AltXB_AuXB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-                Peform GEMM between a10 and b01 blocks
-                For first itteration there will be no GEMM operation
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
                 where k_iter are zero
             */
             BLIS_ZTRSM_SMALL_GEMM_4mx3n(a10,b01,cs_b,p_lda,k_iter)
 
             /*
                Load b11 of size 6x8 and multiply with alpha
-               Add the GEMM output and perform inregister transose of b11
-               to peform TRSM operation.
+               Add the GEMM output and perform in register transpose of b11
+               to perform TRSM operation.
             */
             BLIS_ZTRSM_SMALL_NREG_TRANSPOSE_3x4(b11,cs_b,AlphaVal)
 
@@ -36180,7 +36155,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -36201,8 +36176,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
     bool is_unitdiag = bli_obj_has_unit_diag(a);
 
     dcomplex AlphaVal = *(dcomplex *)AlphaObj->buffer;    //value of Alpha
-    dcomplex* restrict L = a->buffer;      //pointer to matrix A
-    dcomplex* restrict B = b->buffer;      //pointer to matrix B
+    dcomplex* restrict L = bli_obj_buffer_at_off(a);      //pointer to matrix A
+    dcomplex* restrict B = bli_obj_buffer_at_off(b);      //pointer to matrix B
 
     dcomplex *a01, *a11, *b10, *b11;   //pointers for GEMM and TRSM blocks
 
@@ -36246,6 +36221,10 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
     __m128d xmm5;
 
     xmm5 = _mm_setzero_pd();
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, this variable is set to zero.
+    ymm1= _mm256_setzero_pd();
+
 
     for(j = (n-d_nr); (j+1) > 0; j -= d_nr)     //loop along 'N' direction
     {
@@ -36271,7 +36250,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
 
             /*
                Pack 3 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM
+               a. This helps to utilize cache line efficiently in TRSM
                operation
                b. store ones when input is unit diagonal
                */
@@ -36307,8 +36286,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
 
@@ -36317,7 +36296,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*
                Load b11 multiply with alpha
                Add the GEMM output to b11
-               and peform TRSM operation.
+               and perform TRSM operation.
                */
 
             BLIS_PRE_ZTRSM_SMALL_3x4(AlphaVal,b11,cs_b)
@@ -36490,8 +36469,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                  * accumulations */
                 BLIS_SET_YMM_REG_ZEROS
                 /*
-                   Peform GEMM between a01 and b10 blocks
-                   For first itteration there will be no GEMM operation
+                   Perform GEMM between a01 and b10 blocks
+                   For first iteration there will be no GEMM operation
                    where k_iter are zero
                    */
 
@@ -36500,7 +36479,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                 /*
                    Load b11 multiply with alpha
                    Add the GEMM output to b11
-                   and peform TRSM operation.
+                   and perform TRSM operation.
                    */
 
                 BLIS_PRE_ZTRSM_SMALL_3x3(AlphaVal,b11,cs_b)
@@ -36683,8 +36662,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                  * accumulations */
                 BLIS_SET_YMM_REG_ZEROS
                 /*
-                   Peform GEMM between a01 and b10 blocks
-                   For first itteration there will be no GEMM operation
+                   Perform GEMM between a01 and b10 blocks
+                   For first iteration there will be no GEMM operation
                    where k_iter are zero
                    */
 
@@ -36693,7 +36672,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                 /*
                    Load b11 of size 8x6 and multiply with alpha
                    Add the GEMM output to b11
-                   and peform TRSM operation.
+                   and perform TRSM operation.
                    */
 
                 BLIS_PRE_ZTRSM_SMALL_3x2(AlphaVal,b11,cs_b)
@@ -36831,8 +36810,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                  * accumulations */
                 BLIS_SET_YMM_REG_ZEROS
                 /*
-                   Peform GEMM between a01 and b10 blocks
-                   For first itteration there will be no GEMM operation
+                   Perform GEMM between a01 and b10 blocks
+                   For first iteration there will be no GEMM operation
                    where k_iter are zero
                    */
 
@@ -36841,7 +36820,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
                 /*
                    Load b11 and multiply with alpha
                    Add the GEMM output to b11
-                   and peform TRSM operation.
+                   and perform TRSM operation.
                    */
 
                 BLIS_PRE_ZTRSM_SMALL_3x1(AlphaVal,b11,cs_b)
@@ -37165,8 +37144,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*Fill zeros into ymm registers used in gemm accumulations */
             BLIS_SET_YMM_REG_ZEROS
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
             //BLIS_ZTRSM_SMALL_GEMM_3nx3m(a01,b10,cs_b,p_lda,k_iter)
@@ -37254,8 +37233,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*Fill zeros into ymm registers used in gemm accumulations */
             BLIS_SET_YMM_REG_ZEROS
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
             BLIS_ZTRSM_SMALL_GEMM_2nx2m(a01,b10,cs_b,p_lda,k_iter)
@@ -37325,8 +37304,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAutB_XAlB
             /*Fill zeros into ymm registers used in gemm accumulations */
             BLIS_SET_YMM_REG_ZEROS
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
             BLIS_ZTRSM_SMALL_GEMM_2nx1m(a01,b10,cs_b,p_lda,k_iter)
@@ -37643,7 +37622,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
     dim_t cs_a, rs_a;
     dim_t d_mr = 4,d_nr = 3;
 
-    // Swap rs_a & cs_a in case of non-tranpose.
+    // Swap rs_a & cs_a in case of non-transpose.
     if(transa)
     {
         cs_a = bli_obj_col_stride(a); // column stride of A
@@ -37664,8 +37643,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
     bool is_unitdiag = bli_obj_has_unit_diag(a);
 
     dcomplex AlphaVal = *(dcomplex *)AlphaObj->buffer;    //value of Alpha
-    dcomplex* restrict L = a->buffer;      //pointer to matrix A
-    dcomplex* restrict B = b->buffer;      //pointer to matrix B
+    dcomplex* restrict L = bli_obj_buffer_at_off(a);      //pointer to matrix A
+    dcomplex* restrict B = bli_obj_buffer_at_off(b);      //pointer to matrix B
 
     dcomplex *a01, *a11, *b10, *b11;   //pointers for GEMM and TRSM blocks
 
@@ -37710,6 +37689,9 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
 
     xmm5 = _mm_setzero_pd();
 
+    //gcc12 throws an uninitialized-variable warning,
+    //to avoid that, this variable is set to zero.
+    ymm1 = _mm256_setzero_pd();
     for(j = 0; (j+d_nr-1) < n; j += d_nr)     //loop along 'N' direction
     {
         a01 = L + j*rs_a;//pointer to block of A to be used in GEMM
@@ -37733,7 +37715,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
 
             /*
                Pack 3 diagonal elements of A block into an array
-               a. This helps in utilze cache line efficiently in TRSM
+               a. This helps to utilize cache line efficiently in TRSM
                operation
                b. store ones when input is unit diagonal
                */
@@ -37768,8 +37750,8 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
             BLIS_SET_YMM_REG_ZEROS
 
             /*
-               Peform GEMM between a01 and b10 blocks
-               For first itteration there will be no GEMM operation
+               Perform GEMM between a01 and b10 blocks
+               For first iteration there will be no GEMM operation
                where k_iter are zero
                */
 
@@ -37778,7 +37760,7 @@ BLIS_INLINE err_t bli_ztrsm_small_XAltB_XAuB
             /*
                Load b11 of size 4x3 and multiply with alpha
                Add the GEMM output to b11
-               and peform TRSM operation.
+               and perform TRSM operation.
                */
 
             BLIS_PRE_ZTRSM_SMALL_3x4(AlphaVal,b11,cs_b)
@@ -39506,6 +39488,16 @@ BLIS_INLINE void ctrsm_small_pack_diag_element
 	dim_t size
 )
 {
+    if ( is_unitdiag )
+    {
+        scomplex ones = {1.0, 0.0};
+        for( dim_t i = 0; i < size; i++)
+        {
+            d11_pack[i].real = ones.real;
+            d11_pack[i].imag = ones.imag;
+        }
+        return;
+    }
 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
 	// If Preinversion is disabled, inverse the diaganol
 	// elements from A and pack into diagonal buffer.
@@ -42230,7 +42222,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -42275,6 +42267,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 	xmm3 = _mm_setzero_ps();
 	xmm4 = _mm_setzero_ps();
 	xmm5 = _mm_setzero_ps();
+	//gcc12 throws an uninitialized-variable warning;
+	//to avoid it, these variables are set to zero.
+	ymm0= _mm256_setzero_ps();
+	ymm1= _mm256_setzero_ps();
+	ymm2= _mm256_setzero_ps();
     
         gint_t required_packing_A = 1;
 	mem_t local_mem_buf_A_s = {0};
@@ -42307,7 +42304,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 	}
     
     /*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Performs solving TRSM for 4 columns at a time from  0 to m/4 in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -42323,11 +42320,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -42335,7 +42332,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -42351,7 +42348,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 		dim_t temp = n - d_nr + 1;
 		for(j = 0; j < temp; j += d_nr)   //loop along 'N' dimension
@@ -42367,16 +42364,16 @@ BLIS_INLINE err_t bli_ctrsm_small_AutXB_AlXB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_8mx3n(a10,b01,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform in register transpose of b11
+			   to perform TRSM operation.
 			   */
 			BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x8(b11,cs_b,AlphaVal)
 			/*
@@ -44762,7 +44759,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -44808,6 +44805,12 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 	xmm4 = _mm_setzero_ps();
 	xmm5 = _mm_setzero_ps();
 
+	//gcc12 throws an uninitialized-variable warning;
+	//to avoid it, these variables are set to zero.
+	ymm0 = _mm256_setzero_ps();
+	ymm1 = _mm256_setzero_ps();
+	ymm2 = _mm256_setzero_ps();
+
 	gint_t required_packing_A = 1;
 	mem_t local_mem_buf_A_s = {0};
 	scomplex *D_A_pack = NULL;
@@ -44839,7 +44842,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 	}
 
 	/*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Performs solving TRSM for 4 columns at a time from  0 to m/4 in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -44857,11 +44860,11 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -44869,7 +44872,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_mr);
@@ -44885,7 +44888,7 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 
 		for(j = (n - d_nr); (j + 1) > 0; j -= d_nr)   //loop along 'N' dimension
@@ -44901,16 +44904,16 @@ BLIS_INLINE err_t bli_ctrsm_small_AltXB_AuXB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_8mx3n(a10,b01,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform in register transpose of b11
+			   to perform TRSM operation.
 			   */
 			BLIS_CTRSM_SMALL_NREG_TRANSPOSE_3x8(b11,cs_b,AlphaVal)
 			/*
@@ -47543,7 +47546,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -47579,12 +47582,15 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 	__m256 ymm16, ymm17, ymm18, ymm19;
 
 	__m128 xmm0, xmm1, xmm2;
-    	__m128 xmm5;
+	__m128 xmm5;
 
         xmm0 = _mm_setzero_ps();
 	xmm1 = _mm_setzero_ps();
 	xmm2 = _mm_setzero_ps();
 	xmm5 = _mm_setzero_ps();
+	//gcc12 throws an uninitialized-variable warning;
+	//to avoid it, this variable is set to zero.
+	ymm0 = _mm256_setzero_ps();
 
 	gint_t required_packing_A = 1;
 	mem_t local_mem_buf_A_s = {0};
@@ -47617,7 +47623,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 	}
 
 	/*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Performs solving TRSM for 4 columns at a time from  0 to m/4 in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -47633,11 +47639,11 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -47645,7 +47651,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -47661,7 +47667,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 		for(i = (m-d_mr); (i+1) > 0; i -= d_mr)     //loop along 'M' direction
 		{
@@ -47677,16 +47683,16 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAutB_XAlB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_3nx8m(a01,b10,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform in register transpose of b11
+			   to perform TRSM operation.
 			   */
 			BLIS_PRE_CTRSM_SMALL_3x8(AlphaVal, b11, cs_b)
 			/*
@@ -49163,7 +49169,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 	dim_t cs_a, rs_a;
 	dim_t d_mr = 8,d_nr = 3;
 
-	// Swap rs_a & cs_a in case of non-tranpose.
+	// Swap rs_a & cs_a in case of non-transpose.
 	if(transa)
 	{
 		cs_a = bli_obj_col_stride(a); // column stride of A
@@ -49205,6 +49211,10 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 	xmm2 = _mm_setzero_ps();
 	xmm5 = _mm_setzero_ps();
 
+	//gcc12 throws an uninitialized-variable warning;
+	//to avoid it, this variable is set to zero.
+	ymm0 = _mm256_setzero_ps();
+
 	gint_t required_packing_A = 1;
 	mem_t local_mem_buf_A_s = {0};
 	scomplex *D_A_pack = NULL;
@@ -49236,7 +49246,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 	}
 
 	/*
-	   Performs solving TRSM for 4 colmns at a time from  0 to m/4 in steps of d_mr
+	   Performs solving TRSM for 4 columns at a time from  0 to m/4 in steps of d_mr
 	   a. Load, transpose, Pack A (a10 block), the size of packing 4x3 to 4x (m-4)
 	   First there will be no GEMM and no packing of a10 because it is only TRSM
 	   b. Using packed a10 block and b01 block perform GEMM operation
@@ -49253,11 +49263,11 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 		if(transa)
 		{
 			/*
-			   Load, tranpose and pack current A block (a10) into packed buffer memory
+			   Load, transpose and pack current A block (a10) into packed buffer memory
 			   D_A_pack
 			   a. This a10 block is used in GEMM portion only and this
-			   a10 block size will be increasing by d_mr for every next itteration
-			   untill it reaches 4x(m-4) which is the maximum GEMM alone block size
+			   a10 block size will be increasing by d_mr for every next iteration
+			   until it reaches 4x(m-4) which is the maximum GEMM alone block size
 			   in A
 			   b. This packed buffer is reused to calculate all n rows of B matrix
 			   */
@@ -49265,7 +49275,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 
 			/*
 			   Pack 4 diagonal elements of A block into an array
-			   a. This helps in utilze cache line efficiently in TRSM operation
+			   a. This helps to utilize cache line efficiently in TRSM operation
 			   b. store ones when input is unit diagonal
 			   */
 			ctrsm_small_pack_diag_element(is_unitdiag,a11,cs_a,d11_pack,d_nr);
@@ -49281,7 +49291,7 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 		   c. This loop GEMM+TRSM loops operates with 4x3 block size
 		   along n dimension for every d_nr rows of b01 where
 		   packed A buffer is reused in computing all n rows of B.
-		   d. Same approch is used in remaining fringe cases.
+		   d. Same approach is used in remaining fringe cases.
 		   */
 		for(i = 0; (i+d_mr-1) < m; i += d_mr)     //loop along 'M' direction
 		{
@@ -49297,16 +49307,16 @@ BLIS_INLINE  err_t bli_ctrsm_small_XAltB_XAuB
 			BLIS_SET_S_YMM_REG_ZEROS
 
 			/*
-			   Peform GEMM between a10 and b01 blocks
-			   For first itteration there will be no GEMM operation
+			   Perform GEMM between a10 and b01 blocks
+			   For first iteration there will be no GEMM operation
 			   where k_iter are zero
 			   */
 			BLIS_CTRSM_SMALL_GEMM_3nx8m(a01,b10,cs_b,p_lda,k_iter)
 
 			/*
 			   Load b11 of size 3x4 and multiply with alpha
-			   Add the GEMM output and perform inregister transose of b11
-			   to peform TRSM operation.
+			   Add the GEMM output and perform in register transpose of b11
+			   to perform TRSM operation.
 			   */
 			BLIS_PRE_CTRSM_SMALL_3x8(AlphaVal, b11, cs_b)
 			/*
diff --git a/kernels/zen/3/bli_zgemm_ref_k1.c b/kernels/zen/3/bli_zgemm_avx2_k1.c
similarity index 99%
rename from kernels/zen/3/bli_zgemm_ref_k1.c
rename to kernels/zen/3/bli_zgemm_avx2_k1.c
index 47de706238..a6a92f9a54 100644
--- a/kernels/zen/3/bli_zgemm_ref_k1.c
+++ b/kernels/zen/3/bli_zgemm_avx2_k1.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -107,7 +107,7 @@
     NEG_PERM_M_FRINGE(rin_0,rn);    \
     rout_0 = _mm256_fmadd_pd(rbc, rin_0, rout_0);    \
 
-void bli_zgemm_ref_k1_nn
+void bli_zgemm_4x6_avx2_k1_nn
 (
     dim_t  m,
     dim_t  n,
@@ -155,6 +155,9 @@ void bli_zgemm_ref_k1_nn
     __m256d ymm12, ymm13, ymm14, ymm15;
     __m128d xmm5;
 
+    //gcc12 throws an uninitialized-variable warning;
+    //to avoid it, this variable is set to zero.
+    ymm0 = _mm256_setzero_pd();
     /* Form C = alpha*A*B + beta*c */
     // Main loop along N dimension
     for(dim_t j = 0;j < (n-Z_NR+1);j=j+Z_NR)
@@ -1823,4 +1826,4 @@ void bli_zgemm_ref_k1_nn
 
     }
 
-}
\ No newline at end of file
+}
diff --git a/kernels/zen/3/sup/CMakeLists.txt b/kernels/zen/3/sup/CMakeLists.txt
index db764f8b0a..57f3ee01ff 100644
--- a/kernels/zen/3/sup/CMakeLists.txt
+++ b/kernels/zen/3/sup/CMakeLists.txt
@@ -1,10 +1,13 @@
-##Copyright (C) 2020, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(zen_3_sup
+     OBJECT
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16m.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_s6x16n.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_z3x4.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_z3x4m.c
+${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_asm_z3x4n.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_c3x8.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_c3x8m.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_c3x8n.c
@@ -15,3 +18,7 @@ ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4m.c
 ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_asm_z3x4n.c
     )
+target_compile_options(zen_3_sup PRIVATE /arch:AVX2)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen_3_sup PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c
index c309c8c0cd..3c47a910bb 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16.c
@@ -2,8 +2,10 @@
    BLIS
    An object-based framework for developing high-performance BLAS-like
    libraries.
+
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2022 , Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
    met:
@@ -15,6 +17,7 @@
     - Neither the name(s) of the copyright holder(s) nor the names of its
       contributors may be used to endorse or promote products derived
       from this software without specific prior written permission.
+
    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
@@ -27,7 +30,9 @@
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
+
 #include "blis.h"
+
 #define BLIS_ASM_SYNTAX_ATT
 #include "bli_x86_asm_macros.h"
 /*
@@ -328,6 +333,9 @@ void bli_sgemmsup_rd_zen_asm_2x16
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
       "memory"
     )
 }
@@ -560,6 +568,8 @@ void bli_sgemmsup_rd_zen_asm_1x16
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm2", "ymm3", "ymm4",
+      "ymm7", "ymm10", "ymm13",
       "memory"
     )
 }
@@ -858,6 +868,9 @@ void bli_sgemmsup_rd_zen_asm_2x8
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
       "memory"
     )
 }
@@ -1088,6 +1101,8 @@ void bli_sgemmsup_rd_zen_asm_1x8
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm2", "ymm3", "ymm4",
+      "ymm7", "ymm10", "ymm13",
       "memory"
     )
 }
@@ -1354,6 +1369,9 @@ void bli_sgemmsup_rd_zen_asm_2x4
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
       "memory"
     )
 }
@@ -1568,6 +1586,8 @@ void bli_sgemmsup_rd_zen_asm_1x4
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm2", "ymm3", "ymm4",
+      "ymm7", "ymm10", "ymm13",
       "memory"
     )
 }
@@ -1792,6 +1812,8 @@ void bli_sgemmsup_rd_zen_asm_2x2
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm3", "ymm4",
+      "ymm5", "ymm6", "ymm7",
       "memory"
     )
 }
@@ -1979,6 +2001,8 @@ void bli_sgemmsup_rd_zen_asm_1x2
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm3", "ymm4",
+      "ymm5",
       "memory"
     )
 }
@@ -2370,6 +2394,10 @@ void bli_sgemmsup_rd_zen_asm_6x2
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm3", "ymm4",
+      "ymm5", "ymm6", "ymm7", "ymm8",
+      "ymm9", "ymm10", "ymm11", "ymm12",
+      "ymm13", "ymm14", "ymm15",
       "memory"
     )
     consider_edge_cases:
@@ -2664,6 +2692,9 @@ void bli_sgemmsup_rd_zen_asm_3x2
       "xmm4", "xmm5", "xmm6", "xmm7",
       "xmm8", "xmm9", "xmm10", "xmm11",
       "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm3", "ymm4",
+      "ymm5", "ymm6", "ymm7", "ymm8",
+      "ymm9",
       "memory"
     )
 }
diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c
index 4eebb2b0a5..6d1d001b50 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16m.c
@@ -1,1965 +1,1981 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-
-   NOTE: These kernels implicitly support column-oriented IO, implemented
-   via an a high-level transposition of the entire operation. A and B will
-   effectively remain row- and column-stored, respectively, but C will then
-   effectively appear column-stored. Thus, this kernel may be used for both
-   rrc and crc cases.
-*/
-
-// Prototype reference microkernels.
-
-void bli_sgemmsup_rd_zen_asm_6x16m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-
-    uint64_t n_left = n0 % 16;
-
-    // First check whether this is a edge case in the n dimension. If so,
-    // dispatch other 6x?m kernels, as needed.
-    if ( n_left )
-    {
-        float* restrict cij = c;
-        float* restrict bj  = b;
-        float* restrict ai  = a;
-
-        if ( 8 <= n_left )
-        {
-            const dim_t nr_cur = 8;
-
-            bli_sgemmsup_rd_zen_asm_6x8m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 4 <= n_left )
-        {
-            const dim_t nr_cur = 4;
-
-            bli_sgemmsup_rd_zen_asm_6x4m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_6x2m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, m0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-        return;
-    }
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 3
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------              :
+     --------        ------              :
+
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
+*/
+
+// Prototype reference microkernels.
+
+void bli_sgemmsup_rd_zen_asm_6x16m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+
+    uint64_t n_left = n0 % 16;
+
+    // First check whether this is an edge case in the n dimension. If so,
+    // dispatch other 6x?m kernels, as needed.
+    if ( n_left )
+    {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+
+            bli_sgemmsup_rd_zen_asm_6x8m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+
+            bli_sgemmsup_rd_zen_asm_6x4m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_6x2m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, m0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(r11, rsi)                     // rsi *= cs_b;
+    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 3
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15 
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(16), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 16;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x16
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x16
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_6x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 1
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 3
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(8), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 8;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x8
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x8
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
-
-
-void bli_sgemmsup_rd_zen_asm_6x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 3
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(16), r15)                   // compare jj to 16
+    jl(.SLOOP3X4J)                    // if jj < 16, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 16;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x16
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x16
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_6x8m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(imm(1*4), rsi)                // rsi *= cs_c*sizeof(float) = 1*4
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(r11, rsi)                     // rsi *= cs_b;
+    lea(mem(rdx, rsi, 1), rdx)         // rdx = b + 4*jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8*cs_a = 8*4;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 1
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 3
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15 
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(4), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 4;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x4
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x4
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_6x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
- 
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    // ---------------------------------- iteration 1
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    // ---------------------------------- iteration 3
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  
-                                       // ymm5  ymm8 
-                                       // ymm6  ymm9 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps(xmm0,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps(xmm0,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps(xmm0,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7)
-                                       // ymm5 = sum(ymm5) sum(ymm8)
-                                       // ymm6 = sum(ymm6) sum(ymm9)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vmovsd(mem(rcx), xmm0)////a0a1
-    vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1)
-    vmovsd(xmm4, mem(rcx))//a0a1
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
     add(rdi, rcx)
-    vmovsd(mem(rcx), xmm0)
-    vfmadd231ps(xmm0, xmm3, xmm5)
-    vmovsd(xmm5, mem(rcx))
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
     add(rdi, rcx)
-    vmovsd(mem(rcx), xmm0)
-    vfmadd231ps(xmm0, xmm3, xmm6)
-    vmovsd(xmm6, mem(rcx))
 
-    jmp(.SDONE)                        // jump to end.
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+    jmp(.SDONE)                        // jump to end.
 
     label(.SBETAZERO)
-    label(.SROWSTORBZ)
 
-    vmovsd(xmm4, mem(rcx))
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
     add(rdi, rcx)
-    vmovsd(xmm5, mem(rcx))
+
+    vmovups(xmm5, mem(rcx))
     add(rdi, rcx)
-    vmovsd(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(4), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 2;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(8), r15)                   // compare jj to 8
+    jl(.SLOOP3X4J)                    // if jj < 8, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 8;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x8
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x8
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+
+
+void bli_sgemmsup_rd_zen_asm_6x4m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes C := beta*C + alpha*A*B for an m0 x 4 panel of C, three rows
+    // per ii pass, each element as a dot product over k. The asm never reads
+    // cs_a, rs_b, cs_c or n0: A, B and the rows of C are walked contiguously.
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled, 8-wide main loop
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;  // 8-wide passes over the remainder
+    uint64_t k_left1  = k_left32 % 8;  // final one-element k iterations
+
+    uint64_t m_iter = m0 / 3;          // full 3-row tiles handled by the asm
+    uint64_t m_left = m0 % 3;          // 0, 1 or 2 rows left for the edge kernels
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 ] (jj steps by 4 while jj < 4)
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)                   // reload address of b
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
+    imul(imm(1*4), rsi)                // rsi *= 1*sizeof(float) (unit column stride of c)
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + jj*sizeof(float);
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
+    imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
+    lea(mem(rdx, rsi, 1), rdx)         // rdx = b + jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 ]
+
+
+                                       // clear the twelve accumulators: one ymm
+                                       // per (row, column) pair of the 3x4 tile
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 3
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1 float = 1*4 bytes;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1 float = 1*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // xmm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomiss(xmm0, xmm3)               // set ZF if beta == 0 (single-precision compare).
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)  // xmm4 += beta * c[row 0, 0:4]
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))            // store without reading c
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(4), r15)                   // compare jj to 4
+    jl(.SLOOP3X4J)                    // if jj < 4, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle the m0 % 3 leftover rows (if any) with the 2x4 or 1x4 kernel.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 4;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x4
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            // m_left is 2 or 1, never both, so cij/ai need not be advanced.
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x4
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_6x2m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes C := beta*C + alpha*A*B for an m0 x 2 panel of C, three rows
+    // per ii pass, each element as a dot product over k. The asm never reads
+    // cs_a, rs_b, cs_c or n0: A, B and the rows of C are walked contiguously.
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled, 8-wide main loop
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;  // 8-wide passes over the remainder
+    uint64_t k_left1  = k_left32 % 8;  // final one-element k iterations
+
+    uint64_t m_iter = m0 / 3;          // full 3-row tiles handled by the asm
+    uint64_t m_left = m0 % 3;          // 0, 1 or 2 rows left for the edge kernels
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b (not used by this 2-column kernel)
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 ] (jj steps by 4 while jj < 4)
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)                   // reload address of b
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
+    imul(imm(1*4), rsi)                // rsi *= 1*sizeof(float) (unit column stride of c)
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + jj*sizeof(float);
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = jj;
+    imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
+    lea(mem(rdx, rsi, 1), rdx)         // rdx = b + jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 ]
+
+
+                                       // clear the accumulators; only ymm4-ymm9
+                                       // are accumulated into by this kernel
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+
+    // ---------------------------------- iteration 1
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    // ---------------------------------- iteration 3
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1 float = 1*4 bytes;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(1*4), rbx)                 // b += 1 float = 1*4 bytes;
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7
+                                       // ymm5  ymm8
+                                       // ymm6  ymm9
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)
+    vhaddps(xmm0,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+    vhaddps(xmm0,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+    vhaddps(xmm0,xmm0,xmm6)
+                                       // xmm4[0:2] = sum(ymm4) sum(ymm7)
+                                       // xmm5[0:2] = sum(ymm5) sum(ymm8)
+                                       // xmm6[0:2] = sum(ymm6) sum(ymm9)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomiss(xmm0, xmm3)               // set ZF if beta == 0 (single-precision compare).
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vmovsd(mem(rcx), xmm0)             // load the two floats of this row of c
+    vfmadd231ps(xmm0, xmm3, xmm4)      // xmm4 += beta * c
+    vmovsd(xmm4, mem(rcx))             // store the two updated floats
+    add(rdi, rcx)
+    vmovsd(mem(rcx), xmm0)
+    vfmadd231ps(xmm0, xmm3, xmm5)
+    vmovsd(xmm5, mem(rcx))
+    add(rdi, rcx)
+    vmovsd(mem(rcx), xmm0)
+    vfmadd231ps(xmm0, xmm3, xmm6)
+    vmovsd(xmm6, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+    label(.SROWSTORBZ)
+
+    vmovsd(xmm4, mem(rcx))             // store two floats without reading c
+    add(rdi, rcx)
+    vmovsd(xmm5, mem(rcx))
+    add(rdi, rcx)
+    vmovsd(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(4), r15)                   // compare jj to 4
+    jl(.SLOOP3X4J)                    // if jj < 4, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle the m0 % 3 leftover rows (if any) with the 2x2 or 1x2 kernel.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 2;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            // m_left is 2 or 1, never both, so cij/ai need not be advanced.
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c
index 7f0c856130..6b84594e39 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_s6x16n.c
@@ -1,1869 +1,1883 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-
-   NOTE: These kernels implicitly support column-oriented IO, implemented
-   via an a high-level transposition of the entire operation. A and B will
-   effectively remain row- and column-stored, respectively, but C will then
-   effectively appear column-stored. Thus, this kernel may be used for both
-   rrc and crc cases.
-*/
-
-void bli_sgemmsup_rd_zen_asm_6x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------              :
+     --------        ------              :
+
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
+*/
+
+void bli_sgemmsup_rd_zen_asm_6x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
 {
-    uint64_t m_left = m0 % 6;
-
-    // First check whether this is a edge case in the n dimension. If so,
-    // dispatch other ?x8m kernels, as needed.
-    if ( m_left )
-    {
-        float* restrict cij = c;
-        float* restrict bj  = b;
-        float* restrict ai  = a;
-
-        // We add special handling for slightly inflated MR blocksizes
-        // at edge cases, up to a maximum of 9.
-        if ( 6 < m0 )
-        {
-            sgemmsup_ker_ft ker_fp1 = NULL;
-            sgemmsup_ker_ft ker_fp2 = NULL;
-            dim_t           mr1, mr2;
-
-            if ( m0 == 7 )
-            {
-                mr1 = 6; mr2 = 1;
-                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
-                ker_fp2 = bli_sgemmsup_rd_zen_asm_1x16n;
-            }
-            else if ( m0 == 8 )
-            {
-                mr1 = 6; mr2 = 2;
-                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
-                ker_fp2 = bli_sgemmsup_rd_zen_asm_2x16n;
-            }
-            else // if ( m0 == 9 )
-            {
-                mr1 = 6; mr2 = 3;
-                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
-                ker_fp2 = bli_sgemmsup_rd_zen_asm_3x16n;
-            }
-
-            ker_fp1
-            (
-              conja, conjb, mr1, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += mr1*rs_c0; ai += mr1*rs_a0;
-
-            ker_fp2
-            (
-              conja, conjb, mr2, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-
-            return;
-        }
-
-        if ( 3 <= m_left )
-        {
-            const dim_t mr_cur = 3;
-
-            bli_sgemmsup_rd_zen_asm_3x16n
-            (
-              conja, conjb, mr_cur, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 2 <= m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x16n
-            (
-              conja, conjb, mr_cur, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_TRANSPOSE, conja, k0, n0,
-              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
-              beta, cij, cs_c0, cntx, NULL
-            );
-        }
-        return;
-    }
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r9)                    // ii = 0;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ 0 1 ... ]
-
-    mov(var(b), r14)                   // load address of b
-    mov(var(c), r12)                   // load address of c
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-    imul(rdi, rsi)                     // rsi *= rs_c
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 3*ii*rs_c;
-
-    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-    imul(r8,  rsi)                     // rsi *= rs_a;
-    lea(mem(rdx, rsi, 1), rdx)         // rax = a + 3*ii*rs_a;
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                       // now avoid loading C if beta == 0
-
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    add(imm(3), r9)                    // ii += 3;
-    cmp(imm(3), r9)                    // compare ii to 3
-    jle(.SLOOP3X4I)                    // if ii <= 3, jump to beginning
-                                       // of ii loop; otherwise, loop ends.
-
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 6;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_6x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_3x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-
-    mov(var(b), r14)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    mov(var(c), r12)                   // load address of c
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = unused
-    // r15 = n dim index jj
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
+    uint64_t m_left = m0 % 6;
+
+    // First check whether this is an edge case in the m dimension. If so,
+    // dispatch smaller ?x16n kernels (or gemv for a single row), as needed.
+    if ( m_left )
+    {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        // We add special handling for slightly inflated MR blocksizes
+        // at edge cases, up to a maximum of 9.
+        if ( 6 < m0 )
+        {
+            sgemmsup_ker_ft ker_fp1 = NULL;
+            sgemmsup_ker_ft ker_fp2 = NULL;
+            dim_t           mr1, mr2;
+
+            if ( m0 == 7 )
+            {
+                mr1 = 6; mr2 = 1;
+                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
+                ker_fp2 = bli_sgemmsup_rd_zen_asm_1x16n;
+            }
+            else if ( m0 == 8 )
+            {
+                mr1 = 6; mr2 = 2;
+                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
+                ker_fp2 = bli_sgemmsup_rd_zen_asm_2x16n;
+            }
+            else // if ( m0 == 9 )
+            {
+                mr1 = 6; mr2 = 3;
+                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
+                ker_fp2 = bli_sgemmsup_rd_zen_asm_3x16n;
+            }
+
+            ker_fp1
+            (
+              conja, conjb, mr1, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr1*rs_c0; ai += mr1*rs_a0;
+
+            ker_fp2
+            (
+              conja, conjb, mr2, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+
+            return;
+        }
+
+        if ( 3 <= m_left )
+        {
+            const dim_t mr_cur = 3;
+
+            bli_sgemmsup_rd_zen_asm_3x16n
+            (
+              conja, conjb, mr_cur, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x16n
+            (
+              conja, conjb, mr_cur, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_TRANSPOSE, conja, k0, n0,
+              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+              beta, cij, cs_c0, cntx, NULL
+            );
+        }
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r9)                    // ii = 0;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ 0 1 ... ]
+
+    mov(var(b), r14)                   // load address of b
+    mov(var(c), r12)                   // load address of c
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
+    imul(rdi, rsi)                     // rsi *= rs_c
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 3*ii*rs_c;
+
+    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
+    imul(r8,  rsi)                     // rsi *= rs_a;
+    lea(mem(rdx, rsi, 1), rdx)         // rax = a + 3*ii*rs_a;
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
+    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here because
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
                                        // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    label(.SRETURN)
-
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 3;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_3x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_2x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), r14)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    mov(var(c), r12)                   // load address of c
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = unused
-    // r15 = n dim index jj
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-
-                                       // now avoid loading C if beta == 0
-
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 2;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_1x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-
-    mov(var(b), r14)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
+
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    add(imm(3), r9)                    // ii += 3;
+    cmp(imm(3), r9)                    // compare ii to 3
+    jle(.SLOOP3X4I)                    // if ii <= 3, jump to beginning
+                                       // of ii loop; otherwise, loop ends.
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 6;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_6x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_3x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+
+    mov(var(b), r14)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
     lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    mov(var(c), r12)                   // load address of c
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = unused
-    // r15 = n dim index jj
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm13, ymm13, ymm13)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+
+    mov(var(c), r12)                   // load address of c
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = unused
+    // r15 = n dim index jj
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
     lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
+    add(imm(16*8), r10)                 // r10 += 32 floats = 16*8 bytes;
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here because
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1 float = 1*4 bytes;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1 float = 1*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
                                        // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 1;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_1x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sdotxv_ex
-            (
-              conja, conjb, k0,
-              alpha, ai, cs_a0, bj, rs_b0,
-              beta, cij, cntx, NULL
-            );
-        }
-    }
-}
-
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    label(.SRETURN)
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 3;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_3x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_2x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes C := beta*C + alpha*A*B for two rows of C, four columns per jj
+    // pass, as dot products over k; the n0 % 4 leftover columns are delegated below.
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;       // passes of the 4x-unrolled k loop
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;  // one-ymm (8 float) k steps
+    uint64_t k_left1  = k_left32 % 8;  // scalar k steps
+
+    uint64_t n_iter = n0 / 4;          // 4-column passes done in assembly
+    uint64_t n_left = n0 % 4;          // columns left for the edge-case code
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), r14)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+
+    mov(var(c), r12)                   // load address of c
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = unused
+    // r15 = n dim index jj
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 64 bytes
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 64 bytes
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 64 bytes
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 64 bytes
+    add(imm(16*8), r10)                 // r10 += 32 floats (128 bytes);
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    add(imm(1*4), rax)                 // a += 1 float (1*4 bytes);
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1 float (1*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // (two rows only: no ymm6/9/12/15 row here)
+
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+                                       // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+                                       // (xmm6 is not an accumulator in this kernel)
+
+                                       // now avoid loading C if beta == 0
+
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0. NOTE(review): sd compare on float beta -- confirm.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
+      "ymm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 2;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_1x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes C := beta*C + alpha*A*B for a single row of C, four columns per jj
+    // pass, as dot products over k; the n0 % 4 leftover columns are delegated below.
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+
+    mov(var(b), r14)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+
+    mov(var(c), r12)                   // load address of c
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = unused
+    // r15 = n dim index jj
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm13, ymm13, ymm13)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
+
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 64 bytes
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 64 bytes
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 64 bytes
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 64 bytes
+    add(imm(16*8), r10)                 // r10 += 32 floats (128 bytes);
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    add(imm(1*4), rax)                 // a += 1 float (1*4 bytes);
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1 float (1*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13
+                                       // (single-row kernel: only these four
+                                       //  accumulators are used)
+
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+                                       // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+
+                                       // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0. NOTE(review): sd compare on float beta -- confirm.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm2", "ymm3", "ymm4",
+      "ymm7", "ymm10", "ymm13",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 1;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_1x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sdotxv_ex
+            (
+              conja, conjb, k0,
+              alpha, ai, cs_a0, bj, rs_b0,
+              beta, cij, cntx, NULL
+            );
+        }
+    }
+}
+
diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c
new file mode 100644
index 0000000000..d07ee3ec07
--- /dev/null
+++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4.c
@@ -0,0 +1,1317 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------              :
+     --------        ------              :
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+*/
+
+void bli_zgemmsup_rd_zen_asm_2x4   // dcomplex kernel: C(2x4) := beta*C + alpha*(A*B), every entry formed as a dot product over k
+     (
+       conj_t       conja,     // not referenced in this function body
+       conj_t       conjb,     // not referenced in this function body
+       dim_t        m0,        // not referenced; the asm always processes 2 rows
+       dim_t        n0,        // not referenced; the asm always processes 4 columns
+       dim_t        k0,        // length of each dot product
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,   // cs_a0 unused: A is advanced by fixed 16-byte elements along k
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,   // rs_b0 unused: B is advanced by fixed 16-byte elements along k
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,   // cs_c0 unused: two adjacent columns of C are accessed as one 32-byte vector
+       auxinfo_t* restrict data,   // not referenced in this function body
+       cntx_t*    restrict cntx    // not referenced in this function body
+     )
+{
+    uint64_t k_iter8 = k0 / 8;        // trip count of the 4x-unrolled loop (each unrolled step consumes 2 k-elements)
+    uint64_t k_left8 = k0 % 8;        // k-elements left after the 8-wide passes
+    uint64_t k_iter4 = k_left8 / 4;   // trip count of the 2x-unrolled loop (0 or 1)
+    uint64_t k_left4 = k_left8 % 4;   // k-elements handled one at a time at the end (0..3)
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // beta == 0 selects the store-only path below, in which C is never read; any other beta takes the general path.
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    //-----------------------------------------------------------//
+    // Inline asm. rax/rbx/rcx walk A/B/C; r8, r10, rdi hold rs_a, cs_b, rs_c in bytes; r15 is the column index jj.
+
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)          // r8 *= 8
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c
+
+    mov(imm(0), r15)               // jj = 0
+    label(.ZLOOP3X4J)              // LOOP OVER jj = [ 0 2 ]; each pass produces columns jj and jj+1
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    lea(mem(, r15, 1), rsi)        // rsi = jj
+    imul(imm(1*16), rsi)            // rsi = 16*jj
+    lea(mem(r12, rsi, 1), r12)    // r12 += 16*jj  (jj elements along a row of C)
+
+    lea(mem(, r15, 1), rsi)        // rsi = jj
+    imul(r10, rsi)                 // rsi = jj * sizeof(dcomplex)*cs_b
+    lea(mem(r11, rsi, 1), r11)    // r11 += jj columns of B (in bytes)
+
+    vzeroall()                      // Reset all ymm registers (clears the accumulators for this column pair)
+    mov(r12, rcx)                  // rcx = c_iijj;
+    mov(r11, rbx)                  // rbx = b_jj;
+    mov(r14, rax)                  // rax = a_ii;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)             // ymm0 = 2 consecutive k-elements of A, row 0
+    vmovupd(mem(rax, r8, 1), ymm1)      // ymm1 = the same k-elements of A, row 1
+    add(imm(2*16), rax)            // a += 2 elements = 2*16 bytes (element step along k is hard-coded)
+
+    vmovupd(mem(rbx), ymm3)             // ymm3 = 2 k-elements of B, column jj
+    vfmadd231pd(ymm0, ymm3, ymm4)       // ymm4 += (a.re*b.re, a.im*b.im) per element, row 0
+    vfmadd231pd(ymm1, ymm3, ymm5)       // ymm5 += same products, row 1
+
+    vpermilpd(imm(0x5), ymm3, ymm3)     // swap re/im inside each B element
+    vfmadd231pd(ymm0, ymm3, ymm7)       // ymm7 += (a.re*b.im, a.im*b.re) per element, row 0
+    vfmadd231pd(ymm1, ymm3, ymm8)       // ymm8 += same products, row 1
+
+    vmovupd(mem(rbx, r10, 1), ymm3)     // ymm3 = 2 k-elements of B, column jj+1
+    vfmadd231pd(ymm0, ymm3, ymm10)      // ymm10/ymm11: as ymm4/ymm5 but for column jj+1
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)      // ymm13/ymm14: as ymm7/ymm8 but for column jj+1
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2 elements = 2*16 bytes (element step along k is hard-coded)
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)          // 2x-unrolled loop: 4 k-elements per pass
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)          // tail loop: one k-element per pass
+
+    vmovupd(mem(rax), xmm0)              // 128-bit load of a single element; only the low lanes carry data in the FMAs below
+    vmovupd(mem(rax, r8, 1), xmm1)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+    vfmadd231pd(ymm3, ymm1, ymm5)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)      // swap re/im of the single B element
+    vfmadd231pd(ymm3, ymm0, ymm7)
+    vfmadd231pd(ymm3, ymm1, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+    vfmadd231pd(ymm3, ymm1, ymm11)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    vfmadd231pd(ymm3, ymm1, ymm14)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)         // reduce the partial-product vectors to one complex value per C entry
+    vhsubpd( ymm10, ymm4, ymm0 )      // pairwise (re*re - im*im): real partial sums, col jj from ymm4, col jj+1 from ymm10
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = ( real col jj, real col jj+1 ), row 0
+    vhaddpd( ymm13, ymm7, ymm2 )      // pairwise (re*im + im*re): imaginary partial sums
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = ( imag col jj, imag col jj+1 ), row 0
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = ( real, imag ) of row 0, col jj
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = ( real, imag ) of row 0, col jj+1
+
+    vhsubpd( ymm11, ymm5, ymm0 )      // same reduction for row 1
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = ( real col jj, real col jj+1 ), row 1
+    vhaddpd( ymm14, ymm8, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = ( imag col jj, imag col jj+1 ), row 1
+
+    vshufpd(imm(0),xmm2,xmm0,xmm5) // xmm5 = ( real, imag ) of row 1, col jj
+    vshufpd(imm(3),xmm2,xmm0,xmm8) // xmm8 = ( real, imag ) of row 1, col jj+1
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4)   // ymm4 = row 0: [ col jj | col jj+1 ]
+    vinsertf128(imm(1),xmm8,ymm5,ymm5)   // ymm5 = row 1: [ col jj | col jj+1 ]
+
+    //Scaling with alpha
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)     // re/im-swapped copies for the cross terms
+    vpermilpd(imm(0x5), ymm5, ymm11)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)         // ymm4 = alpha * row 0 (complex multiply)
+
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm11, ymm11)
+    vaddsubpd(ymm11, ymm5, ymm5)         // ymm5 = alpha * row 1 (complex multiply)
+
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0; NOTE(review): relies on BLIS_MUL_ZERO being 0 - confirm against its definition
+    je(.BETA_ZERO)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)              // load C row 0, columns jj and jj+1
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)          // ymm0 = beta * C
+    vaddpd(ymm0,ymm4,ymm0)               // ymm0 = beta*C + alpha*A*B
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)                        // advance to C row 1
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C (beta == 0: results overwrite C without reading it)
+    vmovupd(ymm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm5, mem(rcx))
+
+    label(.ZDONE)
+
+    add(imm(2), r15)              // jj += 2
+    cmp(imm(4), r15)
+    jl(.ZLOOP3X4J)                // Iterate again if jj < 4
+    label(.ZRETURN)               // no jump in this asm block targets this label
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [beta_mul_type] "m" (beta_mul_type),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),   // passed in but not referenced by the asm above
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list (rdx and r9 are listed although the asm above does not use them)
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r15", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
+      "memory"
+    )
+
+}
+
+void bli_zgemmsup_rd_zen_asm_1x4   // dcomplex kernel: C(1x4) := beta*C + alpha*(A*B), every entry formed as a dot product over k
+     (
+       conj_t       conja,     // not referenced in this function body
+       conj_t       conjb,     // not referenced in this function body
+       dim_t        m0,        // not referenced; the asm always processes 1 row
+       dim_t        n0,        // not referenced; the asm always processes 4 columns
+       dim_t        k0,        // length of each dot product
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,   // cs_a0 unused: A is advanced by fixed 16-byte elements along k
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,   // rs_b0 unused: B is advanced by fixed 16-byte elements along k
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,   // cs_c0 unused: two adjacent columns of C are accessed as one 32-byte vector
+       auxinfo_t* restrict data,   // not referenced in this function body
+       cntx_t*    restrict cntx    // not referenced in this function body
+     )
+{
+    uint64_t k_iter8 = k0 / 8;        // trip count of the 4x-unrolled loop (each unrolled step consumes 2 k-elements)
+    uint64_t k_left8 = k0 % 8;        // k-elements left after the 8-wide passes
+    uint64_t k_iter4 = k_left8 / 4;   // trip count of the 2x-unrolled loop (0 or 1)
+    uint64_t k_left4 = k_left8 % 4;   // k-elements handled one at a time at the end (0..3)
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // beta == 0 selects the store-only path below, in which C is never read; any other beta takes the general path.
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    //-----------------------------------------------------------//
+    // Inline asm. rax/rbx/rcx walk A/B/C; r10 holds cs_b in bytes; r15 is the column index jj.
+
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a (computed but not used below: only one row of A is read)
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c (computed but not used below: only one row of C is written)
+
+    mov(imm(0), r15)               // jj = 0
+    label(.ZLOOP3X4J)              // LOOP OVER jj = [ 0 2 ]; each pass produces columns jj and jj+1
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    lea(mem(, r15, 1), rsi)        // rsi = jj
+    imul(imm(1*16), rsi)            // rsi = 16*jj
+    lea(mem(r12, rsi, 1), r12)    // r12 += 16*jj  (jj elements along the row of C)
+
+    lea(mem(, r15, 1), rsi)        // rsi = jj
+    imul(r10, rsi)                 // rsi = jj * sizeof(dcomplex)*cs_b
+    lea(mem(r11, rsi, 1), r11)    // r11 += jj columns of B (in bytes)
+
+    vzeroall()                      // Reset all ymm registers (clears the accumulators for this column pair)
+    mov(r12, rcx)                  // rcx = c_iijj;
+    mov(r11, rbx)                  // rbx = b_jj;
+    mov(r14, rax)                  // rax = a_ii;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)             // ymm0 = 2 consecutive k-elements of the A row
+    add(imm(2*16), rax)            // a += 2 elements = 2*16 bytes (element step along k is hard-coded)
+
+    vmovupd(mem(rbx), ymm3)             // ymm3 = 2 k-elements of B, column jj
+    vfmadd231pd(ymm0, ymm3, ymm4)       // ymm4 += (a.re*b.re, a.im*b.im) per element
+
+    vpermilpd(imm(0x5), ymm3, ymm3)     // swap re/im inside each B element
+    vfmadd231pd(ymm0, ymm3, ymm7)       // ymm7 += (a.re*b.im, a.im*b.re) per element
+
+    vmovupd(mem(rbx, r10, 1), ymm3)     // ymm3 = 2 k-elements of B, column jj+1
+    vfmadd231pd(ymm0, ymm3, ymm10)      // ymm10: as ymm4 but for column jj+1
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)      // ymm13: as ymm7 but for column jj+1
+    add(imm(2*16), rbx)          // b += 2 elements = 2*16 bytes (element step along k is hard-coded)
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)          // 2x-unrolled loop: 4 k-elements per pass
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)          // tail loop: one k-element per pass
+
+    vmovupd(mem(rax), xmm0)              // 128-bit load of a single element; only the low lanes carry data in the FMAs below
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)      // swap re/im of the single B element
+    vfmadd231pd(ymm3, ymm0, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)         // reduce the partial-product vectors to one complex value per C entry
+    vhsubpd( ymm10, ymm4, ymm0 )      // pairwise (re*re - im*im): real partial sums, col jj from ymm4, col jj+1 from ymm10
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = ( real col jj, real col jj+1 )
+    vhaddpd( ymm13, ymm7, ymm2 )      // pairwise (re*im + im*re): imaginary partial sums
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = ( imag col jj, imag col jj+1 )
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = ( real, imag ) of col jj
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = ( real, imag ) of col jj+1
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4)   // ymm4 = [ col jj | col jj+1 ]
+
+    //Scaling with alpha
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)     // re/im-swapped copy for the cross terms
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)         // ymm4 = alpha * (A*B) (complex multiply)
+
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0; NOTE(review): relies on BLIS_MUL_ZERO being 0 - confirm against its definition
+    je(.BETA_ZERO)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)              // load C, columns jj and jj+1
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)          // ymm0 = beta * C
+    vaddpd(ymm0,ymm4,ymm0)               // ymm0 = beta*C + alpha*A*B
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C (beta == 0: results overwrite C without reading it)
+    vmovupd(ymm4, mem(rcx))
+
+    label(.ZDONE)
+
+    add(imm(2), r15)              // jj += 2
+    cmp(imm(4), r15)
+    jl(.ZLOOP3X4J)                // Iterate again if jj < 4
+    label(.ZRETURN)               // no jump in this asm block targets this label
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [beta_mul_type] "m" (beta_mul_type),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),   // passed in but not referenced by the asm above
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list (rdx and r9 are listed although the asm above does not use them)
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r15", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm7", "ymm10", "ymm13",
+      "memory"
+    )
+
+}
+
+void bli_zgemmsup_rd_zen_asm_2x2   // dcomplex kernel: C(2x2) := beta*C + alpha*(A*B), every entry formed as a dot product over k
+     (
+       conj_t       conja,     // not referenced in this function body
+       conj_t       conjb,     // not referenced in this function body
+       dim_t        m0,        // not referenced; the asm always processes 2 rows
+       dim_t        n0,        // not referenced; the asm always processes 2 columns
+       dim_t        k0,        // length of each dot product
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,   // cs_a0 unused: A is advanced by fixed 16-byte elements along k
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,   // rs_b0 unused: B is advanced by fixed 16-byte elements along k
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,   // cs_c0 unused: two adjacent columns of C are accessed as one 32-byte vector
+       auxinfo_t* restrict data,   // not referenced in this function body
+       cntx_t*    restrict cntx    // not referenced in this function body
+     )
+{
+
+    uint64_t k_iter8 = k0 / 8;        // trip count of the 4x-unrolled loop (each unrolled step consumes 2 k-elements)
+    uint64_t k_left8 = k0 % 8;        // k-elements left after the 8-wide passes
+    uint64_t k_iter4 = k_left8 / 4;   // trip count of the 2x-unrolled loop (0 or 1)
+    uint64_t k_left4 = k_left8 % 4;   // k-elements handled one at a time at the end (0..3)
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // beta == 0 selects the store-only path below, in which C is never read; any other beta takes the general path.
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    //-----------------------------------------------------------//
+    // Inline asm. rax/rbx/rcx walk A/B/C; r8, r10, rdi hold rs_a, cs_b, rs_c in bytes. There is no column loop here.
+
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)          // r8 *= 8
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c
+
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    vzeroall()                      // Reset all ymm registers (clears the accumulators)
+    mov(r12, rcx)                  // rcx = c_iijj;
+    mov(r11, rbx)                  // rbx = b_jj;
+    mov(r14, rax)                  // rax = a_ii;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)             // ymm0 = 2 consecutive k-elements of A, row 0
+    vmovupd(mem(rax, r8, 1), ymm1)      // ymm1 = the same k-elements of A, row 1
+    add(imm(2*16), rax)            // a += 2 elements = 2*16 bytes (element step along k is hard-coded)
+
+    vmovupd(mem(rbx), ymm3)             // ymm3 = 2 k-elements of B, column 0
+    vfmadd231pd(ymm0, ymm3, ymm4)       // ymm4 += (a.re*b.re, a.im*b.im) per element, row 0
+    vfmadd231pd(ymm1, ymm3, ymm5)       // ymm5 += same products, row 1
+
+    vpermilpd(imm(0x5), ymm3, ymm3)     // swap re/im inside each B element
+    vfmadd231pd(ymm0, ymm3, ymm7)       // ymm7 += (a.re*b.im, a.im*b.re) per element, row 0
+    vfmadd231pd(ymm1, ymm3, ymm8)       // ymm8 += same products, row 1
+
+    vmovupd(mem(rbx, r10, 1), ymm3)     // ymm3 = 2 k-elements of B, column 1
+    vfmadd231pd(ymm0, ymm3, ymm10)      // ymm10/ymm11: as ymm4/ymm5 but for column 1
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)      // ymm13/ymm14: as ymm7/ymm8 but for column 1
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2 elements = 2*16 bytes (element step along k is hard-coded)
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)          // 2x-unrolled loop: 4 k-elements per pass
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)          // tail loop: one k-element per pass
+
+    vmovupd(mem(rax), xmm0)              // 128-bit load of a single element; only the low lanes carry data in the FMAs below
+    vmovupd(mem(rax, r8, 1), xmm1)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+    vfmadd231pd(ymm3, ymm1, ymm5)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)      // swap re/im of the single B element
+    vfmadd231pd(ymm3, ymm0, ymm7)
+    vfmadd231pd(ymm3, ymm1, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+    vfmadd231pd(ymm3, ymm1, ymm11)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    vfmadd231pd(ymm3, ymm1, ymm14)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)         // reduce the partial-product vectors to one complex value per C entry
+    vhsubpd( ymm10, ymm4, ymm0 )      // pairwise (re*re - im*im): real partial sums, col 0 from ymm4, col 1 from ymm10
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = ( real col 0, real col 1 ), row 0
+    vhaddpd( ymm13, ymm7, ymm2 )      // pairwise (re*im + im*re): imaginary partial sums
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = ( imag col 0, imag col 1 ), row 0
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = ( real, imag ) of row 0, col 0
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = ( real, imag ) of row 0, col 1
+
+    vhsubpd( ymm11, ymm5, ymm0 )      // same reduction for row 1
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = ( real col 0, real col 1 ), row 1
+    vhaddpd( ymm14, ymm8, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = ( imag col 0, imag col 1 ), row 1
+
+    vshufpd(imm(0),xmm2,xmm0,xmm5) // xmm5 = ( real, imag ) of row 1, col 0
+    vshufpd(imm(3),xmm2,xmm0,xmm8) // xmm8 = ( real, imag ) of row 1, col 1
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4)   // ymm4 = row 0: [ col 0 | col 1 ]
+    vinsertf128(imm(1),xmm8,ymm5,ymm5)   // ymm5 = row 1: [ col 0 | col 1 ]
+
+    //Scaling with alpha
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)     // re/im-swapped copies for the cross terms
+    vpermilpd(imm(0x5), ymm5, ymm11)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)         // ymm4 = alpha * row 0 (complex multiply)
+
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm11, ymm11)
+    vaddsubpd(ymm11, ymm5, ymm5)         // ymm5 = alpha * row 1 (complex multiply)
+
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0; NOTE(review): relies on BLIS_MUL_ZERO being 0 - confirm against its definition
+    je(.BETA_ZERO)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)              // load C row 0, columns 0 and 1
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)          // ymm0 = beta * C
+    vaddpd(ymm0,ymm4,ymm0)               // ymm0 = beta*C + alpha*A*B
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)                        // advance to C row 1
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C (beta == 0: results overwrite C without reading it)
+    vmovupd(ymm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm5, mem(rcx))
+
+    label(.ZDONE)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [beta_mul_type] "m" (beta_mul_type),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),   // passed in but not referenced by the asm above
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list (rdx, r9 and r15 are listed although the asm above does not use them)
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r15", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
+      "memory"
+    )
+
+}
+
+void bli_zgemmsup_rd_zen_asm_1x2
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes c[0:2] = beta*c[0:2] + alpha * (one row of a) . (two columns of b), reducing over k0 elements.
+    uint64_t k_iter8 = k0 / 8;
+    uint64_t k_left8 = k0 % 8;
+    uint64_t k_iter4 = k_left8 / 4;
+    uint64_t k_left4 = k_left8 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // beta == 0 selects the store-only path (.BETA_ZERO), so C is never read; any other beta uses the generic complex scaling.
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    //-----------------------------------------------------------//
+    // Inline assembly implementation. conja, conjb, m0, n0, cs_a0, rs_b0, cs_c0, data and cntx are not read by this function.
+    // The k loops step a and b by fixed 16-byte elements and C is accessed as one 32-byte vector, i.e. unit cs_a, rs_b and cs_c are assumed.
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a (r8 is not referenced again in this single-row kernel)
+    lea(mem(, r8, 8), r8)
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c (not referenced again: only one row of C is written)
+
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    vzeroall()                      // Reset all ymm registers
+    mov(r12, rcx)                  // rcx = c_iijj;
+    mov(r11, rbx)                  // rbx = b_jj;
+    mov(r14, rax)                  // rax = a_ii;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP: 4 unrolled steps of 2 dcomplex each (8 k-elements per pass)
+    // ymm4/ymm10 accumulate a*b for columns 0/1; ymm7/ymm13 accumulate a*swap(b), where swap exchanges re/im of each dcomplex.
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4 (0 or 1, since k_left8 < 8);
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)          // Tail loop: one dcomplex (128-bit load) per pass
+
+    vmovupd(mem(rax), xmm0)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm7)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)
+    vhsubpd( ymm10, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = { re(col 0), re(col 1) }, reduced from ymm4 and ymm10
+    vhaddpd( ymm13, ymm7, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = { im(col 0), im(col 1) }, reduced from ymm7 and ymm13
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = { re, im } of column 0
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = { re, im } of column 1
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4) // ymm4 = { column 0 | column 1 } as two dcomplex values
+
+    //Scaling with alpha
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)
+
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0 (assumes BLIS_MUL_ZERO == 0 -- TODO confirm)
+    je(.BETA_ZERO)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C
+    vmovupd(ymm4, mem(rcx))
+
+    label(.ZDONE)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [beta_mul_type] "m" (beta_mul_type),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r15", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm7", "ymm10", "ymm13",
+      "memory"
+    )
+
+}
diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c
new file mode 100644
index 0000000000..b8243a04ed
--- /dev/null
+++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4m.c
@@ -0,0 +1,1109 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------               :
+     --------        ------               :
+
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
+*/
+
+void bli_zgemmsup_rd_zen_asm_3x4m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Dot-product ("rd") kernel: walks m0 in 3-row panels of A against 4 columns of B, two columns per jj pass.
+    // If n0 % 4 != 0, only those leftover columns are processed here (3x2m kernel and/or zgemv) and the function returns.
+    uint64_t n_left = n0 % 4;
+
+    if ( n_left )
+    {
+        dcomplex* restrict cij = c;
+        dcomplex* restrict bj  = b;
+        dcomplex* restrict ai  = a;
+
+        if ( 2 <= n_left )
+        {
+             const dim_t nr_cur = 2;
+
+             bli_zgemmsup_rd_zen_asm_3x2m
+             (
+               conja, conjb, m0, nr_cur, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+               beta, cij, rs_c0, cs_c0, data, cntx
+             );
+             cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+             bli_zgemv_ex
+             (
+               BLIS_NO_TRANSPOSE, conjb, m0, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+               beta, cij, rs_c0, cntx, NULL
+             );
+        }
+        return;
+    }
+
+    uint64_t k_iter8 = k0 / 8;
+    uint64_t k_left8 = k0 % 8;
+    uint64_t k_iter4 = k_left8 / 4;
+    uint64_t k_left4 = k_left8 % 4;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // Redirecting to m fringe kernels if m_iter = 0
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    // Dealing with special cases of alpha and beta (only purely real values are classified; everything else stays DEFAULT)
+    if( alpha->imag == 0.0 ) // If alpha is real
+    {
+      if( alpha->real == 1.0 ) alpha_mul_type = BLIS_MUL_ONE;
+      else if( alpha->real == -1.0 )  alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if( beta->imag == 0.0 ) // If beta is real
+    {
+        if( beta->real == 1.0 )       beta_mul_type = BLIS_MUL_ONE;
+        else if( beta->real == -1.0 ) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if( beta->real == 0.0 )  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    //-----------------------------------------------------------//
+    // Inline assembly implementation (conja is not read in this function; conjb is only forwarded to fringe calls)
+
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c
+
+    mov(imm(0), r15)               // jj = 0
+    label(.ZLOOP3X4J)              // LOOP OVER jj = [ 0 2 ]: two columns of B/C per pass
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    lea(mem(, r15, 1), rsi)
+    imul(imm(1*16), rsi)            // rsi = 16*jj
+    lea(mem(r12, rsi, 1), r12)    // r12 += 16*jj, i.e. sizeof(dcomplex)*jj (unit column stride of C assumed)
+
+    lea(mem(, r15, 1), rsi)
+    imul(r10, rsi)                 // rsi = sizeof(dcomplex)*cs_b*jj
+    lea(mem(r11, rsi, 1), r11)    // r11 += sizeof(dcomplex)*cs_b*jj
+
+    mov(var(m_iter), r9)           // ii = m_iter
+    label(.ZLOOP3X4I)              // LOOP OVER ii
+    vzeroall()                      // Reset all ymm registers
+    mov(r12, rcx)                  // rcx = c_iijj;
+    mov(r11, rbx)                  // rbx = b_jj;
+    mov(r14, rax)                  // rax = a_ii;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+    /*
+      Load 3 rows from matrix A using ymm0-ymm2.
+      Load 2 columns from B one at a time using ymm3.
+      Compute the pointwise product of ymm0-ymm2 with ymm3.
+      This accumulates the terms of the real part in ymm4-ymm6 and ymm10-ymm12.
+
+      Permute ymm3 (swap re/im of each element) after that product.
+      Accumulate a second set of pointwise products in ymm7-ymm9 and ymm13-ymm15.
+
+      Horizontal sub/add reductions of these registers (see .ZPOSTACCUM) give
+      the real and imaginary parts of each dot product.
+    */
+
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4 (0 or 1, since k_left8 < 8);
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)          // Tail loop: one dcomplex (128-bit load) per row/column per pass
+
+    vmovupd(mem(rax), xmm0)
+    vmovupd(mem(rax, r8, 1), xmm1)
+    vmovupd(mem(rax, r8, 2), xmm2)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+    vfmadd231pd(ymm3, ymm1, ymm5)
+    vfmadd231pd(ymm3, ymm2, ymm6)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm7)
+    vfmadd231pd(ymm3, ymm1, ymm8)
+    vfmadd231pd(ymm3, ymm2, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+    vfmadd231pd(ymm3, ymm1, ymm11)
+    vfmadd231pd(ymm3, ymm2, ymm12)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    vfmadd231pd(ymm3, ymm1, ymm14)
+    vfmadd231pd(ymm3, ymm2, ymm15)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)
+    vhsubpd( ymm10, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = row 0 real parts { col 0, col 1 }, reduced from ymm4 and ymm10
+    vhaddpd( ymm13, ymm7, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = row 0 imag parts { col 0, col 1 }, reduced from ymm7 and ymm13
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = row 0, column 0 as { re, im }
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = row 0, column 1 as { re, im }
+
+    vhsubpd( ymm11, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = row 1 real parts { col 0, col 1 }, reduced from ymm5 and ymm11
+    vhaddpd( ymm14, ymm8, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = row 1 imag parts { col 0, col 1 }, reduced from ymm8 and ymm14
+
+    vshufpd(imm(0),xmm2,xmm0,xmm5) // xmm5 = row 1, column 0 as { re, im }
+    vshufpd(imm(3),xmm2,xmm0,xmm8) // xmm8 = row 1, column 1 as { re, im }
+
+    vhsubpd( ymm12, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = row 2 real parts { col 0, col 1 }, reduced from ymm6 and ymm12
+    vhaddpd( ymm15, ymm9, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = row 2 imag parts { col 0, col 1 }, reduced from ymm9 and ymm15
+
+    vshufpd(imm(0),xmm2,xmm0,xmm6) // xmm6 = row 2, column 0 as { re, im }
+    vshufpd(imm(3),xmm2,xmm0,xmm9) // xmm9 = row 2, column 1 as { re, im }
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4) // ymm4 = row 0 { column 0 | column 1 }
+    vinsertf128(imm(1),xmm8,ymm5,ymm5) // ymm5 = row 1 { column 0 | column 1 }
+    vinsertf128(imm(1),xmm9,ymm6,ymm6) // ymm6 = row 2 { column 0 | column 1 }
+
+    //Scaling with alpha
+    mov(var(alpha_mul_type), al)
+    cmp(imm(0xFF), al) // Checking if alpha = -1.0 (assumes BLIS_MUL_MINUS_ONE is 0xFF as a char -- TODO confirm)
+    jne(.ALPHA_NOT_MINUS1)
+
+    vxorpd(ymm0, ymm0, ymm0)
+    vsubpd(ymm4, ymm0, ymm4)      // ymm4 = -ymm4
+    vsubpd(ymm5, ymm0, ymm5)      // ymm5 = -ymm5
+    vsubpd(ymm6, ymm0, ymm6)      // ymm6 = -ymm6
+
+    jmp(.BETA_SCALING)
+
+    label(.ALPHA_NOT_MINUS1)
+    cmp(imm(2), al) // Checking for BLIS_MUL_DEFAULT (assumes it equals 2 -- TODO confirm); otherwise no alpha scaling is applied
+    jne(.BETA_SCALING)
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)
+    vpermilpd(imm(0x5), ymm5, ymm11)
+    vpermilpd(imm(0x5), ymm6, ymm12)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)
+
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm11, ymm11)
+    vaddsubpd(ymm11, ymm5, ymm5)
+
+    vmulpd(ymm0, ymm6, ymm6)
+    vmulpd(ymm1, ymm12, ymm12)
+    vaddsubpd(ymm12, ymm6, ymm6)
+
+    label(.BETA_SCALING)
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0 (assumes BLIS_MUL_ZERO == 0 -- TODO confirm)
+    je(.BETA_ZERO)
+    cmp(imm(2), al) // Checking for BLIS_MUL_DEFAULT (general complex beta)
+    je(.BETA_NOT_REAL_ONE)
+    cmp(imm(0xFF), al)
+    je(.BETA_REAL_MINUS1) // Checking if beta = -1.0
+    // Handling when beta == 1: C += result, no multiply needed
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    // Handling when beta == -1: C = result - C
+    label(.BETA_REAL_MINUS1)
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_NOT_REAL_ONE)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C
+    vmovupd(ymm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm6, mem(rcx))
+
+    label(.ZDONE)
+    lea(mem(r12, rdi, 2), r12)
+    lea(mem(r12, rdi, 1), r12)    // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)
+    lea(mem(r14, r8,  1), r14)    // a_ii = r14 += 3*rs_a
+
+    dec(r9)                       // ii -= 1;
+    jne(.ZLOOP3X4I)               // Iterating again if ii != 0
+
+    add(imm(2), r15)              // jj += 2
+    cmp(imm(4), r15)
+    jl(.ZLOOP3X4J)                // Iterate again if jj < 4
+    label(.ZRETURN)               // No jump in this asm block targets this label
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [alpha_mul_type] "m" (alpha_mul_type),
+      [beta_mul_type] "m" (beta_mul_type),
+      [m_iter] "m" (m_iter),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r15", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+
+    // Handling edge cases in m dimension if they exist: the last m0 % 3 rows, starting at row i_edge, go to the 2x4 or 1x4 kernel
+    consider_edge_cases:
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 4;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        dcomplex* restrict cij = c + i_edge*rs_c;
+        dcomplex* restrict bj  = b;
+        dcomplex* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+             const dim_t mr_cur = 2;
+
+             bli_zgemmsup_rd_zen_asm_2x4
+             (
+               conja, conjb, mr_cur, nr_cur, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+               beta, cij, rs_c0, cs_c0, data, cntx
+             );
+        }
+        if ( 1 == m_left )
+        {
+             const dim_t mr_cur = 1;
+
+             bli_zgemmsup_rd_zen_asm_1x4
+             (
+               conja, conjb, mr_cur, nr_cur, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+               beta, cij, rs_c0, cs_c0, data, cntx
+             );
+        }
+    }
+
+}
+
+void bli_zgemmsup_rd_zen_asm_3x2m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes C := beta*C + alpha*(A*B) on a dcomplex panel of C that is two columns wide, three rows per assembly pass.
+    uint64_t k_iter8 = k0 / 8;       // passes of the main k loop (8 elements per pass)
+    uint64_t k_left8 = k0 % 8;       // k elements left after the main loop
+    uint64_t k_iter4 = k_left8 / 4;  // passes of the 4-element k loop (0 or 1)
+    uint64_t k_left4 = k_left8 % 4;  // k elements handled one at a time
+    // m0 is split into full 3-row tiles (assembly below) and 0-2 leftover rows (2x2 / 1x2 kernels at the end).
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+    // The assembly reads only these three strides; it steps 16 bytes per k element in A and B and stores each C row as 32 contiguous bytes.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    // n0 is not read; cs_a0, rs_b0 and cs_c0 are only forwarded to the edge kernels.
+    if ( m_iter == 0 ) goto consider_edge_cases; // fewer than 3 rows: skip the assembly
+
+    // Checking whether generic/special case handling is required for beta scaling
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+    // NOTE(review): the assembly tests beta_mul_type against literal 0, i.e. presumes BLIS_MUL_ZERO == 0 - confirm.
+    //-----------------------------------------------------------//
+    // Inline assembly implementation
+    // NOTE(review): conja/conjb are not referenced by the assembly, only forwarded to the edge kernels.
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c
+
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    mov(var(m_iter), r9)           // ii = m_iter
+    label(.ZLOOP3X4I)              // LOOP OVER ii (one 3x2 tile of C per pass)
+    vzeroall()                      // Reset all ymm registers
+    mov(r12, rcx)                  // rcx = c_iijj;
+    mov(r11, rbx)                  // rbx = b_jj;
+    mov(r14, rax)                  // rax = a_ii;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+    // Each step loads 2 dcomplex from each of 3 rows of A (ymm0-ymm2) and from each of 2 columns of B (ymm3).
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)
+    // One dcomplex per row of A and per column of B is consumed on each pass.
+    vmovupd(mem(rax), xmm0)
+    vmovupd(mem(rax, r8, 1), xmm1)
+    vmovupd(mem(rax, r8, 2), xmm2)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+    vfmadd231pd(ymm3, ymm1, ymm5)
+    vfmadd231pd(ymm3, ymm2, ymm6)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm7)
+    vfmadd231pd(ymm3, ymm1, ymm8)
+    vfmadd231pd(ymm3, ymm2, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+    vfmadd231pd(ymm3, ymm1, ymm11)
+    vfmadd231pd(ymm3, ymm2, ymm12)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    vfmadd231pd(ymm3, ymm1, ymm14)
+    vfmadd231pd(ymm3, ymm2, ymm15)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)           // reduce the 12 accumulators to the 3x2 tile of dot products
+    vhsubpd( ymm10, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm4) sum(ymm10)
+    vhaddpd( ymm13, ymm7, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = sum(ymm7) sum(ymm13)
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = sum(ymm4) sum(ymm7)
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = sum(ymm10) sum(ymm13)
+
+    vhsubpd( ymm11, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm5) sum(ymm11)
+    vhaddpd( ymm14, ymm8, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = sum(ymm8) sum(ymm14)
+
+    vshufpd(imm(0),xmm2,xmm0,xmm5) // xmm5 = sum(ymm5) sum(ymm8)
+    vshufpd(imm(3),xmm2,xmm0,xmm8) // xmm8 = sum(ymm11) sum(ymm14)
+
+    vhsubpd( ymm12, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm6) sum(ymm12)
+    vhaddpd( ymm15, ymm9, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = sum(ymm9) sum(ymm15)
+
+    vshufpd(imm(0),xmm2,xmm0,xmm6) // xmm6 = sum(ymm6) sum(ymm9)
+    vshufpd(imm(3),xmm2,xmm0,xmm9) // xmm9 = sum(ymm12) sum(ymm15)
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4) // ymm4 = first row of the tile (stored at rcx below)
+    vinsertf128(imm(1),xmm8,ymm5,ymm5) // ymm5 = second row of the tile
+    vinsertf128(imm(1),xmm9,ymm6,ymm6) // ymm6 = third row of the tile
+
+    //Scaling with alpha
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)
+    vpermilpd(imm(0x5), ymm5, ymm11)
+    vpermilpd(imm(0x5), ymm6, ymm12)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)
+
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm11, ymm11)
+    vaddsubpd(ymm11, ymm5, ymm5)
+
+    vmulpd(ymm0, ymm6, ymm6)
+    vmulpd(ymm1, ymm12, ymm12)
+    vaddsubpd(ymm12, ymm6, ymm6)
+
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0
+    je(.BETA_ZERO)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C (C is overwritten without being read)
+    vmovupd(ymm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm6, mem(rcx))
+
+    label(.ZDONE)
+
+    lea(mem(r12, rdi, 2), r12)
+    lea(mem(r12, rdi, 1), r12)    // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)
+    lea(mem(r14, r8,  1), r14)    // a_ii = r14 += 3*rs_a
+
+    dec(r9)                       // ii -= 1;
+    jne(.ZLOOP3X4I)               // Iterating again if ii != 0
+    label(.ZRETURN)               // not targeted by any jump in this kernel
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [beta_mul_type] "m" (beta_mul_type),
+      [m_iter] "m" (m_iter),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r15", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+
+    // Handling edge cases in m dimension if they exist
+    consider_edge_cases:
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 2;
+        const dim_t      i_edge = m0 - ( dim_t )m_left; // index of the first leftover row
+
+        dcomplex* restrict cij = c + i_edge*rs_c;
+        dcomplex* restrict bj  = b;
+        dcomplex* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+             const dim_t mr_cur = 2;
+
+             bli_zgemmsup_rd_zen_asm_2x2
+             (
+               conja, conjb, mr_cur, nr_cur, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+               beta, cij, rs_c0, cs_c0, data, cntx
+             );
+        }
+        if ( 1 == m_left )
+        {
+             const dim_t mr_cur = 1;
+
+             bli_zgemmsup_rd_zen_asm_1x2
+             (
+               conja, conjb, mr_cur, nr_cur, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+               beta, cij, rs_c0, cs_c0, data, cntx
+             );
+        }
+    }
+
+}
diff --git a/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c
new file mode 100644
index 0000000000..8223e756f3
--- /dev/null
+++ b/kernels/zen/3/sup/bli_gemmsup_rd_zen_asm_z3x4n.c
@@ -0,0 +1,1062 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------               :
+     --------        ------               :
+
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
+*/
+
+void bli_zgemmsup_rd_zen_asm_3x4n
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Computes C := beta*C + alpha*(A*B) for three rows of dcomplex C, two columns per assembly pass over n0.
+    // If m0 is not a multiple of 3, the whole call is handed to a ?x4n fringe kernel / gemv instead.
+    uint64_t m_left = m0 % 3;
+
+    if ( m_left )
+    {
+        dcomplex* restrict cij = c;
+        dcomplex* restrict bj  = b;
+        dcomplex* restrict ai  = a;
+
+        if ( 2 == m_left )
+        {
+             const dim_t mr_cur = 2;
+
+             bli_zgemmsup_rd_zen_asm_2x4n
+             (
+               conja, conjb, mr_cur, n0, k0,
+               alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+               beta, cij, rs_c0, cs_c0, data, cntx
+             );
+        }
+        if ( 1 == m_left ) // single row: delegated to a transposed matrix-vector call on B
+        {
+            bli_zgemv_ex
+            (
+              BLIS_TRANSPOSE, conja, k0, n0,
+              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+              beta, cij, cs_c0, cntx, NULL
+            );
+        }
+        return; // the fringe call above covered the whole request
+    }
+    // From here on exactly three rows of A and C are processed; m0 is not consulted again.
+    uint64_t k_iter8 = k0 / 8;       // passes of the main k loop (8 elements per pass)
+    uint64_t k_left8 = k0 % 8;       // k elements left after the main loop
+    uint64_t k_iter4 = k_left8 / 4;  // passes of the 4-element k loop (0 or 1)
+    uint64_t k_left4 = k_left8 % 4;  // k elements handled one at a time
+
+    uint64_t n_iter = n0 / 2;        // number of 2-column passes
+    uint64_t n_left = n0 % 2;        // leftover column (0 or 1)
+    // The assembly reads only these three strides; it steps 16 bytes per k element in A and B and 32 bytes per column pair in C.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // Redirecting to n fringe kernels if n_iter = 0
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    // Dealing with special cases of alpha and beta
+    if( alpha->imag == 0.0 ) // If alpha is real
+    {
+      if( alpha->real == 1.0 ) alpha_mul_type = BLIS_MUL_ONE;
+      else if( alpha->real == -1.0 )  alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if( beta->imag == 0.0 ) // If beta is real
+    {
+        if( beta->real == 1.0 )       beta_mul_type = BLIS_MUL_ONE;
+        else if( beta->real == -1.0 ) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if( beta->real == 0.0 )  beta_mul_type = BLIS_MUL_ZERO;
+    }
+    // NOTE(review): the assembly compares these codes with literals 0, 2 and 0xFF; confirm against the BLIS_MUL_* definitions.
+    //-----------------------------------------------------------//
+    // Inline assembly implementation
+    // NOTE(review): conja/conjb are not referenced by the assembly, only forwarded to the fringe calls.
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c
+
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    mov(var(n_iter), r9)           // jj = n_iter
+    label(.ZLOOP3x2J)              // LOOP OVER jj
+    vzeroall()                      // Reset all ymm registers
+    mov(r14, rax)                  // rax = a (r14 is never advanced: same 3 rows for every jj)
+    mov(r12, rcx)                  // rcx = c_jj;
+    mov(r11, rbx)                  // rbx = b_jj;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+    /*
+      Load 3 rows from matrix A using ymm0-ymm2.
+      Load 2 columns from B one at a time using ymm3.
+      Compute the element-wise product of ymm0-ymm2 with ymm3.
+      This feeds the real part of the result, in ymm4-ymm6 and ymm10-ymm12.
+
+      Permute ymm3 after that element-wise product with ymm0-ymm2.
+      Compute another set of element-wise products in ymm7-ymm9 and ymm13-ymm15.
+
+      Reducing these registers afterwards (.ZPOSTACCUM) gives the real and
+      imaginary parts of the dot products.
+    */
+
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    vmovupd(mem(rax, r8, 2), ymm2)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+    vfmadd231pd(ymm2, ymm3, ymm6)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+    vfmadd231pd(ymm2, ymm3, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+    vfmadd231pd(ymm2, ymm3, ymm12)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    vfmadd231pd(ymm2, ymm3, ymm15)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)
+    // One dcomplex per row of A and per column of B is consumed on each pass.
+    vmovupd(mem(rax), xmm0)
+    vmovupd(mem(rax, r8, 1), xmm1)
+    vmovupd(mem(rax, r8, 2), xmm2)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+    vfmadd231pd(ymm3, ymm1, ymm5)
+    vfmadd231pd(ymm3, ymm2, ymm6)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm7)
+    vfmadd231pd(ymm3, ymm1, ymm8)
+    vfmadd231pd(ymm3, ymm2, ymm9)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+    vfmadd231pd(ymm3, ymm1, ymm11)
+    vfmadd231pd(ymm3, ymm2, ymm12)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    vfmadd231pd(ymm3, ymm1, ymm14)
+    vfmadd231pd(ymm3, ymm2, ymm15)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)           // reduce the 12 accumulators to the 3x2 tile of dot products
+    vhsubpd( ymm10, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm4) sum(ymm10)
+    vhaddpd( ymm13, ymm7, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = sum(ymm7) sum(ymm13)
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = sum(ymm4) sum(ymm7)
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = sum(ymm10) sum(ymm13)
+
+    vhsubpd( ymm11, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm5) sum(ymm11)
+    vhaddpd( ymm14, ymm8, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = sum(ymm8) sum(ymm14)
+
+    vshufpd(imm(0),xmm2,xmm0,xmm5) // xmm5 = sum(ymm5) sum(ymm8)
+    vshufpd(imm(3),xmm2,xmm0,xmm8) // xmm8 = sum(ymm11) sum(ymm14)
+
+    vhsubpd( ymm12, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm6) sum(ymm12)
+    vhaddpd( ymm15, ymm9, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = sum(ymm9) sum(ymm15)
+
+    vshufpd(imm(0),xmm2,xmm0,xmm6) // xmm6 = sum(ymm6) sum(ymm9)
+    vshufpd(imm(3),xmm2,xmm0,xmm9) // xmm9 = sum(ymm12) sum(ymm15)
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4) // ymm4 = first row of the tile (stored at rcx below)
+    vinsertf128(imm(1),xmm8,ymm5,ymm5) // ymm5 = second row of the tile
+    vinsertf128(imm(1),xmm9,ymm6,ymm6) // ymm6 = third row of the tile
+
+    //Scaling with alpha
+    mov(var(alpha_mul_type), al)
+    cmp(imm(0xFF), al) // Checking if alpha = -1.0
+    jne(.ALPHA_NOT_MINUS1)
+
+    vxorpd(ymm0, ymm0, ymm0)
+    vsubpd(ymm4, ymm0, ymm4)      // ymm4 = -ymm4
+    vsubpd(ymm5, ymm0, ymm5)      // ymm5 = -ymm5
+    vsubpd(ymm6, ymm0, ymm6)      // ymm6 = -ymm6
+
+    jmp(.BETA_SCALING)
+
+    label(.ALPHA_NOT_MINUS1)
+    cmp(imm(2), al) // Checking for BLIS_MUL_DEFAULT
+    jne(.BETA_SCALING) // remaining case (alpha == 1.0): no scaling needed
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)
+    vpermilpd(imm(0x5), ymm5, ymm11)
+    vpermilpd(imm(0x5), ymm6, ymm12)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)
+
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm11, ymm11)
+    vaddsubpd(ymm11, ymm5, ymm5)
+
+    vmulpd(ymm0, ymm6, ymm6)
+    vmulpd(ymm1, ymm12, ymm12)
+    vaddsubpd(ymm12, ymm6, ymm6)
+
+    label(.BETA_SCALING)
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0
+    je(.BETA_ZERO)
+    cmp(imm(2), al) // Checking for BLIS_MUL_DEFAULT
+    je(.BETA_NOT_REAL_ONE)
+    cmp(imm(0xFF), al)
+    je(.BETA_REAL_MINUS1) // Checking if beta = -1.0
+    // Handling when beta == 1
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    // Handling when beta == -1
+    label(.BETA_REAL_MINUS1)
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_NOT_REAL_ONE)       // general beta: full complex multiply of C by beta
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm6,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C (C is overwritten without being read)
+    vmovupd(ymm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm6, mem(rcx))
+
+    label(.ZDONE)
+    add(imm(2*16), r12)    // c_jj = r12 += 2*sizeof(dcomplex), a fixed 32-byte step (unit cs_c)
+    lea(mem(r11, r10,  2), r11)   // b_jj = r11 += 2*cs_b
+
+    dec(r9)                       // jj -= 1;
+    jne(.ZLOOP3x2J)               // Iterating again if jj != 0
+    label(.ZRETURN)               // not targeted by any jump in this kernel
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [alpha_mul_type] "m" (alpha_mul_type),
+      [beta_mul_type] "m" (beta_mul_type),
+      [n_iter] "m" (n_iter),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list
+      "rax", "rbx", "rdx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+
+    // Handling the leftover column in the n dimension, if it exists
+    consider_edge_cases:
+    if ( 1 == n_left )
+    {
+        const dim_t      mr_cur = 3;
+        const dim_t      j_edge = n0 - ( dim_t )n_left; // index of the leftover column
+
+        dcomplex* restrict cij = c + j_edge; // column offset in C uses a unit column stride
+        dcomplex* restrict bj  = b + j_edge*cs_b;
+        dcomplex* restrict ai  = a;
+        // The last column is delegated to a matrix-vector call over the 3 rows of A.
+        bli_zgemv_ex
+        (
+          BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+          alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+          beta, cij, rs_c0, cntx, NULL
+        );
+    }
+
+}
+
+void bli_zgemmsup_rd_zen_asm_2x4n
+     (
+       conj_t       conja,   // not referenced by this kernel
+       conj_t       conjb,   // only forwarded to bli_zgemv_ex (leftover column)
+       dim_t        m0,      // not referenced; exactly 2 rows are always processed
+       dim_t        n0,      // number of columns of B and C to process
+       dim_t        k0,      // inner (dot product) dimension
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0, // asm path reads each row of A contiguously
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0, // asm path reads each column of B contiguously
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0, // asm path stores 2 adjacent columns per row; cs_c0 is not read
+       auxinfo_t* restrict data,   // not referenced by this kernel
+       cntx_t*    restrict cntx    // only forwarded to bli_zgemv_ex
+     )
+{   // Computes C := beta*C + alpha*A*B for a 2 x n0 block of dcomplex C.
+    uint64_t k_iter8 = k0 / 8;        // passes of the 8-element k loop
+    uint64_t k_left8 = k0 % 8;
+    uint64_t k_iter4 = k_left8 / 4;   // passes of the 4-element k loop (0 or 1)
+    uint64_t k_left4 = k_left8 % 4;   // k values left for the one-at-a-time loop
+
+    uint64_t n_iter = n0 / 2;         // column pairs handled by the asm loop
+    uint64_t n_left = n0 % 2;         // 1 if a single column is left over
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+
+    // Skip the asm loop and go straight to the leftover column if n_iter = 0
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    // Dealing with special cases of alpha and beta
+    if( alpha->imag == 0.0 ) // If alpha is real
+    {
+      if( alpha->real == 1.0 ) alpha_mul_type = BLIS_MUL_ONE;
+      else if( alpha->real == -1.0 )  alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if( beta->imag == 0.0 ) // If beta is real
+    {
+        if( beta->real == 1.0 )       beta_mul_type = BLIS_MUL_ONE;
+        else if( beta->real == -1.0 ) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if( beta->real == 0.0 )  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    // Inline assembly: for each pair of B columns, accumulate the 2x2 block
+    // of A*B over k, scale it by alpha, combine with beta*C and store to C.
+
+    begin_asm()
+    mov(var(rs_a), r8)             // load rs_a
+    lea(mem(, r8, 8), r8)
+    lea(mem(, r8, 2), r8)          // r8 = sizeof(dcomplex)*rs_a
+
+    mov(var(cs_b), r10)
+    lea(mem(, r10, 8), r10)
+    lea(mem(, r10, 2), r10)       // r10 = sizeof(dcomplex)*cs_b
+
+    mov(var(rs_c), rdi)
+    lea(mem(, rdi, 8), rdi)
+    lea(mem(, rdi, 2), rdi)       // rdi = sizeof(dcomplex)*rs_c
+
+    mov(var(a), r14)               // r14 = addr of a
+    mov(var(b), r11)               // r11 = addr of b
+    mov(var(c), r12)               // r12 = addr of c
+
+    mov(var(n_iter), r9)           // jj = n_iter
+    label(.ZLOOP3x2J)              // LOOP OVER jj (2 rows x 2 columns per pass)
+    vzeroall()                      // Reset all ymm registers
+    mov(r14, rax)                  // rax = a (restart at row 0 for every column pair)
+    mov(r12, rcx)                  // rcx = c_jj;
+    mov(r11, rbx)                  // rbx = b_jj;
+
+    mov(var(k_iter8), rsi)        // i = k_iter8;
+    test(rsi, rsi)                 // Check i via logical AND
+    je(.ZLOOPKLEFT8)               // If i=0 jmp to k_iter4 loop
+
+    label(.ZLOOPKITER8)            // MAIN LOOP
+    /*
+      Load 2 rows from matrix A using ymm0-ymm1.
+      Load 2 columns from B one at a time using ymm3.
+      Compute point wise pdt of ymm0-ymm1 with ymm3.
+      Accumulates re*re and im*im terms in ymm4-ymm5 and ymm10-ymm11.
+
+      Permute ymm3 after point wise pdt with ymm0-ymm1.
+      Accumulates re*im and im*re terms in ymm7-ymm8 and ymm13-ymm14.
+
+      Horizontal reduction of these registers after the k loops gives the
+      real and imaginary parts of the result of dot product.
+    */
+
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 2
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 3
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER8)
+
+    label(.ZLOOPKLEFT8)
+
+    mov(var(k_iter4), rsi)      // i = k_iter4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZLOOPKLEFT4)             // If i=0 jmp to k_left loop
+    label(.ZLOOPKITER4)
+    // ---------------------------------- Iteration 0
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    // ---------------------------------- Iteration 1
+    vmovupd(mem(rax), ymm0)
+    vmovupd(mem(rax, r8, 1), ymm1)
+    add(imm(2*16), rax)            // a += 2*sizeof(dcomplex)*cs_a = 2*16;
+
+    vmovupd(mem(rbx), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm4)
+    vfmadd231pd(ymm1, ymm3, ymm5)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm7)
+    vfmadd231pd(ymm1, ymm3, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vpermilpd(imm(0x5), ymm3, ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm13)
+    vfmadd231pd(ymm1, ymm3, ymm14)
+    add(imm(2*16), rbx)          // b += 2*sizeof(dcomplex)*rs_b = 2*16;
+
+    dec(rsi)
+    jne(.ZLOOPKITER4)
+
+    label(.ZLOOPKLEFT4)
+    mov(var(k_left4), rsi)      // i = k_left4;
+    test(rsi, rsi)               // Check i via logical AND
+    je(.ZPOSTACCUM)             // If i=0 jmp to accumulation
+    label(.ZLOOPKLEFT)
+    // One k value per pass; the 128-bit loads zero the upper ymm halves.
+    vmovupd(mem(rax), xmm0)
+    vmovupd(mem(rax, r8, 1), xmm1)
+    add(imm(1*16), rax)          // a += 1*sizeof(dcomplex)*cs_a = 1*16*1;
+
+    vmovupd(mem(rbx), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm4)
+    vfmadd231pd(ymm3, ymm1, ymm5)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm7)
+    vfmadd231pd(ymm3, ymm1, ymm8)
+
+    vmovupd(mem(rbx, r10, 1), xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm10)
+    vfmadd231pd(ymm3, ymm1, ymm11)
+
+    vpermilpd(imm(0x1), xmm3, xmm3)
+    vfmadd231pd(ymm3, ymm0, ymm13)
+    vfmadd231pd(ymm3, ymm1, ymm14)
+    add(imm(1*16), rbx)          // b += 1*sizeof(dcomplex)*rs_b = 1*16*1;
+
+    dec(rsi)
+    jne(.ZLOOPKLEFT)
+
+    label(.ZPOSTACCUM)
+    vhsubpd( ymm10, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm4) sum(ymm10), real parts of row 0
+    vhaddpd( ymm13, ymm7, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 )  // xmm2 = sum(ymm7) sum(ymm13), imag parts of row 0
+
+    vshufpd(imm(0),xmm2,xmm0,xmm4)  // xmm4 = sum(ymm4) sum(ymm7)
+    vshufpd(imm(3),xmm2,xmm0,xmm7)  // xmm7 = sum(ymm10) sum(ymm13)
+
+    vhsubpd( ymm11, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddpd( xmm0, xmm1, xmm0 ) // xmm0 = sum(ymm5) sum(ymm11), real parts of row 1
+    vhaddpd( ymm14, ymm8, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddpd( xmm2, xmm1, xmm2 ) // xmm2 = sum(ymm8) sum(ymm14), imag parts of row 1
+
+    vshufpd(imm(0),xmm2,xmm0,xmm5) // xmm5 = sum(ymm5) sum(ymm8)
+    vshufpd(imm(3),xmm2,xmm0,xmm8) // xmm8 = sum(ymm11) sum(ymm14)
+
+    vinsertf128(imm(1),xmm7,ymm4,ymm4) // ymm4 = row 0: (first column, second column)
+    vinsertf128(imm(1),xmm8,ymm5,ymm5) // ymm5 = row 1: (first column, second column)
+
+    //Scaling with alpha
+    mov(var(alpha_mul_type), al)
+    cmp(imm(0xFF), al) // Checking if alpha = -1.0
+    jne(.ALPHA_NOT_MINUS1)
+
+    vxorpd(ymm0, ymm0, ymm0)
+    vsubpd(ymm4, ymm0, ymm4)      // ymm4 = -ymm4
+    vsubpd(ymm5, ymm0, ymm5)      // ymm5 = -ymm5
+
+    jmp(.BETA_SCALING)
+
+    label(.ALPHA_NOT_MINUS1)
+    cmp(imm(2), al) // Checking for BLIS_MUL_DEFAULT
+    jne(.BETA_SCALING) // alpha = 1.0, nothing to scale
+    mov(var(alpha), rax)
+    vbroadcastsd(mem(rax), ymm0)    // ymm0 = real(alpha)
+    vbroadcastsd(mem(rax, 8), ymm1) // ymm1 = imag(alpha)
+
+    vpermilpd(imm(0x5), ymm4, ymm10)
+    vpermilpd(imm(0x5), ymm5, ymm11)
+
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm10, ymm10)
+    vaddsubpd(ymm10, ymm4, ymm4)
+
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm11, ymm11)
+    vaddsubpd(ymm11, ymm5, ymm5)
+
+    label(.BETA_SCALING)
+    // Scaling with beta
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al) // Checking if beta = 0.0
+    je(.BETA_ZERO)
+    cmp(imm(2), al) // Checking for BLIS_MUL_DEFAULT
+    je(.BETA_NOT_REAL_ONE)
+    cmp(imm(0xFF), al)
+    je(.BETA_REAL_MINUS1) // Checking if beta = -1.0
+    // Handling when beta == 1
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    // Handling when beta == -1
+    label(.BETA_REAL_MINUS1)
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vsubpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_NOT_REAL_ONE)
+    mov(var(beta), rbx)
+    vbroadcastsd(mem(rbx), ymm1)    // ymm1 = real(beta)
+    vbroadcastsd(mem(rbx, 8), ymm2) // ymm2 = imag(beta)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm4,ymm0)
+    vmovupd(ymm0, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(mem(rcx), ymm0)
+    vpermilpd(imm(0x5), ymm0, ymm3)
+    vmulpd(ymm1, ymm0, ymm0)
+    vmulpd(ymm2, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm0, ymm0)
+    vaddpd(ymm0,ymm5,ymm0)
+    vmovupd(ymm0, mem(rcx))
+
+    jmp(.ZDONE)
+
+    label(.BETA_ZERO)
+
+    //Storing in C
+    vmovupd(ymm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovupd(ymm5, mem(rcx))
+
+    label(.ZDONE)
+    add(imm(2*16), r12)    // c_jj = r12 += 2*cs_c
+    lea(mem(r11, r10,  2), r11)   // b_jj = r11 += 2*cs_b
+
+    dec(r9)                       // jj -= 1;
+    jne(.ZLOOP3x2J)               // Iterating again if jj != 0
+    label(.ZRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [alpha_mul_type] "m" (alpha_mul_type),
+      [beta_mul_type] "m" (beta_mul_type),
+      [n_iter] "m" (n_iter),
+      [k_iter8] "m" (k_iter8),
+      [k_left8] "m" (k_left8),
+      [k_iter4] "m" (k_iter4),
+      [k_left4] "m" (k_left4),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [b]      "m" (b),
+      [cs_b]   "m" (cs_b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r12", "r14", "r11",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm7", "ymm8",
+      "ymm10", "ymm11", "ymm13", "ymm14",
+      "memory"
+    )
+
+    // Handling the leftover column in n dimension if it exists
+    consider_edge_cases:
+    if ( 1 == n_left )
+    {
+        const dim_t      mr_cur = 2;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        dcomplex* restrict cij = c + j_edge;
+        dcomplex* restrict bj  = b + j_edge*cs_b;
+        dcomplex* restrict ai  = a;
+
+        bli_zgemv_ex
+        (
+          BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+          alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+          beta, cij, rs_c0, cntx, NULL
+        );
+    }
+
+}
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c
index 03c1627f15..386c2ca8f0 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -548,6 +548,9 @@ void bli_cgemmsup_rv_zen_asm_2x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 }
@@ -910,6 +913,8 @@ void bli_cgemmsup_rv_zen_asm_1x8
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 }
@@ -1286,6 +1291,8 @@ void bli_cgemmsup_rv_zen_asm_2x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -1604,6 +1611,8 @@ void bli_cgemmsup_rv_zen_asm_1x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
 	  "memory"
 	)
 }
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c
index 8d10406a05..f92b1cc17b 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_c3x8m.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -739,6 +739,10 @@ void bli_cgemmsup_rv_zen_asm_3x8m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1230,6 +1234,9 @@ void bli_cgemmsup_rv_zen_asm_3x4m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
+	  "ymm12", "ymm14",
 	  "memory"
 	)
 
@@ -1753,4 +1760,4 @@ void bli_cgemmsup_rv_zen_asm_3x2m
 	}
 }
 
- 
\ No newline at end of file
+ 
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c
index 7befbb69bb..2cb3a844cc 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -717,12 +717,16 @@ void bli_sgemmsup_rv_zen_asm_5x16
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
     : // register clobber list
-     "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+     "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -1213,12 +1217,16 @@ void bli_sgemmsup_rv_zen_asm_4x16
       [a_next] "m" (a_next),
       [b_next] "m" (b_next)*/
     : // register clobber list
-     "rax", "rbx", "rcx", "rdx", "rsi", "rdi", 
+     "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -1779,6 +1787,10 @@ void bli_sgemmsup_rv_zen_asm_3x16
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -2172,6 +2184,10 @@ void bli_sgemmsup_rv_zen_asm_2x16
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -2533,6 +2549,10 @@ void bli_sgemmsup_rv_zen_asm_1x16
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -2981,6 +3001,10 @@ void bli_sgemmsup_rv_zen_asm_6x8
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -3434,6 +3458,10 @@ void bli_sgemmsup_rv_zen_asm_5x8
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -3800,6 +3828,10 @@ void bli_sgemmsup_rv_zen_asm_4x8
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm1", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -4195,6 +4227,10 @@ void bli_sgemmsup_rv_zen_asm_3x8
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -4504,6 +4540,10 @@ void bli_sgemmsup_rv_zen_asm_2x8
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -4767,6 +4807,10 @@ void bli_sgemmsup_rv_zen_asm_1x8
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -5168,6 +5212,10 @@ void bli_sgemmsup_rv_zen_asm_6x4
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -5556,6 +5604,10 @@ void bli_sgemmsup_rv_zen_asm_5x4
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -5894,6 +5946,9 @@ void bli_sgemmsup_rv_zen_asm_4x4
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -6219,6 +6274,9 @@ void bli_sgemmsup_rv_zen_asm_3x4
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -6492,6 +6550,9 @@ void bli_sgemmsup_rv_zen_asm_2x4
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -6746,6 +6807,10 @@ void bli_sgemmsup_rv_zen_asm_1x4
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm3", "ymm4", "ymm5",
+     "ymm6", "ymm7", "ymm8", "ymm9",
+     "ymm10", "ymm11", "ymm12", "ymm13",
+     "ymm14", "ymm15",
      "memory"
     )
 }
@@ -7133,6 +7198,9 @@ void bli_sgemmsup_rv_zen_asm_6x2
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -7506,6 +7574,9 @@ void bli_sgemmsup_rv_zen_asm_5x2
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -7842,6 +7913,10 @@ void bli_sgemmsup_rv_zen_asm_4x2
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -8144,6 +8219,10 @@ void bli_sgemmsup_rv_zen_asm_3x2
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -8406,6 +8485,10 @@ void bli_sgemmsup_rv_zen_asm_2x2
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2", "ymm3",
+     "ymm4", "ymm5", "ymm6", "ymm7",
+     "ymm8", "ymm9", "ymm10", "ymm11",
+     "ymm12", "ymm13", "ymm14", "ymm15",
      "memory"
     )
 }
@@ -8643,6 +8726,10 @@ void bli_sgemmsup_rv_zen_asm_1x2
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
+     "ymm0", "ymm2","ymm4", "ymm5",
+     "ymm6", "ymm7", "ymm8", "ymm9",
+     "ymm10", "ymm11", "ymm12", "ymm13",
+     "ymm14", "ymm15",
      "memory"
     )
 }
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c
index d5e2135a66..19acd5a1b6 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16m.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-2022, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -895,6 +895,10 @@ void bli_sgemmsup_rv_zen_asm_6x16m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1439,6 +1443,9 @@ void bli_sgemmsup_rv_zen_asm_6x8m
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
+	  "ymm12", "ymm14",
 	  "memory"
 	)
 
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c
index f46244d668..eb690e9f6c 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_s6x16n.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2021, Advanced Micro Devices, Inc.
+   Copyright (C) 2021-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -855,6 +855,10 @@ void bli_sgemmsup_rv_zen_asm_6x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -1621,6 +1625,10 @@ void bli_sgemmsup_rv_zen_asm_5x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13",
 	  "memory"
 	)
 
@@ -2230,6 +2238,9 @@ void bli_sgemmsup_rv_zen_asm_4x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 
@@ -2876,6 +2887,10 @@ void bli_sgemmsup_rv_zen_asm_3x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
+	  "ymm12", "ymm13", "ymm14", "ymm15",
 	  "memory"
 	)
 
@@ -3366,6 +3381,9 @@ void bli_sgemmsup_rv_zen_asm_2x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm11", "ymm12",
 	  "memory"
 	)
 
@@ -3821,6 +3839,8 @@ void bli_sgemmsup_rv_zen_asm_1x16n
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5",
 	  "memory"
 	)
 
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c
index 787d3f772b..298ede7204 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020-2021, Advanced Micro Devices, Inc.
+   Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -515,6 +515,9 @@ void bli_zgemmsup_rv_zen_asm_2x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
+	  "ymm8", "ymm9", "ymm10", "ymm11",
 	  "memory"
 	)
 
@@ -875,6 +878,8 @@ void bli_zgemmsup_rv_zen_asm_1x4
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm5", "ymm6", "ymm7",
 	  "memory"
 	)
 
@@ -1236,6 +1241,8 @@ void bli_zgemmsup_rv_zen_asm_2x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6", "ymm8", "ymm10",
 	  "memory"
 	)
 }
@@ -1543,6 +1550,8 @@ void bli_zgemmsup_rv_zen_asm_1x2
 	  "xmm4", "xmm5", "xmm6", "xmm7",
 	  "xmm8", "xmm9", "xmm10", "xmm11",
 	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "ymm0", "ymm1", "ymm2", "ymm3",
+	  "ymm4", "ymm6",
 	  "memory"
 	)
 
diff --git a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c
index 64aedb8791..804e196e12 100644
--- a/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c
+++ b/kernels/zen/3/sup/bli_gemmsup_rv_zen_asm_z3x4m.c
@@ -1,1295 +1,1302 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2021, Advanced Micro Devices, Inc.All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/* Assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
-   and store outputs to ymm0
-   (creal,cimag)*(betar,beati) where c is stored in col major order*/
-#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \
-    vmovupd(mem(rcx), xmm0) \
-    vmovupd(mem(rcx, rsi, 1), xmm3) \
-    vinsertf128(imm(1), xmm3, ymm0, ymm0) \
-    vpermilpd(imm(0x5), ymm0, ymm3) \
-    vmulpd(ymm1, ymm0, ymm0) \
-    vmulpd(ymm2, ymm3, ymm3) \
-    vaddsubpd(ymm3, ymm0, ymm0)
-
-//(creal,cimag)*(betar,beati) where c is stored in row major order
-#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
-    vmovupd(mem(rcx), ymm0) \
-    vpermilpd(imm(0x5), ymm0, ymm3) \
-    vmulpd(ymm1, ymm0, ymm0) \
-    vmulpd(ymm2, ymm3, ymm3) \
-    vaddsubpd(ymm3, ymm0, ymm0)
-
-#define ZGEMM_INPUT_RS_BETA_ONE \
-    vmovupd(mem(rcx), ymm0)
-
-#define ZGEMM_OUTPUT_RS \
-    vmovupd(ymm0, mem(rcx)) \
-
-/*(cNextRowreal,cNextRowimag)*(betar,beati)
-   where c is stored in row major order
-   rsi = cs_c * sizeof((real +imag)dt)*numofElements
-   numofElements = 2, 2 elements are processed at a time*/
-#define ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \
-    vmovupd(mem(rcx, rsi, 1), ymm0) \
-    vpermilpd(imm(0x5), ymm0, ymm3) \
-    vmulpd(ymm1, ymm0, ymm0) \
-    vmulpd(ymm2, ymm3, ymm3) \
-    vaddsubpd(ymm3, ymm0, ymm0)
-
-#define ZGEMM_INPUT_RS_BETA_ONE_NEXT \
-    vmovupd(mem(rcx, rsi, 1), ymm0)
-
-#define ZGEMM_OUTPUT_RS_NEXT \
-    vmovupd(ymm0, mem(rcx, rsi, 1))
-
-/*
-   rrr:
-     --------        ------        --------
-     --------   +=   ------ ...    --------
-     --------        ------        --------
-     --------        ------            :
-
-   rcr:
-     --------        | | | |       --------
-     --------   +=   | | | | ...   --------
-     --------        | | | |       --------
-     --------        | | | |           :
-
-   Assumptions:
-   - B is row-stored;
-   - A is row- or column-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
-   (v)ector loads on B and single-element broadcasts from A.
-
-   NOTE: These kernels explicitly support column-oriented IO, implemented
-   via an in-register transpose. And thus they also support the crr and
-   ccr cases, though only crr is ever utilized (because ccr is handled by
-   transposing the operation and executing rcr, which does not incur the
-   cost of the in-register transpose).
-
-   crr:
-     | | | | | | | |       ------        --------
-     | | | | | | | |  +=   ------
-     --------
-     | | | | | | | |       ------        --------
-     | | | | | | | |       ------            :
-*/
-void bli_zgemmsup_rv_zen_asm_3x4m
-     (
-       conj_t       conja,
-       conj_t       conjb,
-       dim_t        m0,
-       dim_t        n0,
-       dim_t        k0,
-       dcomplex*    restrict alpha,
-       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       dcomplex*    restrict beta,
-       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
-     )
-{
-    uint64_t n_left = n0 % 4;
-
-    // First check whether this is a edge case in the n dimension. If so,
-    // dispatch other 3x?m kernels, as needed.
-    if (n_left )
-    {
-        dcomplex*  cij = c;
-        dcomplex*  bj  = b;
-        dcomplex*  ai  = a;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_zgemmsup_rv_zen_asm_3x2m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_zgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, m0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-
-        return;
-    }
-
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-
-    uint64_t k_iter = k0 / 4;
-    uint64_t k_left = k0 % 4;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    char alpha_mul_type = BLIS_MUL_DEFAULT;
-    char beta_mul_type  = BLIS_MUL_DEFAULT;
-
-    //handling case when alpha and beta are real and +/-1.
-
-    if(alpha->imag == 0.0)// (alpha is real)
-    {
-        if(alpha->real == 1.0)          alpha_mul_type = BLIS_MUL_ONE;
-        else if(alpha->real == -1.0)    alpha_mul_type = BLIS_MUL_MINUS_ONE;
-        else if(alpha->real == 0.0)     alpha_mul_type = BLIS_MUL_ZERO;
-    }
-
-    if(beta->imag == 0.0)// (beta is real)
-    {
-        if(beta->real == 1.0)       beta_mul_type = BLIS_MUL_ONE;
-        else if(beta->real == -1.0) beta_mul_type = BLIS_MUL_MINUS_ONE;
-        else if(beta->real == 0.0)  beta_mul_type = BLIS_MUL_ZERO;
-    }
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(a), r14)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    mov(var(cs_a), r9)                 // load cs_a
-    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(real dt)
-    lea(mem(, r8, 2), r8)              // rs_a *= sizeof((real + imag) dt)
-    lea(mem(, r9, 8), r9)              // cs_a *= sizeof( real dt)
-    lea(mem(, r9, 2), r9)              // cs_a *= sizeof((real + imag) dt)
-
-    mov(var(rs_b), r10)                // load rs_b
-    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(real dt)
-    lea(mem(, r10, 2), r10)            // rs_b *= sizeof((real +imag) dt)
-
-                                       // NOTE: We cannot pre-load elements of a or b
-                                       // because it could eventually, in the last
-                                       // unrolled iter or the cleanup loop, result
-                                       // in reading beyond the bounds allocated mem
-                                       // (the likely result: a segmentation fault).
-
-    mov(var(c), r12)                   // load address of c
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(dt)
-    lea(mem(, rdi, 2), rdi)            // rs_c *= sizeof(dt)
-
-    // During preamble and loops:
-    // r12 = rcx = c
-    // r14 = rax = a
-    // read rbx from var(b) near beginning of loop
-    // r11 = m dim index ii
-
-    mov(var(m_iter), r11)              // ii = m_iter;
-
-    label(.ZLOOP3X4I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-    vzeroall()                         // zero all xmm/ymm registers.
-
-    mov(var(b), rbx)                   // load address of b.
-    mov(r14, rax)                      // reset rax to current upanel of a.
-
-    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-    jz(.ZCOLPFETCH)                    // jump to column storage case
-    label(.ZROWPFETCH)                 // row-stored pre-fetching on c // not used
-
-    jmp(.ZPOSTPFETCH)                  // jump to end of pre-fetching c
-    label(.ZCOLPFETCH)                 // column-stored pre-fetching c
-
-    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(dt)
-    label(.ZPOSTPFETCH)                // done prefetching c
-
-    mov(var(k_iter), rsi)              // i = k_iter;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.ZCONSIDKLEFT)                  // if i == 0, jump to code that
-                                       // contains the k_left loop.
-
-    label(.ZLOOPKITER)                 // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    vmovupd(mem(rbx,  1*32), ymm1)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-    vfmadd231pd(ymm1, ymm2, ymm5)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-    vfmadd231pd(ymm1, ymm2, ymm9)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-    vfmadd231pd(ymm1, ymm2, ymm13)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-    vfmadd231pd(ymm1, ymm3, ymm7)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-    vfmadd231pd(ymm1, ymm3, ymm11)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-    vfmadd231pd(ymm1, ymm3, ymm15)
-
-    add(r9, rax)                       // a += cs_a;
-
-    // ---------------------------------- iteration 1
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    vmovupd(mem(rbx,  1*32), ymm1)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-    vfmadd231pd(ymm1, ymm2, ymm5)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-    vfmadd231pd(ymm1, ymm2, ymm9)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-    vfmadd231pd(ymm1, ymm2, ymm13)
-
-    vbroadcastsd(mem(rax, 8    ), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-    vfmadd231pd(ymm1, ymm3, ymm7)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-    vfmadd231pd(ymm1, ymm3, ymm11)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-    vfmadd231pd(ymm1, ymm3, ymm15)
-
-    add(r9, rax)                       // a += cs_a;
-
-    // ---------------------------------- iteration 2
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    vmovupd(mem(rbx,  1*32), ymm1)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-    vfmadd231pd(ymm1, ymm2, ymm5)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-    vfmadd231pd(ymm1, ymm2, ymm9)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-    vfmadd231pd(ymm1, ymm2, ymm13)
-
-    vbroadcastsd(mem(rax, 8 ), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-    vfmadd231pd(ymm1, ymm3, ymm7)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-    vfmadd231pd(ymm1, ymm3, ymm11)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-    vfmadd231pd(ymm1, ymm3, ymm15)
-
-    add(r9, rax)                       // a += cs_a;
-
-    // ---------------------------------- iteration 3
-    vmovupd(mem(rbx, 0*32), ymm0)
-    vmovupd(mem(rbx, 1*32), ymm1)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-    vfmadd231pd(ymm1, ymm2, ymm5)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-    vfmadd231pd(ymm1, ymm2, ymm9)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-    vfmadd231pd(ymm1, ymm2, ymm13)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-    vfmadd231pd(ymm1, ymm3, ymm7)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-    vfmadd231pd(ymm1, ymm3, ymm11)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-    vfmadd231pd(ymm1, ymm3, ymm15)
-
-    add(r9, rax)                       // a += cs_a;
-
-    dec(rsi)                           // i -= 1;
-    jne(.ZLOOPKITER)                   // iterate again if i != 0.
-
-    label(.ZCONSIDKLEFT)
-
-    mov(var(k_left), rsi)              // i = k_left;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.ZPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left loop.
-
-    label(.ZLOOPKLEFT)                 // EDGE LOOP
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    vmovupd(mem(rbx,  1*32), ymm1)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-    vfmadd231pd(ymm1, ymm2, ymm5)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-    vfmadd231pd(ymm1, ymm2, ymm9)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-    vfmadd231pd(ymm1, ymm2, ymm13)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-    vfmadd231pd(ymm1, ymm3, ymm7)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-    vfmadd231pd(ymm1, ymm3, ymm11)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-    vfmadd231pd(ymm1, ymm3, ymm15)
-
-    add(r9, rax)                       // a += cs_a;
-
-    dec(rsi)                           // i -= 1;
-    jne(.ZLOOPKLEFT)                   // iterate again if i != 0.
-
-    label(.ZPOSTACCUM)
-
-    mov(r12, rcx)                      // reset rcx to current utile of c.
-
-    // permute even and odd elements
-     // of ymm6/7, ymm10/11, ymm/14/15
-    vpermilpd(imm(0x5), ymm6, ymm6)
-    vpermilpd(imm(0x5), ymm7, ymm7)
-    vpermilpd(imm(0x5), ymm10, ymm10)
-    vpermilpd(imm(0x5), ymm11, ymm11)
-    vpermilpd(imm(0x5), ymm14, ymm14)
-    vpermilpd(imm(0x5), ymm15, ymm15)
-
-     // subtract/add even/odd elements
-    vaddsubpd(ymm6, ymm4, ymm4)
-    vaddsubpd(ymm7, ymm5, ymm5)
-
-    vaddsubpd(ymm10, ymm8, ymm8)
-    vaddsubpd(ymm11, ymm9, ymm9)
-
-    vaddsubpd(ymm14, ymm12, ymm12)
-    vaddsubpd(ymm15, ymm13, ymm13)
-
-    mov(var(cs_c), rsi)        // load cs_c
-    lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
-    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
-
-    //if(alpha_mul_type == BLIS_MUL_MINUS_ONE)
-    mov(var(alpha_mul_type), al)
-    cmp(imm(0xFF), al)
-    jne(.ALPHA_NOT_MINUS1)
-
-    // when alpha = -1 and real.
-    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
-    vsubpd(ymm4, ymm0, ymm4)
-    vsubpd(ymm5, ymm0, ymm5)
-    vsubpd(ymm8, ymm0, ymm8)
-    vsubpd(ymm9, ymm0, ymm9)
-    vsubpd(ymm12, ymm0, ymm12)
-    vsubpd(ymm13, ymm0, ymm13)
-    jmp(.ALPHA_REAL_ONE)
-
-    label(.ALPHA_NOT_MINUS1)
-    //when alpha is real and +/-1, multiplication is skipped.
-    cmp(imm(2), al)//if(alpha_mul_type != BLIS_MUL_DEFAULT) skip below multiplication.
-    jne(.ALPHA_REAL_ONE)
-
-    /* (ar + ai) x AB */
-    mov(var(alpha), rax)             // load address of alpha
-    vbroadcastsd(mem(rax), ymm0)     // load alpha_r and duplicate
-    vbroadcastsd(mem(rax, 8), ymm1)  // load alpha_i and duplicate
-
-    vpermilpd(imm(0x5), ymm4, ymm3)
-    vmulpd(ymm0, ymm4, ymm4)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm4, ymm4)
-
-    vpermilpd(imm(0x5), ymm5, ymm3)
-    vmulpd(ymm0, ymm5, ymm5)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm5, ymm5)
-
-    vpermilpd(imm(0x5), ymm8, ymm3)
-    vmulpd(ymm0, ymm8, ymm8)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm8, ymm8)
-
-    vpermilpd(imm(0x5), ymm9, ymm3)
-    vmulpd(ymm0, ymm9, ymm9)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm9, ymm9)
-
-    vpermilpd(imm(0x5), ymm12, ymm3)
-    vmulpd(ymm0, ymm12, ymm12)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm12, ymm12)
-
-    vpermilpd(imm(0x5), ymm13, ymm3)
-    vmulpd(ymm0, ymm13, ymm13)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm13, ymm13)
-
-    label(.ALPHA_REAL_ONE)
-    // Beta multiplication
-    /* (br + bi)x C + ((ar + ai) x AB) */
-
-    mov(var(beta_mul_type), al)
-    cmp(imm(0), al)                    //if(beta_mul_type == BLIS_MUL_ZERO)
-    je(.ZBETAZERO)                     //jump to beta == 0 case
-
-    cmp(imm(16), rdi)                  // set ZF if (16*rs_c) ==16.
-    jz(.ZCOLSTORED)                    // jump to column storage case
-
-    label(.ZROWSTORED)
-
-    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt) * numofElements
-
-    cmp(imm(2), al)                    // if(beta_mul_type == BLIS_MUL_DEFAULT)
-    je(.ROW_BETA_NOT_REAL_ONE)         // jump to beta handling with multiplication.
-
-    cmp(imm(0xFF), al)                 // if(beta_mul_type == BLIS_MUL_MINUS_ONE)
-    je(.ROW_BETA_REAL_MINUS1)          // jump to beta real = -1 section.
-
-    //CASE 1: beta is real = 1
-    ZGEMM_INPUT_RS_BETA_ONE
-    vaddpd(ymm4, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_RS_BETA_ONE_NEXT
-    vaddpd(ymm5, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    add(rdi, rcx) // rcx = c + 1*rs_c
-
-    ZGEMM_INPUT_RS_BETA_ONE
-    vaddpd(ymm8, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_RS_BETA_ONE_NEXT
-    vaddpd(ymm9, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    add(rdi, rcx) // rcx = c + 2*rs_c
-
-    ZGEMM_INPUT_RS_BETA_ONE
-    vaddpd(ymm12, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_RS_BETA_ONE_NEXT
-    vaddpd(ymm13, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    jmp(.ZDONE)
-
-
-    //CASE 2: beta is real = -1
-    label(.ROW_BETA_REAL_MINUS1)
-    ZGEMM_INPUT_RS_BETA_ONE
-    vsubpd(ymm0, ymm4, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_RS_BETA_ONE_NEXT
-    vsubpd(ymm0, ymm5, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    add(rdi, rcx) // rcx = c + 1*rs_c
-
-    ZGEMM_INPUT_RS_BETA_ONE
-    vsubpd(ymm0, ymm8, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_RS_BETA_ONE_NEXT
-    vsubpd(ymm0, ymm9, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    add(rdi, rcx) // rcx = c + 2*rs_c
-
-    ZGEMM_INPUT_RS_BETA_ONE
-    vsubpd(ymm0, ymm12, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_RS_BETA_ONE_NEXT
-    vsubpd(ymm0, ymm13,  ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    jmp(.ZDONE)
-
-
-    //CASE 3: Default case with multiplication
-    // beta not equal to (+/-1) or zero, do normal multiplication.
-    label(.ROW_BETA_NOT_REAL_ONE)
-    mov(var(beta), rbx)             // load address of beta
-    vbroadcastsd(mem(rbx), ymm1)    // load beta_r and duplicate
-    vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ
-    vaddpd(ymm4, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
-    vaddpd(ymm5, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    add(rdi, rcx) // rcx = c + 1*rs_c
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ
-    vaddpd(ymm8, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
-    vaddpd(ymm9, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    add(rdi, rcx) // rcx = c + 2*rs_c
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ
-    vaddpd(ymm12, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
-    vaddpd(ymm13, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS_NEXT
-    jmp(.ZDONE)                        // jump to end.
-
-    label(.ZCOLSTORED)
-    mov(var(beta), rbx)              // load address of beta
-    vbroadcastsd(mem(rbx), ymm1)     // load beta_r and duplicate
-    vbroadcastsd(mem(rbx, 8), ymm2)  // load beta_i and duplicate
-    /*|--------|           |-------|
-      |        |           |       |
-      |    3x4 |           |  4x3  |
-      |--------|           |-------|
-    */
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm4, ymm0, ymm4)
-
-    add(rdi, rcx)
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm8, ymm0, ymm8)
-    add(rdi, rcx)
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm12, ymm0, ymm12)
-
-    lea(mem(r12, rsi, 2), rcx)
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm5, ymm0, ymm5)
-    add(rdi, rcx)
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm9, ymm0, ymm9)
-    add(rdi, rcx)
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm13, ymm0, ymm13)
-
-    mov(r12, rcx)                      // reset rcx to current utile of c.
-
-
-    /****3x4 tile going to save into 4x3 tile in C*****/
-
-    /******************Transpose top tile 4x3***************************/
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vextractf128(imm(0x1), ymm4, xmm4)
-    vextractf128(imm(0x1), ymm8, xmm8)
-    vextractf128(imm(0x1), ymm12, xmm12)
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    vmovups(xmm9, mem(rcx, 16))
-    vmovups(xmm13,mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vextractf128(imm(0x1), ymm5, xmm5)
-    vextractf128(imm(0x1), ymm9, xmm9)
-    vextractf128(imm(0x1), ymm13, xmm13)
-    vmovups(xmm5, mem(rcx))
-    vmovups(xmm9, mem(rcx, 16))
-    vmovups(xmm13,mem(rcx,32))
-
-    jmp(.ZDONE)                        // jump to end.
-
-    label(.ZBETAZERO)
-    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-    jz(.ZCOLSTORBZ)                    // jump to column storage case
-
-    label(.ZROWSTORBZ)
-    /* Store 3x4 elements to C matrix where is C row major order*/
-
-    // rsi = cs_c * sizeof((real +imag)dt) *numofElements
-    lea(mem(, rsi, 2), rsi)
-
-    vmovupd(ymm4, mem(rcx))
-    vmovupd(ymm5, mem(rcx, rsi, 1))
-    add(rdi, rcx)
-
-    vmovupd(ymm8, mem(rcx))
-    vmovupd(ymm9, mem(rcx, rsi, 1))
-    add(rdi, rcx)
-
-    vmovupd(ymm12, mem(rcx))
-    vmovupd(ymm13, mem(rcx, rsi, 1))
-
-    jmp(.ZDONE)                        // jump to end.
-
-    label(.ZCOLSTORBZ)
-
-    /****3x4 tile going to save into 4x3 tile in C*****/
-
-    /******************Transpose top tile 4x3***************************/
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vextractf128(imm(0x1), ymm4, xmm4)
-    vextractf128(imm(0x1), ymm8, xmm8)
-    vextractf128(imm(0x1), ymm12, xmm12)
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    vmovups(xmm9, mem(rcx, 16))
-    vmovups(xmm13,mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vextractf128(imm(0x1), ymm5, xmm5)
-    vextractf128(imm(0x1), ymm9, xmm9)
-    vextractf128(imm(0x1), ymm13, xmm13)
-    vmovups(xmm5, mem(rcx))
-    vmovups(xmm9, mem(rcx, 16))
-    vmovups(xmm13,mem(rcx,32))
-
-    label(.ZDONE)
-
-    lea(mem(r12, rdi, 2), r12)
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)
-    lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
-
-    dec(r11)                           // ii -= 1;
-    jne(.ZLOOP3X4I)                    // iterate again if ii != 0.
-
-    label(.ZRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [alpha_mul_type] "m" (alpha_mul_type),
-      [beta_mul_type] "m" (beta_mul_type),
-      [m_iter] "m" (m_iter),
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 4;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        dcomplex*  cij = c + i_edge*rs_c;
-        dcomplex*  ai  = a + i_edge*rs_a;
-        dcomplex*  bj  = b;
-
-        zgemmsup_ker_ft ker_fps[3] =
-        {
-          NULL,
-          bli_zgemmsup_rv_zen_asm_1x4,
-          bli_zgemmsup_rv_zen_asm_2x4,
-        };
-
-        zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
-
-        ker_fp
-        (
-          conja, conjb, m_left, nr_cur, k0,
-          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-          beta, cij, rs_c0, cs_c0, data, cntx
-        );
-        return;
-
-    }
-
-}
-
-void bli_zgemmsup_rv_zen_asm_3x2m
-     (
-       conj_t       conja,
-       conj_t       conjb,
-       dim_t        m0,
-       dim_t        n0,
-       dim_t        k0,
-       dcomplex*    restrict alpha,
-       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       dcomplex*    restrict beta,
-       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
-
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-
-    uint64_t k_iter = k0 / 4;
-    uint64_t k_left = k0 % 4;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(a), r14)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    mov(var(cs_a), r9)                 // load cs_a
-    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(dt)
-    lea(mem(, r8, 2), r8)              // rs_a *= sizeof(dt)
-    lea(mem(, r9, 8), r9)              // cs_a *= sizeof(dt)
-    lea(mem(, r9, 2), r9)              // cs_a *= sizeof(dt)
-
-    mov(var(rs_b), r10)                // load rs_b
-    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(dt)
-    lea(mem(, r10, 2), r10)            // rs_b *= sizeof(dt)
-
-                                       // NOTE: We cannot pre-load elements of a or b
-                                       // because it could eventually, in the last
-                                       // unrolled iter or the cleanup loop, result
-                                       // in reading beyond the bounds allocated mem
-                                       // (the likely result: a segmentation fault).
-
-    mov(var(c), r12)                   // load address of c
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(dt)
-    lea(mem(, rdi, 2), rdi)            // rs_c *= sizeof(dt)
-
-    // During preamble and loops:
-    // r12 = rcx = c
-    // r14 = rax = a
-    // read rbx from var(b) near beginning of loop
-    // r11 = m dim index ii
-
-    mov(var(m_iter), r11)              // ii = m_iter;
-
-    label(.ZLOOP3X2I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-    vzeroall()                         // zero all xmm/ymm registers.
-
-    mov(var(b), rbx)                   // load address of b.
-    mov(r14, rax)                      // reset rax to current upanel of a.
-
-    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-    jz(.ZCOLPFETCH)                    // jump to column storage case
-    label(.ZROWPFETCH)                 // row-stored pre-fetching on c // not used
-
-    jmp(.ZPOSTPFETCH)                  // jump to end of pre-fetching c
-    label(.ZCOLPFETCH)                 // column-stored pre-fetching c
-
-    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(dt)
-
-    label(.ZPOSTPFETCH)                // done prefetching c
-
-    mov(var(k_iter), rsi)              // i = k_iter;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.ZCONSIDKLEFT)                  // if i == 0, jump to code that
-                                       // contains the k_left loop.
-
-    label(.ZLOOPKITER)                 // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-
-    add(r9, rax)                       // a += cs_a;
-
-    // ---------------------------------- iteration 1
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-
-    add(r9, rax)                       // a += cs_a;
-
-    // ---------------------------------- iteration 2
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-
-    add(r9, rax)                       // a += cs_a;
-
-    // ---------------------------------- iteration 3
-    vmovupd(mem(rbx, 0*32), ymm0)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-
-    add(r9, rax)                       // a += cs_a;
-
-    dec(rsi)                           // i -= 1;
-    jne(.ZLOOPKITER)                   // iterate again if i != 0.
-
-    label(.ZCONSIDKLEFT)
-
-    mov(var(k_left), rsi)              // i = k_left;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.ZPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left loop.
-
-    label(.ZLOOPKLEFT)                 // EDGE LOOP
-
-    vmovupd(mem(rbx,  0*32), ymm0)
-    add(r10, rbx)                      // b += rs_b;
-
-    vbroadcastsd(mem(rax        ), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm4)
-
-    vbroadcastsd(mem(rax, r8, 1), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm8)
-
-    vbroadcastsd(mem(rax, r8,  2), ymm2)
-    vfmadd231pd(ymm0, ymm2, ymm12)
-
-    vbroadcastsd(mem(rax, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm6)
-
-    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm10)
-
-    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-    vfmadd231pd(ymm0, ymm3, ymm14)
-
-    add(r9, rax)                       // a += cs_a;
-
-    dec(rsi)                           // i -= 1;
-    jne(.ZLOOPKLEFT)                   // iterate again if i != 0.
-
-    label(.ZPOSTACCUM)
-
-    mov(r12, rcx)                      // reset rcx to current utile of c.
-
-    // permute even and odd elements
-     // of ymm6/7, ymm10/11, ymm/14/15
-    vpermilpd(imm(0x5), ymm6, ymm6)
-    vpermilpd(imm(0x5), ymm10, ymm10)
-    vpermilpd(imm(0x5), ymm14, ymm14)
-
-    // subtract/add even/odd elements
-    vaddsubpd(ymm6, ymm4, ymm4)
-    vaddsubpd(ymm10, ymm8, ymm8)
-    vaddsubpd(ymm14, ymm12, ymm12)
-
-    /* (ar + ai) x AB */
-    mov(var(alpha), rax) // load address of alpha
-    vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
-    vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
-
-    vpermilpd(imm(0x5), ymm4, ymm3)
-    vmulpd(ymm0, ymm4, ymm4)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm4, ymm4)
-
-    vpermilpd(imm(0x5), ymm8, ymm3)
-    vmulpd(ymm0, ymm8, ymm8)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm8, ymm8)
-
-    vpermilpd(imm(0x5), ymm12, ymm3)
-    vmulpd(ymm0, ymm12, ymm12)
-    vmulpd(ymm1, ymm3, ymm3)
-    vaddsubpd(ymm3, ymm12, ymm12)
-
-    /* (br + bi)x C + ((ar + ai) x AB) */
-    mov(var(beta), rbx) // load address of beta
-    vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
-    vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
-
-     // now avoid loading C if beta == 0
-    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
-    vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
-    sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
-    vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
-    sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
-    and(r13b, r15b) // set ZF if r13b & r15b == 1.
-    jne(.ZBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-    jz(.ZCOLSTORED)                    // jump to column storage case
-
-    label(.ZROWSTORED)
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ
-    vaddpd(ymm4, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    add(rdi, rcx) // rcx = c + 1*rs_c
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ
-    vaddpd(ymm8, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    add(rdi, rcx) // rcx = c + 2*rs_c
-
-    ZGEMM_INPUT_SCALE_RS_BETA_NZ
-    vaddpd(ymm12, ymm0, ymm0)
-    ZGEMM_OUTPUT_RS
-
-    jmp(.ZDONE)                        // jump to end.
-
-    label(.ZCOLSTORED)
-    /*|--------|           |-------|
-      |        |           |       |
-      |    3x2 |           |  2x3  |
-      |--------|           |-------|
-    */
-
-    mov(var(cs_c), rsi)        // load cs_c
-    lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
-    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm4, ymm0, ymm4)
-
-    add(rdi, rcx)
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm8, ymm0, ymm8)
-    add(rdi, rcx)
-
-    ZGEMM_INPUT_SCALE_CS_BETA_NZ
-    vaddpd(ymm12, ymm0, ymm12)
-
-    mov(r12, rcx)                      // reset rcx to current utile of c.
-
-    /****3x2 tile going to save into 2x3 tile in C*****/
-
-    /******************Transpose top tile 2x3***************************/
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vextractf128(imm(0x1), ymm4, xmm4)
-    vextractf128(imm(0x1), ymm8, xmm8)
-    vextractf128(imm(0x1), ymm12, xmm12)
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-
-    jmp(.ZDONE)                        // jump to end.
-
-    label(.ZBETAZERO)
-
-    cmp(imm(16), rdi)                   // set ZF if (8*rs_c) == 8.
-    jz(.ZCOLSTORBZ)                    // jump to column storage case
-
-    label(.ZROWSTORBZ)
-
-    vmovupd(ymm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovupd(ymm8, mem(rcx))
-    add(rdi, rcx)
-
-    vmovupd(ymm12, mem(rcx))
-
-    jmp(.ZDONE)                        // jump to end.
-
-    label(.ZCOLSTORBZ)
-
-    /****3x2 tile going to save into 2x3 tile in C*****/
-    mov(var(cs_c), rsi)        // load cs_c
-    lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
-    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-
-    /******************Transpose tile 3x2***************************/
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    add(rsi, rcx)
-
-    vextractf128(imm(0x1), ymm4, xmm4)
-    vextractf128(imm(0x1), ymm8, xmm8)
-    vextractf128(imm(0x1), ymm12, xmm12)
-    vmovups(xmm4, mem(rcx))
-    vmovups(xmm8, mem(rcx, 16))
-    vmovups(xmm12, mem(rcx,32))
-
-    label(.ZDONE)
-
-    lea(mem(r12, rdi, 2), r12)
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)
-    lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
-
-    dec(r11)                           // ii -= 1;
-    jne(.ZLOOP3X2I)                    // iterate again if ii != 0.
-
-    label(.ZRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 4;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        dcomplex*  cij = c + i_edge*rs_c;
-        dcomplex*  ai  = a + i_edge*rs_a;
-        dcomplex*  bj  = b;
-
-        zgemmsup_ker_ft ker_fps[3] =
-        {
-          NULL,
-          bli_zgemmsup_rv_zen_asm_1x2,
-          bli_zgemmsup_rv_zen_asm_2x2,
-        };
-
-        zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
-
-        ker_fp
-        (
-          conja, conjb, m_left, nr_cur, k0,
-          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-          beta, cij, rs_c0, cs_c0, data, cntx
-        );
-        return;
-    }
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/* Column-stored C: packs the 16-byte element at mem(rcx) (low lane) and the one at
+   mem(rcx, rsi, 1) (high lane) into ymm0, then computes (creal,cimag)*(betar,betai) into ymm0.
+   Assumes beta.r, beta.i have been broadcast into ymm1, ymm2; ymm3 is overwritten as scratch. */
+#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \
+    vmovupd(mem(rcx), xmm0) \
+    vmovupd(mem(rcx, rsi, 1), xmm3) \
+    vinsertf128(imm(1), xmm3, ymm0, ymm0) \
+    vpermilpd(imm(0x5), ymm0, ymm3) \
+    vmulpd(ymm1, ymm0, ymm0) \
+    vmulpd(ymm2, ymm3, ymm3) \
+    vaddsubpd(ymm3, ymm0, ymm0)
+
+// Row-stored C: ymm0 = (creal,cimag)*(betar,betai) for the 32 bytes at mem(rcx); expects beta.r/beta.i in ymm1/ymm2, ymm3 is scratch.
+#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
+    vmovupd(mem(rcx), ymm0) \
+    vpermilpd(imm(0x5), ymm0, ymm3) \
+    vmulpd(ymm1, ymm0, ymm0) \
+    vmulpd(ymm2, ymm3, ymm3) \
+    vaddsubpd(ymm3, ymm0, ymm0)
+
+#define ZGEMM_INPUT_RS_BETA_ONE \
+    vmovupd(mem(rcx), ymm0) /* unscaled load of the 32 bytes at mem(rcx) into ymm0 (no beta multiply) */
+
+#define ZGEMM_OUTPUT_RS \
+    vmovupd(ymm0, mem(rcx)) /* store ymm0 to the 32 bytes at mem(rcx); no trailing '\' so the macro ends on this line */
+
+/* Same as ZGEMM_INPUT_SCALE_RS_BETA_NZ, but for the 32 bytes at mem(rcx, rsi, 1):
+   ymm0 = (creal,cimag)*(betar,betai), where c is stored in row major order (ymm3 is scratch).
+   Expects rsi = cs_c * sizeof((real + imag) dt) * numofElements, with
+   numofElements = 2, since 2 elements are processed at a time (i.e. the next two elements of the row). */
+#define ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \
+    vmovupd(mem(rcx, rsi, 1), ymm0) \
+    vpermilpd(imm(0x5), ymm0, ymm3) \
+    vmulpd(ymm1, ymm0, ymm0) \
+    vmulpd(ymm2, ymm3, ymm3) \
+    vaddsubpd(ymm3, ymm0, ymm0)
+
+#define ZGEMM_INPUT_RS_BETA_ONE_NEXT \
+    vmovupd(mem(rcx, rsi, 1), ymm0) /* unscaled load of the 32 bytes at mem(rcx, rsi, 1) into ymm0 */
+
+#define ZGEMM_OUTPUT_RS_NEXT \
+    vmovupd(ymm0, mem(rcx, rsi, 1)) /* store ymm0 to the 32 bytes at mem(rcx, rsi, 1) */
+
+/*
+   rrr:
+     --------        ------        --------
+     --------   +=   ------ ...    --------
+     --------        ------        --------
+     --------        ------            :
+
+   rcr:
+     --------        | | | |       --------
+     --------   +=   | | | | ...   --------
+     --------        | | | |       --------
+     --------        | | | |           :
+
+   Assumptions:
+   - B is row-stored;
+   - A is row- or column-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
+   (v)ector loads on B and single-element broadcasts from A.
+
+   NOTE: These kernels explicitly support column-oriented IO, implemented
+   via an in-register transpose. And thus they also support the crr and
+   ccr cases, though only crr is ever utilized (because ccr is handled by
+   transposing the operation and executing rcr, which does not incur the
+   cost of the in-register transpose).
+
+   crr:
+     | | | | | | | |       ------        --------
+     | | | | | | | |  +=   ------ ...    --------
+     | | | | | | | |       ------        --------
+     | | | | | | | |       ------        --------
+     | | | | | | | |       ------            :
+*/
+void bli_zgemmsup_rv_zen_asm_3x4m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*   restrict data,
+       cntx_t*      restrict cntx
+     )
+{
+    uint64_t n_left = n0 % 4;
+
+    // First check whether this is an edge case in the n dimension. If so,
+    // dispatch other 3x?m kernels, as needed.
+    if (n_left )
+    {
+        dcomplex*  cij = c;
+        dcomplex*  bj  = b;
+        dcomplex*  ai  = a;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_zgemmsup_rv_zen_asm_3x2m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_zgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, m0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+
+        return;
+    }
+
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    // Detect the special cases where alpha/beta are purely real and equal to +1, -1 or 0.
+
+    if(alpha->imag == 0.0)// (alpha is real)
+    {
+        if(alpha->real == 1.0)          alpha_mul_type = BLIS_MUL_ONE;
+        else if(alpha->real == -1.0)    alpha_mul_type = BLIS_MUL_MINUS_ONE;
+        else if(alpha->real == 0.0)     alpha_mul_type = BLIS_MUL_ZERO;
+    }
+
+    if(beta->imag == 0.0)// (beta is real)
+    {
+        if(beta->real == 1.0)       beta_mul_type = BLIS_MUL_ONE;
+        else if(beta->real == -1.0) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if(beta->real == 0.0)  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(a), r14)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    mov(var(cs_a), r9)                 // load cs_a
+    lea(mem(, r8, 8), r8)              // rs_a *= sizeof(real dt)
+    lea(mem(, r8, 2), r8)              // rs_a *= sizeof((real + imag) dt)
+    lea(mem(, r9, 8), r9)              // cs_a *= sizeof( real dt)
+    lea(mem(, r9, 2), r9)              // cs_a *= sizeof((real + imag) dt)
+
+    mov(var(rs_b), r10)                // load rs_b
+    lea(mem(, r10, 8), r10)            // rs_b *= sizeof(real dt)
+    lea(mem(, r10, 2), r10)            // rs_b *= sizeof((real +imag) dt)
+
+                                       // NOTE: We cannot pre-load elements of a or b
+                                       // because it could eventually, in the last
+                                       // unrolled iter or the cleanup loop, result
+                                       // in reading beyond the bounds allocated mem
+                                       // (the likely result: a segmentation fault).
+
+    mov(var(c), r12)                   // load address of c
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(dt)
+    lea(mem(, rdi, 2), rdi)            // rs_c *= sizeof(dt)
+
+    // During preamble and loops:
+    // r12 = rcx = c
+    // r14 = rax = a
+    // read rbx from var(b) near beginning of loop
+    // r11 = m dim index ii
+
+    mov(var(m_iter), r11)              // ii = m_iter;
+
+    label(.ZLOOP3X4I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+    vzeroall()                         // zero all xmm/ymm registers.
+
+    mov(var(b), rbx)                   // load address of b.
+    mov(r14, rax)                      // reset rax to current upanel of a.
+
+    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+    jz(.ZCOLPFETCH)                    // jump to column storage case
+    label(.ZROWPFETCH)                 // row-stored pre-fetching on c // not used
+
+    jmp(.ZPOSTPFETCH)                  // jump to end of pre-fetching c
+    label(.ZCOLPFETCH)                 // column-stored pre-fetching c
+
+    mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
+    lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(dt)
+    label(.ZPOSTPFETCH)                // done prefetching c
+
+    mov(var(k_iter), rsi)              // i = k_iter;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.ZCONSIDKLEFT)                  // if i == 0, jump to code that
+                                       // contains the k_left loop.
+
+    label(.ZLOOPKITER)                 // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    vmovupd(mem(rbx,  1*32), ymm1)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+    vfmadd231pd(ymm1, ymm2, ymm5)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+    vfmadd231pd(ymm1, ymm2, ymm9)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+    vfmadd231pd(ymm1, ymm2, ymm13)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+    vfmadd231pd(ymm1, ymm3, ymm7)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+    vfmadd231pd(ymm1, ymm3, ymm15)
+
+    add(r9, rax)                       // a += cs_a;
+
+    // ---------------------------------- iteration 1
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    vmovupd(mem(rbx,  1*32), ymm1)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+    vfmadd231pd(ymm1, ymm2, ymm5)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+    vfmadd231pd(ymm1, ymm2, ymm9)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+    vfmadd231pd(ymm1, ymm2, ymm13)
+
+    vbroadcastsd(mem(rax, 8    ), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+    vfmadd231pd(ymm1, ymm3, ymm7)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+    vfmadd231pd(ymm1, ymm3, ymm15)
+
+    add(r9, rax)                       // a += cs_a;
+
+    // ---------------------------------- iteration 2
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    vmovupd(mem(rbx,  1*32), ymm1)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+    vfmadd231pd(ymm1, ymm2, ymm5)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+    vfmadd231pd(ymm1, ymm2, ymm9)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+    vfmadd231pd(ymm1, ymm2, ymm13)
+
+    vbroadcastsd(mem(rax, 8 ), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+    vfmadd231pd(ymm1, ymm3, ymm7)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+    vfmadd231pd(ymm1, ymm3, ymm15)
+
+    add(r9, rax)                       // a += cs_a;
+
+    // ---------------------------------- iteration 3
+    vmovupd(mem(rbx, 0*32), ymm0)
+    vmovupd(mem(rbx, 1*32), ymm1)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+    vfmadd231pd(ymm1, ymm2, ymm5)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+    vfmadd231pd(ymm1, ymm2, ymm9)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+    vfmadd231pd(ymm1, ymm2, ymm13)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+    vfmadd231pd(ymm1, ymm3, ymm7)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+    vfmadd231pd(ymm1, ymm3, ymm15)
+
+    add(r9, rax)                       // a += cs_a;
+
+    dec(rsi)                           // i -= 1;
+    jne(.ZLOOPKITER)                   // iterate again if i != 0.
+
+    label(.ZCONSIDKLEFT)
+
+    mov(var(k_left), rsi)              // i = k_left;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.ZPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left loop.
+
+    label(.ZLOOPKLEFT)                 // EDGE LOOP
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    vmovupd(mem(rbx,  1*32), ymm1)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+    vfmadd231pd(ymm1, ymm2, ymm5)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+    vfmadd231pd(ymm1, ymm2, ymm9)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+    vfmadd231pd(ymm1, ymm2, ymm13)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+    vfmadd231pd(ymm1, ymm3, ymm7)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+    vfmadd231pd(ymm1, ymm3, ymm11)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+    vfmadd231pd(ymm1, ymm3, ymm15)
+
+    add(r9, rax)                       // a += cs_a;
+
+    dec(rsi)                           // i -= 1;
+    jne(.ZLOOPKLEFT)                   // iterate again if i != 0.
+
+    label(.ZPOSTACCUM)
+
+    mov(r12, rcx)                      // reset rcx to current utile of c.
+
+    // permute even and odd elements
+     // of ymm6/7, ymm10/11, ymm14/15
+    vpermilpd(imm(0x5), ymm6, ymm6)
+    vpermilpd(imm(0x5), ymm7, ymm7)
+    vpermilpd(imm(0x5), ymm10, ymm10)
+    vpermilpd(imm(0x5), ymm11, ymm11)
+    vpermilpd(imm(0x5), ymm14, ymm14)
+    vpermilpd(imm(0x5), ymm15, ymm15)
+
+     // subtract/add even/odd elements
+    vaddsubpd(ymm6, ymm4, ymm4)
+    vaddsubpd(ymm7, ymm5, ymm5)
+
+    vaddsubpd(ymm10, ymm8, ymm8)
+    vaddsubpd(ymm11, ymm9, ymm9)
+
+    vaddsubpd(ymm14, ymm12, ymm12)
+    vaddsubpd(ymm15, ymm13, ymm13)
+
+    mov(var(cs_c), rsi)        // load cs_c
+    lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
+
+    //if(alpha_mul_type == BLIS_MUL_MINUS_ONE)
+    mov(var(alpha_mul_type), al)
+    cmp(imm(0xFF), al)
+    jne(.ALPHA_NOT_MINUS1)
+
+    // when alpha = -1 and real.
+    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
+    vsubpd(ymm4, ymm0, ymm4)
+    vsubpd(ymm5, ymm0, ymm5)
+    vsubpd(ymm8, ymm0, ymm8)
+    vsubpd(ymm9, ymm0, ymm9)
+    vsubpd(ymm12, ymm0, ymm12)
+    vsubpd(ymm13, ymm0, ymm13)
+    jmp(.ALPHA_REAL_ONE)
+
+    label(.ALPHA_NOT_MINUS1)
+    //when alpha is real and +/-1, multiplication is skipped.
+    cmp(imm(2), al)//if(alpha_mul_type != BLIS_MUL_DEFAULT) skip below multiplication.
+    jne(.ALPHA_REAL_ONE)
+
+    /* (ar + ai) x AB */
+    mov(var(alpha), rax)             // load address of alpha
+    vbroadcastsd(mem(rax), ymm0)     // load alpha_r and duplicate
+    vbroadcastsd(mem(rax, 8), ymm1)  // load alpha_i and duplicate
+
+    vpermilpd(imm(0x5), ymm4, ymm3)
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm4, ymm4)
+
+    vpermilpd(imm(0x5), ymm5, ymm3)
+    vmulpd(ymm0, ymm5, ymm5)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm5, ymm5)
+
+    vpermilpd(imm(0x5), ymm8, ymm3)
+    vmulpd(ymm0, ymm8, ymm8)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm8, ymm8)
+
+    vpermilpd(imm(0x5), ymm9, ymm3)
+    vmulpd(ymm0, ymm9, ymm9)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm9, ymm9)
+
+    vpermilpd(imm(0x5), ymm12, ymm3)
+    vmulpd(ymm0, ymm12, ymm12)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm12, ymm12)
+
+    vpermilpd(imm(0x5), ymm13, ymm3)
+    vmulpd(ymm0, ymm13, ymm13)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm13, ymm13)
+
+    label(.ALPHA_REAL_ONE)
+    // Beta multiplication
+    /* (br + bi)x C + ((ar + ai) x AB) */
+
+    mov(var(beta_mul_type), al)
+    cmp(imm(0), al)                    //if(beta_mul_type == BLIS_MUL_ZERO)
+    je(.ZBETAZERO)                     //jump to beta == 0 case
+
+    cmp(imm(16), rdi)                  // set ZF if (16*rs_c) ==16.
+    jz(.ZCOLSTORED)                    // jump to column storage case
+
+    label(.ZROWSTORED)
+
+    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt) * numofElements
+
+    cmp(imm(2), al)                    // if(beta_mul_type == BLIS_MUL_DEFAULT)
+    je(.ROW_BETA_NOT_REAL_ONE)         // jump to beta handling with multiplication.
+
+    cmp(imm(0xFF), al)                 // if(beta_mul_type == BLIS_MUL_MINUS_ONE)
+    je(.ROW_BETA_REAL_MINUS1)          // jump to beta real = -1 section.
+
+    //CASE 1: beta is real = 1
+    ZGEMM_INPUT_RS_BETA_ONE
+    vaddpd(ymm4, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_RS_BETA_ONE_NEXT
+    vaddpd(ymm5, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    add(rdi, rcx) // rcx = c + 1*rs_c
+
+    ZGEMM_INPUT_RS_BETA_ONE
+    vaddpd(ymm8, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_RS_BETA_ONE_NEXT
+    vaddpd(ymm9, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    add(rdi, rcx) // rcx = c + 2*rs_c
+
+    ZGEMM_INPUT_RS_BETA_ONE
+    vaddpd(ymm12, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_RS_BETA_ONE_NEXT
+    vaddpd(ymm13, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    jmp(.ZDONE)
+
+
+    //CASE 2: beta is real = -1
+    label(.ROW_BETA_REAL_MINUS1)
+    ZGEMM_INPUT_RS_BETA_ONE
+    vsubpd(ymm0, ymm4, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_RS_BETA_ONE_NEXT
+    vsubpd(ymm0, ymm5, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    add(rdi, rcx) // rcx = c + 1*rs_c
+
+    ZGEMM_INPUT_RS_BETA_ONE
+    vsubpd(ymm0, ymm8, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_RS_BETA_ONE_NEXT
+    vsubpd(ymm0, ymm9, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    add(rdi, rcx) // rcx = c + 2*rs_c
+
+    ZGEMM_INPUT_RS_BETA_ONE
+    vsubpd(ymm0, ymm12, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_RS_BETA_ONE_NEXT
+    vsubpd(ymm0, ymm13,  ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    jmp(.ZDONE)
+
+
+    //CASE 3: Default case with multiplication
+    // beta not equal to (+/-1) or zero, do normal multiplication.
+    label(.ROW_BETA_NOT_REAL_ONE)
+    mov(var(beta), rbx)             // load address of beta
+    vbroadcastsd(mem(rbx), ymm1)    // load beta_r and duplicate
+    vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ
+    vaddpd(ymm4, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
+    vaddpd(ymm5, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    add(rdi, rcx) // rcx = c + 1*rs_c
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ
+    vaddpd(ymm8, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
+    vaddpd(ymm9, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    add(rdi, rcx) // rcx = c + 2*rs_c
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ
+    vaddpd(ymm12, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
+    vaddpd(ymm13, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS_NEXT
+    jmp(.ZDONE)                        // jump to end.
+
+    label(.ZCOLSTORED)
+    mov(var(beta), rbx)              // load address of beta
+    vbroadcastsd(mem(rbx), ymm1)     // load beta_r and duplicate
+    vbroadcastsd(mem(rbx, 8), ymm2)  // load beta_i and duplicate
+    /*|--------|           |-------|
+      |        |           |       |
+      |    3x4 |           |  4x3  |
+      |--------|           |-------|
+    */
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm4, ymm0, ymm4)
+
+    add(rdi, rcx)
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm8, ymm0, ymm8)
+    add(rdi, rcx)
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm12, ymm0, ymm12)
+
+    lea(mem(r12, rsi, 2), rcx)
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm5, ymm0, ymm5)
+    add(rdi, rcx)
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm9, ymm0, ymm9)
+    add(rdi, rcx)
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm13, ymm0, ymm13)
+
+    mov(r12, rcx)                      // reset rcx to current utile of c.
+
+
+    /****3x4 tile going to save into 4x3 tile in C*****/
+
+    /******************Transpose top tile 4x3***************************/
+    vmovups(xmm4, mem(rcx))
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    add(rsi, rcx)
+
+    vextractf128(imm(0x1), ymm4, xmm4)
+    vextractf128(imm(0x1), ymm8, xmm8)
+    vextractf128(imm(0x1), ymm12, xmm12)
+    vmovups(xmm4, mem(rcx))
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    add(rsi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    vmovups(xmm9, mem(rcx, 16))
+    vmovups(xmm13,mem(rcx,32))
+
+    add(rsi, rcx)
+
+    vextractf128(imm(0x1), ymm5, xmm5)
+    vextractf128(imm(0x1), ymm9, xmm9)
+    vextractf128(imm(0x1), ymm13, xmm13)
+    vmovups(xmm5, mem(rcx))
+    vmovups(xmm9, mem(rcx, 16))
+    vmovups(xmm13,mem(rcx,32))
+
+    jmp(.ZDONE)                        // jump to end.
+
+    label(.ZBETAZERO)
+    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+    jz(.ZCOLSTORBZ)                    // jump to column storage case
+
+    label(.ZROWSTORBZ)
+    /* Store 3x4 elements to C matrix where is C row major order*/
+
+    // rsi = cs_c * sizeof((real +imag)dt) *numofElements
+    lea(mem(, rsi, 2), rsi)
+
+    vmovupd(ymm4, mem(rcx))
+    vmovupd(ymm5, mem(rcx, rsi, 1))
+    add(rdi, rcx)
+
+    vmovupd(ymm8, mem(rcx))
+    vmovupd(ymm9, mem(rcx, rsi, 1))
+    add(rdi, rcx)
+
+    vmovupd(ymm12, mem(rcx))
+    vmovupd(ymm13, mem(rcx, rsi, 1))
+
+    jmp(.ZDONE)                        // jump to end.
+
+    label(.ZCOLSTORBZ)
+
+    /****3x4 tile going to save into 4x3 tile in C*****/
+
+    /******************Transpose top tile 4x3***************************/
+    vmovups(xmm4, mem(rcx))
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    add(rsi, rcx)
+
+    vextractf128(imm(0x1), ymm4, xmm4)
+    vextractf128(imm(0x1), ymm8, xmm8)
+    vextractf128(imm(0x1), ymm12, xmm12)
+    vmovups(xmm4, mem(rcx))
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    add(rsi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    vmovups(xmm9, mem(rcx, 16))
+    vmovups(xmm13,mem(rcx,32))
+
+    add(rsi, rcx)
+
+    vextractf128(imm(0x1), ymm5, xmm5)
+    vextractf128(imm(0x1), ymm9, xmm9)
+    vextractf128(imm(0x1), ymm13, xmm13)
+    vmovups(xmm5, mem(rcx))
+    vmovups(xmm9, mem(rcx, 16))
+    vmovups(xmm13,mem(rcx,32))
+
+    label(.ZDONE)
+
+    lea(mem(r12, rdi, 2), r12)
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)
+    lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
+
+    dec(r11)                           // ii -= 1;
+    jne(.ZLOOP3X4I)                    // iterate again if ii != 0.
+
+    label(.ZRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [alpha_mul_type] "m" (alpha_mul_type),
+      [beta_mul_type] "m" (beta_mul_type),
+      [m_iter] "m" (m_iter),
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 4;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        dcomplex*  cij = c + i_edge*rs_c;
+        dcomplex*  ai  = a + i_edge*rs_a;
+        dcomplex*  bj  = b;
+
+        zgemmsup_ker_ft ker_fps[3] =
+        {
+          NULL,
+          bli_zgemmsup_rv_zen_asm_1x4,
+          bli_zgemmsup_rv_zen_asm_2x4,
+        };
+
+        zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          conja, conjb, m_left, nr_cur, k0,
+          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+          beta, cij, rs_c0, cs_c0, data, cntx
+        );
+        return;
+
+    }
+
+}
+
+void bli_zgemmsup_rv_zen_asm_3x2m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*   restrict data,
+       cntx_t*      restrict cntx
+
+     )
+{
+    // Computes C := beta*C + alpha*(A*B) on successive 3x2 dcomplex tiles of C,
+    // stepping 3 rows of A/C per pass; the m0 % 3 leftover rows go to the 1x2/2x2
+    // kernels below. n0 is not read; conja/conjb are only forwarded to them.
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+
+    uint64_t k_iter = k0 / 4;          // k loop is unrolled by 4
+    uint64_t k_left = k0 % 4;          // remaining k iterations
+
+    uint64_t m_iter = m0 / 3;          // number of full 3-row passes
+    uint64_t m_left = m0 % 3;          // rows left for the edge kernels
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(a), r14)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    mov(var(cs_a), r9)                 // load cs_a
+    lea(mem(, r8, 8), r8)              // rs_a *= 8
+    lea(mem(, r8, 2), r8)              // rs_a *= 2  (total: rs_a * 16 bytes)
+    lea(mem(, r9, 8), r9)              // cs_a *= 8
+    lea(mem(, r9, 2), r9)              // cs_a *= 2  (total: cs_a * 16 bytes)
+
+    mov(var(rs_b), r10)                // load rs_b
+    lea(mem(, r10, 8), r10)            // rs_b *= 8
+    lea(mem(, r10, 2), r10)            // rs_b *= 2  (total: rs_b * 16 bytes)
+
+                                       // NOTE: We cannot pre-load elements of a or b
+                                       // because it could eventually, in the last
+                                       // unrolled iter or the cleanup loop, result
+                                       // in reading beyond the bounds allocated mem
+                                       // (the likely result: a segmentation fault).
+
+    mov(var(c), r12)                   // load address of c
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 8), rdi)            // rs_c *= 8
+    lea(mem(, rdi, 2), rdi)            // rs_c *= 2  (total: rs_c * 16 bytes)
+
+    // During preamble and loops:
+    // r12 = rcx = c
+    // r14 = rax = a
+    // read rbx from var(b) near beginning of loop
+    // r11 = m dim index ii
+
+    mov(var(m_iter), r11)              // ii = m_iter;
+
+    label(.ZLOOP3X2I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+    vzeroall()                         // zero all xmm/ymm registers.
+
+    mov(var(b), rbx)                   // load address of b.
+    mov(r14, rax)                      // reset rax to current upanel of a.
+
+    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+    jz(.ZCOLPFETCH)                    // jump to column storage case
+    label(.ZROWPFETCH)                 // row-stored pre-fetching on c // not used
+
+    jmp(.ZPOSTPFETCH)                  // jump to end of pre-fetching c
+    label(.ZCOLPFETCH)                 // column-stored pre-fetching c // not used
+
+    mov(var(cs_c), rsi)                // rsi = cs_c; no prefetch is issued and the
+    lea(mem(, rsi, 8), rsi)            // value is discarded: rsi is reloaded below.
+
+    label(.ZPOSTPFETCH)                // done prefetching c
+
+    mov(var(k_iter), rsi)              // i = k_iter;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.ZCONSIDKLEFT)                  // if i == 0, jump to code that
+                                       // contains the k_left loop.
+
+    label(.ZLOOPKITER)                 // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+
+    vmovupd(mem(rbx,  0*32), ymm0)     // load 32 bytes (one 2-element row) of b
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)        // a row 0, double at offset 0
+    vfmadd231pd(ymm0, ymm2, ymm4)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)         // a row 1, double at offset 0
+    vfmadd231pd(ymm0, ymm2, ymm8)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)        // a row 2, double at offset 0
+    vfmadd231pd(ymm0, ymm2, ymm12)
+
+    vbroadcastsd(mem(rax, 8), ymm3)             // a row 0, double at offset 8
+    vfmadd231pd(ymm0, ymm3, ymm6)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)      // a row 1, double at offset 8
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)      // a row 2, double at offset 8
+    vfmadd231pd(ymm0, ymm3, ymm14)
+
+    add(r9, rax)                       // a += cs_a;
+
+    // ---------------------------------- iteration 1
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+
+    add(r9, rax)                       // a += cs_a;
+
+    // ---------------------------------- iteration 2
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+
+    add(r9, rax)                       // a += cs_a;
+
+    // ---------------------------------- iteration 3
+    vmovupd(mem(rbx, 0*32), ymm0)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+
+    add(r9, rax)                       // a += cs_a;
+
+    dec(rsi)                           // i -= 1;
+    jne(.ZLOOPKITER)                   // iterate again if i != 0.
+
+    label(.ZCONSIDKLEFT)
+
+    mov(var(k_left), rsi)              // i = k_left;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.ZPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left loop.
+
+    label(.ZLOOPKLEFT)                 // EDGE LOOP
+
+    vmovupd(mem(rbx,  0*32), ymm0)
+    add(r10, rbx)                      // b += rs_b;
+
+    vbroadcastsd(mem(rax        ), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm4)
+
+    vbroadcastsd(mem(rax, r8, 1), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm8)
+
+    vbroadcastsd(mem(rax, r8,  2), ymm2)
+    vfmadd231pd(ymm0, ymm2, ymm12)
+
+    vbroadcastsd(mem(rax, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm6)
+
+    vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm10)
+
+    vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+    vfmadd231pd(ymm0, ymm3, ymm14)
+
+    add(r9, rax)                       // a += cs_a;
+
+    dec(rsi)                           // i -= 1;
+    jne(.ZLOOPKLEFT)                   // iterate again if i != 0.
+
+    label(.ZPOSTACCUM)
+
+    mov(r12, rcx)                      // reset rcx to current utile of c.
+
+    // swap the two doubles of each 128-bit pair
+     // in ymm6, ymm10 and ymm14
+    vpermilpd(imm(0x5), ymm6, ymm6)
+    vpermilpd(imm(0x5), ymm10, ymm10)
+    vpermilpd(imm(0x5), ymm14, ymm14)
+
+    // subtract/add even/odd elements
+    vaddsubpd(ymm6, ymm4, ymm4)
+    vaddsubpd(ymm10, ymm8, ymm8)
+    vaddsubpd(ymm14, ymm12, ymm12)
+
+    /* (ar + ai) x AB */
+    mov(var(alpha), rax) // load address of alpha
+    vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
+    vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
+
+    vpermilpd(imm(0x5), ymm4, ymm3)
+    vmulpd(ymm0, ymm4, ymm4)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm4, ymm4)
+
+    vpermilpd(imm(0x5), ymm8, ymm3)
+    vmulpd(ymm0, ymm8, ymm8)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm8, ymm8)
+
+    vpermilpd(imm(0x5), ymm12, ymm3)
+    vmulpd(ymm0, ymm12, ymm12)
+    vmulpd(ymm1, ymm3, ymm3)
+    vaddsubpd(ymm3, ymm12, ymm12)
+
+    /* (br + bi)x C + ((ar + ai) x AB) */
+    mov(var(beta), rbx) // load address of beta
+    vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
+    vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
+
+     // now avoid loading C if beta == 0
+    vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
+    vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
+    sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
+    vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
+    sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
+    and(r13b, r15b) // r15b &= r13b; ZF is cleared only when both flags are 1.
+    jne(.ZBETAZERO) // if ZF == 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case
+
+    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+    jz(.ZCOLSTORED)                    // jump to column storage case
+
+    label(.ZROWSTORED)
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ
+    vaddpd(ymm4, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    add(rdi, rcx) // rcx = c + 1*rs_c
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ
+    vaddpd(ymm8, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    add(rdi, rcx) // rcx = c + 2*rs_c
+
+    ZGEMM_INPUT_SCALE_RS_BETA_NZ
+    vaddpd(ymm12, ymm0, ymm0)
+    ZGEMM_OUTPUT_RS
+
+    jmp(.ZDONE)                        // jump to end.
+
+    label(.ZCOLSTORED)
+    /*|--------|           |-------|
+      |        |           |       |
+      |    3x2 |           |  2x3  |
+      |--------|           |-------|
+    */
+
+    mov(var(cs_c), rsi)        // load cs_c
+    lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm4, ymm0, ymm4)
+
+    add(rdi, rcx)                      // rcx = c + 1*rs_c
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm8, ymm0, ymm8)
+    add(rdi, rcx)                      // rcx = c + 2*rs_c
+
+    ZGEMM_INPUT_SCALE_CS_BETA_NZ
+    vaddpd(ymm12, ymm0, ymm12)
+
+    mov(r12, rcx)                      // reset rcx to current utile of c.
+
+    /****3x2 tile going to save into 2x3 tile in C*****/
+
+    /******************Transpose top tile 2x3***************************/
+    vmovups(xmm4, mem(rcx))            // low 128 bits of each row -> column 0
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    add(rsi, rcx)                      // rcx = c + 1*cs_c
+
+    vextractf128(imm(0x1), ymm4, xmm4) // high 128 bits of each row -> column 1
+    vextractf128(imm(0x1), ymm8, xmm8)
+    vextractf128(imm(0x1), ymm12, xmm12)
+    vmovups(xmm4, mem(rcx))
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+
+    jmp(.ZDONE)                        // jump to end.
+
+    label(.ZBETAZERO)
+
+    cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+    jz(.ZCOLSTORBZ)                    // jump to column storage case
+
+    label(.ZROWSTORBZ)
+
+    vmovupd(ymm4, mem(rcx))            // store row 0 without reading c
+    add(rdi, rcx)
+
+    vmovupd(ymm8, mem(rcx))            // store row 1
+    add(rdi, rcx)
+
+    vmovupd(ymm12, mem(rcx))           // store row 2
+
+    jmp(.ZDONE)                        // jump to end.
+
+    label(.ZCOLSTORBZ)
+
+    /****3x2 tile going to save into 2x3 tile in C*****/
+    mov(var(cs_c), rsi)        // load cs_c
+    lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+    lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
+
+    /******************Transpose tile 3x2***************************/
+    vmovups(xmm4, mem(rcx))            // low 128 bits of each row -> column 0
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    add(rsi, rcx)                      // rcx = c + 1*cs_c
+
+    vextractf128(imm(0x1), ymm4, xmm4) // high 128 bits of each row -> column 1
+    vextractf128(imm(0x1), ymm8, xmm8)
+    vextractf128(imm(0x1), ymm12, xmm12)
+    vmovups(xmm4, mem(rcx))
+    vmovups(xmm8, mem(rcx, 16))
+    vmovups(xmm12, mem(rcx,32))
+
+    label(.ZDONE)
+
+    lea(mem(r12, rdi, 2), r12)
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)
+    lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
+
+    dec(r11)                           // ii -= 1;
+    jne(.ZLOOP3X2I)                    // iterate again if ii != 0.
+
+    label(.ZRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm6", "ymm8", "ymm10",
+      "ymm12", "ymm14",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 2;   // this kernel covers a 2-column tile
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        dcomplex*  cij = c + i_edge*rs_c;
+        dcomplex*  ai  = a + i_edge*rs_a;
+        dcomplex*  bj  = b;
+
+        zgemmsup_ker_ft ker_fps[3] =   // indexed by m_left, which is 1 or 2 here
+        {
+          NULL,
+          bli_zgemmsup_rv_zen_asm_1x2,
+          bli_zgemmsup_rv_zen_asm_2x2,
+        };
+
+        zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          conja, conjb, m_left, nr_cur, k0,
+          alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+          beta, cij, rs_c0, cs_c0, data, cntx
+        );
+        return;
+    }
+}
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
index 8d10406a05..07fbd26296 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c
@@ -1753,4 +1753,4 @@ void bli_cgemmsup_rv_zen_asm_3x2m
 	}
 }
 
- 
\ No newline at end of file
+ 
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
index 05e05dfece..898e4006e9 100644
--- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
+++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c
@@ -1,1229 +1,1229 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-// assumes beta.r, beta.i have been broadcast into ymm1, ymm2.
-// outputs to ymm0
-#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \
-	vmovupd(mem(rcx), xmm0) \
-	vmovupd(mem(rcx, rsi, 1), xmm3) \
-	vinsertf128(imm(1), xmm3, ymm0, ymm0) \
-	vpermilpd(imm(0x5), ymm0, ymm3) \
-	vmulpd(ymm1, ymm0, ymm0) \
-	vmulpd(ymm2, ymm3, ymm3) \
-	vaddsubpd(ymm3, ymm0, ymm0)
-
-#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
-	vmovupd(mem(rcx), ymm0) \
-	vpermilpd(imm(0x5), ymm0, ymm3) \
-	vmulpd(ymm1, ymm0, ymm0) \
-	vmulpd(ymm2, ymm3, ymm3) \
-	vaddsubpd(ymm3, ymm0, ymm0)
-
-#define ZGEMM_OUTPUT_RS \
-	vmovupd(ymm0, mem(rcx)) \
-
-#define ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \
-	vmovupd(mem(rcx, rsi, 8), ymm0) \
-	vpermilpd(imm(0x5), ymm0, ymm3) \
-	vmulpd(ymm1, ymm0, ymm0) \
-	vmulpd(ymm2, ymm3, ymm3) \
-	vaddsubpd(ymm3, ymm0, ymm0)
-
-#define ZGEMM_OUTPUT_RS_NEXT \
-	vmovupd(ymm0, mem(rcx, rsi, 8)) \
-
-/*
-   rrr:
-	 --------        ------        --------
-	 --------   +=   ------ ...    --------
-	 --------        ------        --------
-	 --------        ------            :
-
-   rcr:
-	 --------        | | | |       --------
-	 --------   +=   | | | | ...   --------
-	 --------        | | | |       --------
-	 --------        | | | |           :
-
-   Assumptions:
-   - B is row-stored;
-   - A is row- or column-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
-   (v)ector loads on B and single-element broadcasts from A.
-
-   NOTE: These kernels explicitly support column-oriented IO, implemented
-   via an in-register transpose. And thus they also support the crr and
-   ccr cases, though only crr is ever utilized (because ccr is handled by
-   transposing the operation and executing rcr, which does not incur the
-   cost of the in-register transpose).
-
-   crr:
-	 | | | | | | | |       ------        --------
-	 | | | | | | | |  +=   ------ 
-	 --------
-	 | | | | | | | |       ------        --------
-	 | | | | | | | |       ------            :
-*/
-void bli_zgemmsup_rv_zen_asm_3x4m
-     (
-       conj_t       conja,
-       conj_t       conjb,
-       dim_t        m0,
-       dim_t        n0,
-       dim_t        k0,
-       dcomplex*    restrict alpha,
-       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       dcomplex*    restrict beta,
-       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
-     )
-{
-	uint64_t n_left = n0 % 4;
-
-	// First check whether this is a edge case in the n dimension. If so,
-	// dispatch other 3x?m kernels, as needed.
-	if (n_left )
-	{
-        dcomplex*  cij = c;
-        dcomplex*  bj  = b;
-        dcomplex*  ai  = a;
-
-		if ( 2 <= n_left )
-		{
-			const dim_t nr_cur = 2;
-
-			bli_zgemmsup_rv_zen_asm_3x2m
-			(
-			  conja, conjb, m0, nr_cur, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-			  beta, cij, rs_c0, cs_c0, data, cntx
-			);
-			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-		}
-		if ( 1 == n_left )
-		{
-			bli_zgemv_ex
-			(
-			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
-			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-			  beta, cij, rs_c0, cntx, NULL
-			);
-		}
-
-		return;
-	}
-
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(real dt)
-	lea(mem(, r8, 2), r8)              // rs_a *= sizeof((real + imag) dt)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof( real dt)
-	lea(mem(, r9, 2), r9)              // cs_a *= sizeof((real + imag) dt)
-
-	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-
-	mov(var(rs_b), r10)                // load rs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(real dt)
-	lea(mem(, r10, 2), r10)            // rs_b *= sizeof((real +imag) dt)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(dt)
-	lea(mem(, rdi, 2), rdi)            // rs_c *= sizeof(dt)
-
-	// During preamble and loops:
-	// r12 = rcx = c
-	// r14 = rax = a
-	// read rbx from var(b) near beginning of loop
-	// r11 = m dim index ii
-
-	mov(var(m_iter), r11)              // ii = m_iter;
-
-	label(.SLOOP3X8I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-	mov(var(b), rbx)                   // load address of b.
-	mov(r14, rax)                      // reset rax to current upanel of a.
-
-	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-	jz(.SCOLPFETCH)                    // jump to column storage case
-	label(.SROWPFETCH)                 // row-stored pre-fetching on c // not used
-
-	lea(mem(r12, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-	jmp(.SPOSTPFETCH)                  // jump to end of pre-fetching c
-	label(.SCOLPFETCH)                 // column-stored pre-fetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(dt)
-	lea(mem(r12, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-
-	label(.SPOSTPFETCH)                // done prefetching c
-
-	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
-	lea(mem(rax, r8,  4), rdx)         // use rdx for pre-fetching lines
-	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
-
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-
-	label(.SLOOPKITER)                 // MAIN LOOP
-
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	add(r9, rax)                       // a += cs_a;
-
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	vbroadcastsd(mem(rax, 8    ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	add(r9, rax)                       // a += cs_a;
-
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	vbroadcastsd(mem(rax, 8 ), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	add(r9, rax)                       // a += cs_a;
-
-	// ---------------------------------- iteration 3
-	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	vmovupd(mem(rbx, 1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	add(r9, rax)                       // a += cs_a;
-
-	dec(rsi)                           // i -= 1;
-	jne(.SLOOPKITER)                   // iterate again if i != 0.
-
-	label(.SCONSIDKLEFT)
-
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-
-	label(.SLOOPKLEFT)                 // EDGE LOOP
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	vmovupd(mem(rbx,  1*32), ymm1)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-	vfmadd231pd(ymm1, ymm2, ymm5)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-	vfmadd231pd(ymm1, ymm2, ymm9)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-	vfmadd231pd(ymm1, ymm2, ymm13)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-	vfmadd231pd(ymm1, ymm3, ymm7)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-	vfmadd231pd(ymm1, ymm3, ymm11)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-	vfmadd231pd(ymm1, ymm3, ymm15)
-
-	add(r9, rax)                       // a += cs_a;
-
-	dec(rsi)                           // i -= 1;
-	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-
-	label(.SPOSTACCUM)
-
-	mov(r12, rcx)                      // reset rcx to current utile of c.
-
-	// permute even and odd elements
-	 // of ymm6/7, ymm10/11, ymm/14/15
-	vpermilpd(imm(0x5), ymm6, ymm6)
-	vpermilpd(imm(0x5), ymm7, ymm7)
-	vpermilpd(imm(0x5), ymm10, ymm10)
-	vpermilpd(imm(0x5), ymm11, ymm11)
-	vpermilpd(imm(0x5), ymm14, ymm14)
-	vpermilpd(imm(0x5), ymm15, ymm15)
-
-	 // subtract/add even/odd elements
-	vaddsubpd(ymm6, ymm4, ymm4)
-	vaddsubpd(ymm7, ymm5, ymm5)
-
-	vaddsubpd(ymm10, ymm8, ymm8)
-	vaddsubpd(ymm11, ymm9, ymm9)
-
-	vaddsubpd(ymm14, ymm12, ymm12)
-	vaddsubpd(ymm15, ymm13, ymm13)
-
-	/* (ar + ai) x AB */
-	mov(var(alpha), rax) // load address of alpha
-	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
-	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
-
-	vpermilpd(imm(0x5), ymm4, ymm3)
-	vmulpd(ymm0, ymm4, ymm4)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm4, ymm4)
-
-	vpermilpd(imm(0x5), ymm5, ymm3)
-	vmulpd(ymm0, ymm5, ymm5)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm5, ymm5)
-
-	vpermilpd(imm(0x5), ymm8, ymm3)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm8, ymm8)
-
-	vpermilpd(imm(0x5), ymm9, ymm3)
-	vmulpd(ymm0, ymm9, ymm9)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm9, ymm9)
-
-	vpermilpd(imm(0x5), ymm12, ymm3)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm12, ymm12)
-
-	vpermilpd(imm(0x5), ymm13, ymm3)
-	vmulpd(ymm0, ymm13, ymm13)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm13, ymm13)
-
-	/* (�r + �i)x C + ((ar + ai) x AB) */
-	mov(var(beta), rbx) // load address of beta
-	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
-	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
-
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 4), rsi)    // rsi = cs_c * sizeof(dt)
-
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-
-	// now avoid loading C if beta == 0
-	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
-	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
-	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
-	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
-	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
-	and(r13b, r15b) // set ZF if r13b & r15b == 1.
-	jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-
-	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) ==16.
-	jz(.SCOLSTORED)                    // jump to column storage case
-
-	label(.SROWSTORED)
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
-	vaddpd(ymm5, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS_NEXT
-	add(rdi, rcx) // rcx = c + 1*rs_c
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
-	vaddpd(ymm9, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS_NEXT
-	add(rdi, rcx) // rcx = c + 2*rs_c
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
-	vaddpd(ymm13, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS_NEXT
-
-	jmp(.SDONE)                        // jump to end.
-
-	label(.SCOLSTORED)
-	/*|--------|           |-------|
-	  |        |           |       |
-	  |    3x4 |           |  4x3  |
-	  |--------|           |-------|
-	*/
-
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
-	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
-	lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a
-
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm4)
-
-	add(rdi, rcx)
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm8)
-	add(rdi, rcx)
-
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm12)
-
-	lea(mem(r12, rsi, 2), rcx)
-
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm5, ymm0, ymm5)
-	add(rdi, rcx)
-
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm9, ymm0, ymm9)
-	add(rdi, rcx) 
-
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm13, ymm0, ymm13)
-
-	mov(r12, rcx)                      // reset rcx to current utile of c.
-
-
-	/****3x4 tile going to save into 4x3 tile in C*****/
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
-	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
-
-	/******************Transpose top tile 4x3***************************/
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	add(rsi, rcx)
-
-	vextractf128(imm(0x1), ymm4, xmm4)
-	vextractf128(imm(0x1), ymm8, xmm8)
-	vextractf128(imm(0x1), ymm12, xmm12)
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	add(rsi, rcx)
-	
-	vmovups(xmm5, mem(rcx))
-	vmovups(xmm9, mem(rcx, 16))
-	vmovups(xmm13,mem(rcx,32))
-	
-	add(rsi, rcx)
-	
-	vextractf128(imm(0x1), ymm5, xmm5)
-	vextractf128(imm(0x1), ymm9, xmm9)
-	vextractf128(imm(0x1), ymm13, xmm13)
-	vmovups(xmm5, mem(rcx))
-	vmovups(xmm9, mem(rcx, 16))
-	vmovups(xmm13,mem(rcx,32))
-
-	jmp(.SDONE)                        // jump to end.
-
-	label(.SBETAZERO)
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-	jz(.SCOLSTORBZ)                    // jump to column storage case
-
-	label(.SROWSTORBZ)
-
-	vmovupd(ymm4, mem(rcx))
-	vmovupd(ymm5, mem(rcx, rsi, 8))
-	add(rdi, rcx)
-
-	vmovupd(ymm8, mem(rcx))
-	vmovupd(ymm9, mem(rcx, rsi, 8))
-	add(rdi, rcx)
-
-	vmovupd(ymm12, mem(rcx))
-	vmovupd(ymm13, mem(rcx, rsi, 8))
-
-	jmp(.SDONE)                        // jump to end.
-
-	label(.SCOLSTORBZ)
-
-	/****3x4 tile going to save into 4x3 tile in C*****/
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(dt)
-	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof(dt)
-
-	/******************Transpose top tile 4x3***************************/
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	add(rsi, rcx)
-
-	vextractf128(imm(0x1), ymm4, xmm4)
-	vextractf128(imm(0x1), ymm8, xmm8)
-	vextractf128(imm(0x1), ymm12, xmm12)
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	add(rsi, rcx)
-	
-	vmovups(xmm5, mem(rcx))
-	vmovups(xmm9, mem(rcx, 16))
-	vmovups(xmm13,mem(rcx,32))
-	
-	add(rsi, rcx)
-	
-	vextractf128(imm(0x1), ymm5, xmm5)
-	vextractf128(imm(0x1), ymm9, xmm9)
-	vextractf128(imm(0x1), ymm13, xmm13)
-	vmovups(xmm5, mem(rcx))
-	vmovups(xmm9, mem(rcx, 16))
-	vmovups(xmm13,mem(rcx,32))
-
-	label(.SDONE)
-
-	lea(mem(r12, rdi, 2), r12)
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)
-	lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
-
-	dec(r11)                           // ii -= 1;
-	jne(.SLOOP3X8I)                    // iterate again if ii != 0.
-
-	label(.SRETURN)
-
-	end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        dcomplex*  cij = c + i_edge*rs_c;
-        dcomplex*  ai  = a + i_edge*rs_a;
-        dcomplex*  bj  = b;
-
-		zgemmsup_ker_ft ker_fps[3] =
-		{
-		  NULL,
-		  bli_zgemmsup_rv_zen_asm_1x4,
-		  bli_zgemmsup_rv_zen_asm_2x4,
-		};
-
-		zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
-
-		ker_fp
-		(
-		  conja, conjb, m_left, nr_cur, k0,
-		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-		  beta, cij, rs_c0, cs_c0, data, cntx
-		);
-		return;
-
-	}
-
-}
-
-void bli_zgemmsup_rv_zen_asm_3x2m
-     (
-       conj_t       conja,
-       conj_t       conjb,
-       dim_t        m0,
-       dim_t        n0,
-       dim_t        k0,
-       dcomplex*    restrict alpha,
-       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       dcomplex*    restrict beta,
-       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t*   restrict data,
-       cntx_t*      restrict cntx
-
-     )
-{
-	//void*    a_next = bli_auxinfo_next_a( data );
-	//void*    b_next = bli_auxinfo_next_b( data );
-
-	// Typecast local copies of integers in case dim_t and inc_t are a
-	// different size than is expected by load instructions.
-
-	uint64_t k_iter = k0 / 4;
-	uint64_t k_left = k0 % 4;
-
-	uint64_t m_iter = m0 / 3;
-	uint64_t m_left = m0 % 3;
-
-	uint64_t rs_a   = rs_a0;
-	uint64_t cs_a   = cs_a0;
-	uint64_t rs_b   = rs_b0;
-	uint64_t cs_b   = cs_b0;
-	uint64_t rs_c   = rs_c0;
-	uint64_t cs_c   = cs_c0;
-
-	if ( m_iter == 0 ) goto consider_edge_cases;
-
-	// -------------------------------------------------------------------------
-
-	begin_asm()
-
-	mov(var(a), r14)                   // load address of a.
-	mov(var(rs_a), r8)                 // load rs_a
-	mov(var(cs_a), r9)                 // load cs_a
-	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(dt)
-	lea(mem(, r8, 2), r8)              // rs_a *= sizeof(dt)
-	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(dt)
-	lea(mem(, r9, 2), r9)              // cs_a *= sizeof(dt)
-
-//	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-
-	mov(var(rs_b), r10)                // load rs_b
-	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(dt)
-	lea(mem(, r10, 2), r10)            // rs_b *= sizeof(dt)
-
-	                                   // NOTE: We cannot pre-load elements of a or b
-	                                   // because it could eventually, in the last
-	                                   // unrolled iter or the cleanup loop, result
-	                                   // in reading beyond the bounds allocated mem
-	                                   // (the likely result: a segmentation fault).
-
-	mov(var(c), r12)                   // load address of c
-	mov(var(rs_c), rdi)                // load rs_c
-	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(dt)
-	lea(mem(, rdi, 2), rdi)            // rs_c *= sizeof(dt)
-
-	// During preamble and loops:
-	// r12 = rcx = c
-	// r14 = rax = a
-	// read rbx from var(b) near beginning of loop
-	// r11 = m dim index ii
-
-	mov(var(m_iter), r11)              // ii = m_iter;
-
-	label(.SLOOP3X8I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-	vzeroall()                         // zero all xmm/ymm registers.
-
-	mov(var(b), rbx)                   // load address of b.
-	//mov(r12, rcx)                    // reset rcx to current utile of c.
-	mov(r14, rax)                      // reset rax to current upanel of a.
-
-	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-	jz(.SCOLPFETCH)                    // jump to column storage case
-	label(.SROWPFETCH)                 // row-stored pre-fetching on c // not used
-
-	lea(mem(r12, rdi, 2), rdx)         //
-	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
-
-	jmp(.SPOSTPFETCH)                  // jump to end of pre-fetching c
-	label(.SCOLPFETCH)                 // column-stored pre-fetching c
-
-	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
-	lea(mem(, rsi, 8), rsi)            // cs_c *= sizeof(dt)
-	lea(mem(r12, rsi, 2), rdx)         //
-	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*cs_c;
-
-	label(.SPOSTPFETCH)                // done prefetching c
-
-	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a;
-	lea(mem(rax, r8,  4), rdx)         // use rdx for pre-fetching lines
-	lea(mem(rdx, r8,  2), rdx)         // from next upanel of a.
-
-	mov(var(k_iter), rsi)              // i = k_iter;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
-	                                   // contains the k_left loop.
-
-	label(.SLOOPKITER)                 // MAIN LOOP
-
-	// ---------------------------------- iteration 0
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-
-	add(r9, rax)                       // a += cs_a;
-
-	// ---------------------------------- iteration 1
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-
-	add(r9, rax)                       // a += cs_a;
-
-	// ---------------------------------- iteration 2
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-
-	add(r9, rax)                       // a += cs_a;
-
-	// ---------------------------------- iteration 3
-	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
-
-	vmovupd(mem(rbx, 0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-
-	add(r9, rax)                       // a += cs_a;
-
-	dec(rsi)                           // i -= 1;
-	jne(.SLOOPKITER)                   // iterate again if i != 0.
-
-	label(.SCONSIDKLEFT)
-
-	mov(var(k_left), rsi)              // i = k_left;
-	test(rsi, rsi)                     // check i via logical AND.
-	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-	                                   // else, we prepare to enter k_left loop.
-
-	label(.SLOOPKLEFT)                 // EDGE LOOP
-
-	vmovupd(mem(rbx,  0*32), ymm0)
-	add(r10, rbx)                      // b += rs_b;
-
-	vbroadcastsd(mem(rax        ), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm4)
-
-	vbroadcastsd(mem(rax, r8, 1), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm8)
-
-	vbroadcastsd(mem(rax, r8,  2), ymm2)
-	vfmadd231pd(ymm0, ymm2, ymm12)
-
-	vbroadcastsd(mem(rax, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm6)
-
-	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm10)
-
-	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
-	vfmadd231pd(ymm0, ymm3, ymm14)
-
-	add(r9, rax)                       // a += cs_a;
-
-	dec(rsi)                           // i -= 1;
-	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
-
-	label(.SPOSTACCUM)
-
-	mov(r12, rcx)                      // reset rcx to current utile of c.
-
-	// permute even and odd elements
-	 // of ymm6/7, ymm10/11, ymm/14/15
-	vpermilpd(imm(0x5), ymm6, ymm6)
-	vpermilpd(imm(0x5), ymm10, ymm10)
-	vpermilpd(imm(0x5), ymm14, ymm14)
-
-	// subtract/add even/odd elements
-	vaddsubpd(ymm6, ymm4, ymm4)
-
-	vaddsubpd(ymm10, ymm8, ymm8)
-
-	vaddsubpd(ymm14, ymm12, ymm12)
-
-	/* (ar + ai) x AB */
-	mov(var(alpha), rax) // load address of alpha
-	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
-	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
-
-	vpermilpd(imm(0x5), ymm4, ymm3)
-	vmulpd(ymm0, ymm4, ymm4)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm4, ymm4)
-
-	vpermilpd(imm(0x5), ymm8, ymm3)
-	vmulpd(ymm0, ymm8, ymm8)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm8, ymm8)
-
-	vpermilpd(imm(0x5), ymm12, ymm3)
-	vmulpd(ymm0, ymm12, ymm12)
-	vmulpd(ymm1, ymm3, ymm3)
-	vaddsubpd(ymm3, ymm12, ymm12)
-
-	/* (�r + �i)x C + ((ar + ai) x AB) */
-	mov(var(beta), rbx) // load address of beta
-	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
-	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
-
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 4), rsi)    // rsi = cs_c * sizeof(dt)
-
-	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
-	lea(mem(rsi, rsi, 2), rax)         // rax = 3*cs_c;
-
-	 // now avoid loading C if beta == 0
-	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
-	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
-	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
-	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
-	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
-	and(r13b, r15b) // set ZF if r13b & r15b == 1.
-	jne(.SBETAZERO) // if ZF = 1, jump to beta == 0 case
-
-	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
-	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
-	jz(.SCOLSTORED)                    // jump to column storage case
-
-	label(.SROWSTORED)
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-	add(rdi, rcx) // rcx = c + 1*rs_c
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-	add(rdi, rcx) // rcx = c + 2*rs_c
-
-	ZGEMM_INPUT_SCALE_RS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm0)
-	ZGEMM_OUTPUT_RS
-
-	jmp(.SDONE)                        // jump to end.
-
-	label(.SCOLSTORED)
-	/*|--------|           |-------|
-	  |        |           |       |
-	  |    3x2 |           |  2x3  |
-	  |--------|           |-------|
-	*/
-
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
-	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
-	
-	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*rs_a
-
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm4, ymm0, ymm4)
-
-	add(rdi, rcx)
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm8, ymm0, ymm8)
-	add(rdi, rcx)
-	
-	ZGEMM_INPUT_SCALE_CS_BETA_NZ
-	vaddpd(ymm12, ymm0, ymm12)
-
-	mov(r12, rcx)                      // reset rcx to current utile of c.
-
-	/****3x2 tile going to save into 2x3 tile in C*****/
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(dt)
-	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof(dt)
-
-	/******************Transpose top tile 2x3***************************/
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	add(rsi, rcx)
-
-	vextractf128(imm(0x1), ymm4, xmm4)
-	vextractf128(imm(0x1), ymm8, xmm8)
-	vextractf128(imm(0x1), ymm12, xmm12)
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-
-	jmp(.SDONE)                        // jump to end.
-
-	label(.SBETAZERO)
-
-	cmp(imm(16), rdi)                   // set ZF if (8*rs_c) == 8.
-	jz(.SCOLSTORBZ)                    // jump to column storage case
-
-	label(.SROWSTORBZ)
-
-	vmovupd(ymm4, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm8, mem(rcx))
-	add(rdi, rcx)
-	
-	vmovupd(ymm12, mem(rcx))
-
-	jmp(.SDONE)                        // jump to end.
-
-	label(.SCOLSTORBZ)
-
-	/****3x2 tile going to save into 2x3 tile in C*****/
-	mov(var(cs_c), rsi)        // load cs_c
-	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(dt)
-	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof(dt)
-
-	/******************Transpose tile 3x2***************************/
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	add(rsi, rcx)
-
-	vextractf128(imm(0x1), ymm4, xmm4)
-	vextractf128(imm(0x1), ymm8, xmm8)
-	vextractf128(imm(0x1), ymm12, xmm12)
-	vmovups(xmm4, mem(rcx))
-	vmovups(xmm8, mem(rcx, 16))
-	vmovups(xmm12, mem(rcx,32))
-
-	label(.SDONE)
-
-	lea(mem(r12, rdi, 2), r12)
-	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-	lea(mem(r14, r8,  2), r14)
-	lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
-
-	dec(r11)                           // ii -= 1;
-	jne(.SLOOP3X8I)                    // iterate again if ii != 0.
-
-	label(.SRETURN)
-
-	end_asm(
-	: // output operands (none)
-	: // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter] "m" (k_iter),
-      [k_left] "m" (k_left),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-	: // register clobber list
-	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-	  "xmm0", "xmm1", "xmm2", "xmm3",
-	  "xmm4", "xmm5", "xmm6", "xmm7",
-	  "xmm8", "xmm9", "xmm10", "xmm11",
-	  "xmm12", "xmm13", "xmm14", "xmm15",
-	  "memory"
-	)
-
-	consider_edge_cases:
-
-	// Handle edge cases in the m dimension, if they exist.
-	if ( m_left )
-	{
-		const dim_t      nr_cur = 4;
-		const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-		dcomplex*  cij = c + i_edge*rs_c;
-		dcomplex*  ai  = a + i_edge*rs_a;
-		dcomplex*  bj  = b;
-
-		zgemmsup_ker_ft ker_fps[3] =
-		{
-		  NULL,
-		  bli_zgemmsup_rv_zen_asm_1x2,
-		  bli_zgemmsup_rv_zen_asm_2x2,
-		};
-
-		zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
-
-		ker_fp
-		(
-		  conja, conjb, m_left, nr_cur, k0,
-		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-		  beta, cij, rs_c0, cs_c0, data, cntx
-		);
-		return;
-	}
-}
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+// Column-stored C, beta != 0: gather the dcomplex at (rcx) and the one at (rcx + rsi)
+// into ymm0, then ymm0 = beta * ymm0 (beta.r in ymm1, beta.i in ymm2); clobbers ymm3.
+#define ZGEMM_INPUT_SCALE_CS_BETA_NZ \
+	vmovupd(mem(rcx), xmm0) \
+	vmovupd(mem(rcx, rsi, 1), xmm3) \
+	vinsertf128(imm(1), xmm3, ymm0, ymm0) \
+	vpermilpd(imm(0x5), ymm0, ymm3) \
+	vmulpd(ymm1, ymm0, ymm0) \
+	vmulpd(ymm2, ymm3, ymm3) \
+	vaddsubpd(ymm3, ymm0, ymm0)
+// Row-stored C, beta != 0: ymm0 = beta * (32 bytes loaded from (rcx)); clobbers ymm3.
+#define ZGEMM_INPUT_SCALE_RS_BETA_NZ \
+	vmovupd(mem(rcx), ymm0) \
+	vpermilpd(imm(0x5), ymm0, ymm3) \
+	vmulpd(ymm1, ymm0, ymm0) \
+	vmulpd(ymm2, ymm3, ymm3) \
+	vaddsubpd(ymm3, ymm0, ymm0)
+// Stores ymm0 to (rcx). No trailing line continuation: the macro ends on this line.
+#define ZGEMM_OUTPUT_RS \
+	vmovupd(ymm0, mem(rcx))
+// Same as ZGEMM_INPUT_SCALE_RS_BETA_NZ, but loads from (rcx + 8*rsi).
+#define ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT \
+	vmovupd(mem(rcx, rsi, 8), ymm0) \
+	vpermilpd(imm(0x5), ymm0, ymm3) \
+	vmulpd(ymm1, ymm0, ymm0) \
+	vmulpd(ymm2, ymm3, ymm3) \
+	vaddsubpd(ymm3, ymm0, ymm0)
+// Stores ymm0 to (rcx + 8*rsi). No trailing line continuation: the macro ends here.
+#define ZGEMM_OUTPUT_RS_NEXT \
+	vmovupd(ymm0, mem(rcx, rsi, 8))
+
+/*
+   rrr:
+	 --------        ------        --------
+	 --------   +=   ------ ...    --------
+	 --------        ------        --------
+	 --------        ------            :
+
+   rcr:
+	 --------        | | | |       --------
+	 --------   +=   | | | | ...   --------
+	 --------        | | | |       --------
+	 --------        | | | |           :
+
+   Assumptions:
+   - B is row-stored;
+   - A is row- or column-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
+   (v)ector loads on B and single-element broadcasts from A.
+
+   NOTE: These kernels explicitly support column-oriented IO, implemented
+   via an in-register transpose. And thus they also support the crr and
+   ccr cases, though only crr is ever utilized (because ccr is handled by
+   transposing the operation and executing rcr, which does not incur the
+   cost of the in-register transpose).
+
+   crr:
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |  +=   ------ ...    --------
+	 | | | | | | | |       ------        --------
+	 | | | | | | | |       ------            :
+*/
+// zgemmsup "rv" kernel with a 3x4 dcomplex micro-tile. For each group of 3 rows
+// (m0 / 3 groups) it accumulates A*B over k0, scales by alpha, combines with
+// beta*C (C is not loaded when beta == 0) and stores the 3x4 tile into C.
+void bli_zgemmsup_rv_zen_asm_3x4m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*   restrict data,
+       cntx_t*      restrict cntx
+     )
+{
+	uint64_t n_left = n0 % 4;
+
+	// First check whether this is an edge case in the n dimension (n0 % 4 != 0).
+	// If so, dispatch the 3x2m kernel and/or zgemv for those columns and return.
+	if (n_left ) // NOTE(review): only n0 % 4 columns are visited here; presumably n0 <= 4 -- confirm with caller.
+	{
+        dcomplex*  cij = c;
+        dcomplex*  bj  = b;
+        dcomplex*  ai  = a;
+
+		if ( 2 <= n_left )
+		{
+			const dim_t nr_cur = 2;
+
+			bli_zgemmsup_rv_zen_asm_3x2m
+			(
+			  conja, conjb, m0, nr_cur, k0,
+			  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+			  beta, cij, rs_c0, cs_c0, data, cntx
+			);
+			cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur; // step past the 2 columns just handled.
+		}
+		if ( 1 == n_left ) // one column left: handled as a matrix-vector product over all m0 rows.
+		{
+			bli_zgemv_ex
+			(
+			  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+			  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+			  beta, cij, rs_c0, cntx, NULL
+			);
+		}
+
+		return;
+	}
+
+	//void*    a_next = bli_auxinfo_next_a( data );
+	//void*    b_next = bli_auxinfo_next_b( data );
+
+	// Typecast local copies of integers in case dim_t and inc_t are a
+	// different size than is expected by load instructions.
+
+	uint64_t k_iter = k0 / 4;          // number of 4x-unrolled k iterations.
+	uint64_t k_left = k0 % 4;          // remaining k iterations (edge loop).
+
+	uint64_t m_iter = m0 / 3;          // number of full 3-row groups.
+	uint64_t m_left = m0 % 3;          // leftover rows, handled after the asm block.
+
+	uint64_t rs_a   = rs_a0;
+	uint64_t cs_a   = cs_a0;
+	uint64_t rs_b   = rs_b0;
+	uint64_t cs_b   = cs_b0;
+	uint64_t rs_c   = rs_c0;
+	uint64_t cs_c   = cs_c0;
+
+	if ( m_iter == 0 ) goto consider_edge_cases; // fewer than 3 rows: skip the asm loop.
+
+	// -------------------------------------------------------------------------
+	// NOTE(review): conja/conjb are not referenced anywhere in the asm path below.
+	begin_asm()
+
+	mov(var(a), r14)                   // load address of a.
+	mov(var(rs_a), r8)                 // load rs_a
+	mov(var(cs_a), r9)                 // load cs_a
+	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(real dt)
+	lea(mem(, r8, 2), r8)              // rs_a *= sizeof((real + imag) dt)
+	lea(mem(, r9, 8), r9)              // cs_a *= sizeof( real dt)
+	lea(mem(, r9, 2), r9)              // cs_a *= sizeof((real + imag) dt)
+
+	//lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+
+	mov(var(rs_b), r10)                // load rs_b
+	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(real dt)
+	lea(mem(, r10, 2), r10)            // rs_b *= sizeof((real +imag) dt)
+
+	                                   // NOTE: We cannot pre-load elements of a or b
+	                                   // because it could eventually, in the last
+	                                   // unrolled iter or the cleanup loop, result
+	                                   // in reading beyond the bounds allocated mem
+	                                   // (the likely result: a segmentation fault).
+
+	mov(var(c), r12)                   // load address of c
+	mov(var(rs_c), rdi)                // load rs_c
+	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(real dt)
+	lea(mem(, rdi, 2), rdi)            // rs_c *= 2, i.e. rs_c in bytes (16 per dcomplex)
+
+	// During preamble and loops:
+	// r12 = rcx = c
+	// r14 = rax = a
+	// read rbx from var(b) near beginning of loop
+	// r11 = m dim index ii
+
+	mov(var(m_iter), r11)              // ii = m_iter;
+
+	label(.SLOOP3X8I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+	vzeroall()                         // zero all xmm/ymm registers.
+
+	mov(var(b), rbx)                   // load address of b.
+	mov(r14, rax)                      // reset rax to current upanel of a.
+
+	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+	jz(.SCOLPFETCH)                    // jump to column storage case
+	label(.SROWPFETCH)                 // row-stored pre-fetching on c // not used
+
+	lea(mem(r12, rdi, 2), rdx)         //
+	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
+	jmp(.SPOSTPFETCH)                  // jump to end of pre-fetching c
+	label(.SCOLPFETCH)                 // column-stored pre-fetching c
+
+	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
+	lea(mem(, rsi, 8), rsi)            // rsi = 8*cs_c. NOTE(review): half of the 16-byte element size used elsewhere.
+	lea(mem(r12, rsi, 2), rdx)         //
+	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*rsi;
+
+	label(.SPOSTPFETCH)                // NOTE(review): no prefetch is issued above; rdx is overwritten just below.
+
+	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a; (rcx is reset to c before it is read)
+	lea(mem(rax, r8,  4), rdx)         // rdx = a + 4*rs_a
+	lea(mem(rdx, r8,  2), rdx)         // rdx = a + 6*rs_a (next upanel of a); never dereferenced in the loops below.
+
+	mov(var(k_iter), rsi)              // i = k_iter;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
+	                                   // contains the k_left loop.
+
+	label(.SLOOPKITER)                 // MAIN LOOP
+
+	// ---------------------------------- iteration 0
+
+	vmovupd(mem(rbx,  0*32), ymm0)     // ymm0 = 32 bytes of the current row of b (two dcomplex)
+	vmovupd(mem(rbx,  1*32), ymm1)     // ymm1 = the next 32 bytes (two more dcomplex)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)   // broadcast the double at offset 0 of a (row 0)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+	vfmadd231pd(ymm1, ymm2, ymm5)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)    // same for row 1 (a + rs_a)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+	vfmadd231pd(ymm1, ymm2, ymm9)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)   // same for row 2 (a + 2*rs_a)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+	vfmadd231pd(ymm1, ymm2, ymm13)
+
+	vbroadcastsd(mem(rax, 8), ymm3)        // broadcast the double at offset 8 of a (row 0)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+	vfmadd231pd(ymm1, ymm3, ymm7)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3) // same for row 1
+	vfmadd231pd(ymm0, ymm3, ymm10)
+	vfmadd231pd(ymm1, ymm3, ymm11)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3) // same for row 2
+	vfmadd231pd(ymm0, ymm3, ymm14)
+	vfmadd231pd(ymm1, ymm3, ymm15)
+
+	add(r9, rax)                       // a += cs_a;
+
+	// ---------------------------------- iteration 1
+
+	vmovupd(mem(rbx,  0*32), ymm0)
+	vmovupd(mem(rbx,  1*32), ymm1)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+	vfmadd231pd(ymm1, ymm2, ymm5)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+	vfmadd231pd(ymm1, ymm2, ymm9)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+	vfmadd231pd(ymm1, ymm2, ymm13)
+
+	vbroadcastsd(mem(rax, 8    ), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+	vfmadd231pd(ymm1, ymm3, ymm7)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+	vfmadd231pd(ymm1, ymm3, ymm11)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+	vfmadd231pd(ymm1, ymm3, ymm15)
+
+	add(r9, rax)                       // a += cs_a;
+
+	// ---------------------------------- iteration 2
+
+	vmovupd(mem(rbx,  0*32), ymm0)
+	vmovupd(mem(rbx,  1*32), ymm1)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+	vfmadd231pd(ymm1, ymm2, ymm5)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+	vfmadd231pd(ymm1, ymm2, ymm9)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+	vfmadd231pd(ymm1, ymm2, ymm13)
+
+	vbroadcastsd(mem(rax, 8 ), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+	vfmadd231pd(ymm1, ymm3, ymm7)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+	vfmadd231pd(ymm1, ymm3, ymm11)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+	vfmadd231pd(ymm1, ymm3, ymm15)
+
+	add(r9, rax)                       // a += cs_a;
+
+	// ---------------------------------- iteration 3
+	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
+
+	vmovupd(mem(rbx, 0*32), ymm0)
+	vmovupd(mem(rbx, 1*32), ymm1)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+	vfmadd231pd(ymm1, ymm2, ymm5)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+	vfmadd231pd(ymm1, ymm2, ymm9)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+	vfmadd231pd(ymm1, ymm2, ymm13)
+
+	vbroadcastsd(mem(rax, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+	vfmadd231pd(ymm1, ymm3, ymm7)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+	vfmadd231pd(ymm1, ymm3, ymm11)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+	vfmadd231pd(ymm1, ymm3, ymm15)
+
+	add(r9, rax)                       // a += cs_a;
+
+	dec(rsi)                           // i -= 1;
+	jne(.SLOOPKITER)                   // iterate again if i != 0.
+
+	label(.SCONSIDKLEFT)
+
+	mov(var(k_left), rsi)              // i = k_left;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+	                                   // else, we prepare to enter k_left loop.
+
+	label(.SLOOPKLEFT)                 // EDGE LOOP
+
+	vmovupd(mem(rbx,  0*32), ymm0)
+	vmovupd(mem(rbx,  1*32), ymm1)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+	vfmadd231pd(ymm1, ymm2, ymm5)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+	vfmadd231pd(ymm1, ymm2, ymm9)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+	vfmadd231pd(ymm1, ymm2, ymm13)
+
+	vbroadcastsd(mem(rax, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+	vfmadd231pd(ymm1, ymm3, ymm7)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+	vfmadd231pd(ymm1, ymm3, ymm11)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+	vfmadd231pd(ymm1, ymm3, ymm15)
+
+	add(r9, rax)                       // a += cs_a;
+
+	dec(rsi)                           // i -= 1;
+	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
+
+	label(.SPOSTACCUM)
+
+	mov(r12, rcx)                      // reset rcx to current utile of c.
+
+	// permute even and odd elements
+	 // of ymm6/7, ymm10/11, ymm14/15
+	vpermilpd(imm(0x5), ymm6, ymm6)
+	vpermilpd(imm(0x5), ymm7, ymm7)
+	vpermilpd(imm(0x5), ymm10, ymm10)
+	vpermilpd(imm(0x5), ymm11, ymm11)
+	vpermilpd(imm(0x5), ymm14, ymm14)
+	vpermilpd(imm(0x5), ymm15, ymm15)
+
+	 // subtract/add even/odd elements (combines the two partial products per row)
+	vaddsubpd(ymm6, ymm4, ymm4)
+	vaddsubpd(ymm7, ymm5, ymm5)
+
+	vaddsubpd(ymm10, ymm8, ymm8)
+	vaddsubpd(ymm11, ymm9, ymm9)
+
+	vaddsubpd(ymm14, ymm12, ymm12)
+	vaddsubpd(ymm15, ymm13, ymm13)
+
+	/* (alpha_r + i*alpha_i) x AB */
+	mov(var(alpha), rax) // load address of alpha
+	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
+	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
+
+	vpermilpd(imm(0x5), ymm4, ymm3)
+	vmulpd(ymm0, ymm4, ymm4)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm4, ymm4)
+
+	vpermilpd(imm(0x5), ymm5, ymm3)
+	vmulpd(ymm0, ymm5, ymm5)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm5, ymm5)
+
+	vpermilpd(imm(0x5), ymm8, ymm3)
+	vmulpd(ymm0, ymm8, ymm8)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm8, ymm8)
+
+	vpermilpd(imm(0x5), ymm9, ymm3)
+	vmulpd(ymm0, ymm9, ymm9)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm9, ymm9)
+
+	vpermilpd(imm(0x5), ymm12, ymm3)
+	vmulpd(ymm0, ymm12, ymm12)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm12, ymm12)
+
+	vpermilpd(imm(0x5), ymm13, ymm3)
+	vmulpd(ymm0, ymm13, ymm13)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm13, ymm13)
+
+	/* (beta_r + i*beta_i) x C + ((alpha_r + i*alpha_i) x AB) */
+	mov(var(beta), rbx) // load address of beta
+	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
+	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
+
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 4), rsi)    // rsi = 4*cs_c, so mem(rcx, rsi, 8) is rcx + 32*cs_c bytes (2 columns over)
+
+	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
+	lea(mem(rsi, rsi, 2), rax)         // rax = 3*rsi;
+
+	// now avoid loading C if beta == 0
+	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
+	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
+	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
+	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
+	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
+	and(r13b, r15b) // r15b &= r13b; ZF is cleared only when both flags are 1.
+	jne(.SBETAZERO) // if ZF == 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case
+
+	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+
+	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) ==16.
+	jz(.SCOLSTORED)                    // jump to column storage case
+
+	label(.SROWSTORED)                 // the ZGEMM_* macros used below are defined outside this function.
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ
+	vaddpd(ymm4, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
+	vaddpd(ymm5, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS_NEXT
+	add(rdi, rcx) // rcx = c + 1*rs_c
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ
+	vaddpd(ymm8, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
+	vaddpd(ymm9, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS_NEXT
+	add(rdi, rcx) // rcx = c + 2*rs_c
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ
+	vaddpd(ymm12, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ_NEXT
+	vaddpd(ymm13, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS_NEXT
+
+	jmp(.SDONE)                        // jump to end.
+
+	label(.SCOLSTORED)
+	/*|--------|           |-------|
+	  |        |           |       |
+	  |    3x4 |           |  4x3  |
+	  |--------|           |-------|
+	*/
+
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
+	lea(mem(rsi, rsi, 2), r13) // r13 = 3*cs_c (in bytes)
+
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm4, ymm0, ymm4)
+
+	add(rdi, rcx)              // rcx += rs_c (next row of c)
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm8, ymm0, ymm8)
+	add(rdi, rcx)              // rcx += rs_c
+
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm12, ymm0, ymm12)
+
+	lea(mem(r12, rsi, 2), rcx) // rcx = c + 2*cs_c (columns 2 and 3)
+
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm5, ymm0, ymm5)
+	add(rdi, rcx)
+
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm9, ymm0, ymm9)
+	add(rdi, rcx) 
+
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm13, ymm0, ymm13)
+
+	mov(r12, rcx)                      // reset rcx to current utile of c.
+
+	/****3x4 tile going to save into 4x3 tile in C*****/
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
+
+	/******************Transpose top tile 4x3***************************/
+	vmovups(xmm4, mem(rcx))            // column 0: low 128 bits of the three row accumulators
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	add(rsi, rcx)                      // advance to column 1
+
+	vextractf128(imm(0x1), ymm4, xmm4) // column 1: high 128 bits of the same accumulators
+	vextractf128(imm(0x1), ymm8, xmm8)
+	vextractf128(imm(0x1), ymm12, xmm12)
+	vmovups(xmm4, mem(rcx))
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	add(rsi, rcx)                      // advance to column 2
+	
+	vmovups(xmm5, mem(rcx))
+	vmovups(xmm9, mem(rcx, 16))
+	vmovups(xmm13,mem(rcx,32))
+	
+	add(rsi, rcx)                      // advance to column 3
+	
+	vextractf128(imm(0x1), ymm5, xmm5)
+	vextractf128(imm(0x1), ymm9, xmm9)
+	vextractf128(imm(0x1), ymm13, xmm13)
+	vmovups(xmm5, mem(rcx))
+	vmovups(xmm9, mem(rcx, 16))
+	vmovups(xmm13,mem(rcx,32))
+
+	jmp(.SDONE)                        // jump to end.
+
+	label(.SBETAZERO)
+	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+	jz(.SCOLSTORBZ)                    // jump to column storage case
+
+	label(.SROWSTORBZ)
+
+	vmovupd(ymm4, mem(rcx))
+	vmovupd(ymm5, mem(rcx, rsi, 8))    // rsi = 4*cs_c here, so this stores at rcx + 32*cs_c bytes
+	add(rdi, rcx)
+
+	vmovupd(ymm8, mem(rcx))
+	vmovupd(ymm9, mem(rcx, rsi, 8))
+	add(rdi, rcx)
+
+	vmovupd(ymm12, mem(rcx))
+	vmovupd(ymm13, mem(rcx, rsi, 8))
+
+	jmp(.SDONE)                        // jump to end.
+
+	label(.SCOLSTORBZ)
+
+	/****3x4 tile going to save into 4x3 tile in C*****/
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real +imag)dt)
+
+	/******************Transpose top tile 4x3***************************/
+	vmovups(xmm4, mem(rcx))
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	add(rsi, rcx)
+
+	vextractf128(imm(0x1), ymm4, xmm4)
+	vextractf128(imm(0x1), ymm8, xmm8)
+	vextractf128(imm(0x1), ymm12, xmm12)
+	vmovups(xmm4, mem(rcx))
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	add(rsi, rcx)
+	
+	vmovups(xmm5, mem(rcx))
+	vmovups(xmm9, mem(rcx, 16))
+	vmovups(xmm13,mem(rcx,32))
+	
+	add(rsi, rcx)
+	
+	vextractf128(imm(0x1), ymm5, xmm5)
+	vextractf128(imm(0x1), ymm9, xmm9)
+	vextractf128(imm(0x1), ymm13, xmm13)
+	vmovups(xmm5, mem(rcx))
+	vmovups(xmm9, mem(rcx, 16))
+	vmovups(xmm13,mem(rcx,32))
+
+	label(.SDONE)
+
+	lea(mem(r12, rdi, 2), r12)
+	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+	lea(mem(r14, r8,  2), r14)
+	lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
+
+	dec(r11)                           // ii -= 1;
+	jne(.SLOOP3X8I)                    // iterate again if ii != 0.
+
+	label(.SRETURN)
+
+	end_asm(
+	: // output operands (none)
+	: // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+	: // register clobber list
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+	  "xmm0", "xmm1", "xmm2", "xmm3",
+	  "xmm4", "xmm5", "xmm6", "xmm7",
+	  "xmm8", "xmm9", "xmm10", "xmm11",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	)
+
+	consider_edge_cases:
+
+	// Handle edge cases in the m dimension, if they exist.
+	if ( m_left )
+	{
+		const dim_t      nr_cur = 4;
+		const dim_t      i_edge = m0 - ( dim_t )m_left; // index of the first leftover row.
+
+        dcomplex*  cij = c + i_edge*rs_c;
+        dcomplex*  ai  = a + i_edge*rs_a;
+        dcomplex*  bj  = b;
+
+		zgemmsup_ker_ft ker_fps[3] = // indexed by m_left (1 or 2); slot 0 is never selected here.
+		{
+		  NULL,
+		  bli_zgemmsup_rv_zen_asm_1x4,
+		  bli_zgemmsup_rv_zen_asm_2x4,
+		};
+
+		zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+
+		ker_fp
+		(
+		  conja, conjb, m_left, nr_cur, k0,
+		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+		  beta, cij, rs_c0, cs_c0, data, cntx
+		);
+		return;
+	}
+}
+
+// zgemmsup "rv" kernel with a 3x2 dcomplex micro-tile, looping over m0 in steps
+// of 3; leftover rows (m0 % 3) go to the 1x2 / 2x2 kernels. n0 is not read here.
+void bli_zgemmsup_rv_zen_asm_3x2m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t*   restrict data,
+       cntx_t*      restrict cntx
+     )
+{
+	//void*    a_next = bli_auxinfo_next_a( data );
+	//void*    b_next = bli_auxinfo_next_b( data );
+
+	// Typecast local copies of integers in case dim_t and inc_t are a
+	// different size than is expected by load instructions.
+
+	uint64_t k_iter = k0 / 4;          // number of 4x-unrolled k iterations.
+	uint64_t k_left = k0 % 4;          // remaining k iterations (edge loop).
+
+	uint64_t m_iter = m0 / 3;          // number of full 3-row groups.
+	uint64_t m_left = m0 % 3;          // leftover rows, handled after the asm block.
+
+	uint64_t rs_a   = rs_a0;
+	uint64_t cs_a   = cs_a0;
+	uint64_t rs_b   = rs_b0;
+	uint64_t cs_b   = cs_b0;
+	uint64_t rs_c   = rs_c0;
+	uint64_t cs_c   = cs_c0;
+
+	if ( m_iter == 0 ) goto consider_edge_cases; // fewer than 3 rows: skip the asm loop.
+
+	// -------------------------------------------------------------------------
+	// NOTE(review): conja/conjb are not referenced anywhere in the asm path below.
+	begin_asm()
+
+	mov(var(a), r14)                   // load address of a.
+	mov(var(rs_a), r8)                 // load rs_a
+	mov(var(cs_a), r9)                 // load cs_a
+	lea(mem(, r8, 8), r8)              // rs_a *= sizeof(real dt)
+	lea(mem(, r8, 2), r8)              // rs_a *= 2, i.e. rs_a in bytes (16 per dcomplex)
+	lea(mem(, r9, 8), r9)              // cs_a *= sizeof(real dt)
+	lea(mem(, r9, 2), r9)              // cs_a *= 2, i.e. cs_a in bytes
+
+//	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+
+	mov(var(rs_b), r10)                // load rs_b
+	lea(mem(, r10, 8), r10)            // rs_b *= sizeof(real dt)
+	lea(mem(, r10, 2), r10)            // rs_b *= 2, i.e. rs_b in bytes
+
+	                                   // NOTE: We cannot pre-load elements of a or b
+	                                   // because it could eventually, in the last
+	                                   // unrolled iter or the cleanup loop, result
+	                                   // in reading beyond the bounds allocated mem
+	                                   // (the likely result: a segmentation fault).
+
+	mov(var(c), r12)                   // load address of c
+	mov(var(rs_c), rdi)                // load rs_c
+	lea(mem(, rdi, 8), rdi)            // rs_c *= sizeof(real dt)
+	lea(mem(, rdi, 2), rdi)            // rs_c *= 2, i.e. rs_c in bytes
+
+	// During preamble and loops:
+	// r12 = rcx = c
+	// r14 = rax = a
+	// read rbx from var(b) near beginning of loop
+	// r11 = m dim index ii
+
+	mov(var(m_iter), r11)              // ii = m_iter;
+
+	label(.SLOOP3X8I)                 // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+	vzeroall()                         // zero all xmm/ymm registers.
+
+	mov(var(b), rbx)                   // load address of b.
+	//mov(r12, rcx)                    // reset rcx to current utile of c.
+	mov(r14, rax)                      // reset rax to current upanel of a.
+
+	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+	jz(.SCOLPFETCH)                    // jump to column storage case
+	label(.SROWPFETCH)                 // row-stored pre-fetching on c // not used
+
+	lea(mem(r12, rdi, 2), rdx)         //
+	lea(mem(rdx, rdi, 1), rdx)         // rdx = c + 3*rs_c;
+
+	jmp(.SPOSTPFETCH)                  // jump to end of pre-fetching c
+	label(.SCOLPFETCH)                 // column-stored pre-fetching c
+
+	mov(var(cs_c), rsi)                // load cs_c to rsi (temporarily)
+	lea(mem(, rsi, 8), rsi)            // rsi = 8*cs_c. NOTE(review): half of the 16-byte element size used elsewhere.
+	lea(mem(r12, rsi, 2), rdx)         //
+	lea(mem(rdx, rsi, 1), rdx)         // rdx = c + 3*rsi;
+
+	label(.SPOSTPFETCH)                // NOTE(review): no prefetch is issued above; rdx is overwritten just below.
+
+	lea(mem(r9, r9, 2), rcx)           // rcx = 3*cs_a; (rcx is reset to c before it is read)
+	lea(mem(rax, r8,  4), rdx)         // rdx = a + 4*rs_a
+	lea(mem(rdx, r8,  2), rdx)         // rdx = a + 6*rs_a (next upanel of a); never dereferenced in the loops below.
+
+	mov(var(k_iter), rsi)              // i = k_iter;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.SCONSIDKLEFT)                  // if i == 0, jump to code that
+	                                   // contains the k_left loop.
+
+	label(.SLOOPKITER)                 // MAIN LOOP
+
+	// ---------------------------------- iteration 0
+
+	vmovupd(mem(rbx,  0*32), ymm0)     // ymm0 = 32 bytes of the current row of b (two dcomplex)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)   // broadcast the double at offset 0 of a (row 0)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)    // same for row 1 (a + rs_a)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)   // same for row 2 (a + 2*rs_a)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+
+	vbroadcastsd(mem(rax, 8), ymm3)        // broadcast the double at offset 8 of a (row 0)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3) // same for row 1
+	vfmadd231pd(ymm0, ymm3, ymm10)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3) // same for row 2
+	vfmadd231pd(ymm0, ymm3, ymm14)
+
+	add(r9, rax)                       // a += cs_a;
+
+	// ---------------------------------- iteration 1
+
+	vmovupd(mem(rbx,  0*32), ymm0)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+
+	vbroadcastsd(mem(rax, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+
+	add(r9, rax)                       // a += cs_a;
+
+	// ---------------------------------- iteration 2
+
+	vmovupd(mem(rbx,  0*32), ymm0)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+
+	vbroadcastsd(mem(rax, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+
+	add(r9, rax)                       // a += cs_a;
+
+	// ---------------------------------- iteration 3
+	lea(mem(rdx, r9,  4), rdx)         // a_prefetch += 4*cs_a;
+
+	vmovupd(mem(rbx, 0*32), ymm0)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+
+	vbroadcastsd(mem(rax, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+
+	add(r9, rax)                       // a += cs_a;
+
+	dec(rsi)                           // i -= 1;
+	jne(.SLOOPKITER)                   // iterate again if i != 0.
+
+	label(.SCONSIDKLEFT)
+
+	mov(var(k_left), rsi)              // i = k_left;
+	test(rsi, rsi)                     // check i via logical AND.
+	je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+	                                   // else, we prepare to enter k_left loop.
+
+	label(.SLOOPKLEFT)                 // EDGE LOOP
+
+	vmovupd(mem(rbx,  0*32), ymm0)
+	add(r10, rbx)                      // b += rs_b;
+
+	vbroadcastsd(mem(rax        ), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm4)
+
+	vbroadcastsd(mem(rax, r8, 1), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm8)
+
+	vbroadcastsd(mem(rax, r8,  2), ymm2)
+	vfmadd231pd(ymm0, ymm2, ymm12)
+
+	vbroadcastsd(mem(rax, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm6)
+
+	vbroadcastsd(mem(rax, r8, 1, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm10)
+
+	vbroadcastsd(mem(rax, r8, 2, 8), ymm3)
+	vfmadd231pd(ymm0, ymm3, ymm14)
+
+	add(r9, rax)                       // a += cs_a;
+
+	dec(rsi)                           // i -= 1;
+	jne(.SLOOPKLEFT)                   // iterate again if i != 0.
+
+	label(.SPOSTACCUM)
+
+	mov(r12, rcx)                      // reset rcx to current utile of c.
+
+	// permute even and odd elements
+	 // of ymm6, ymm10, ymm14
+	vpermilpd(imm(0x5), ymm6, ymm6)
+	vpermilpd(imm(0x5), ymm10, ymm10)
+	vpermilpd(imm(0x5), ymm14, ymm14)
+
+	// subtract/add even/odd elements (combines the two partial products per row)
+	vaddsubpd(ymm6, ymm4, ymm4)
+
+	vaddsubpd(ymm10, ymm8, ymm8)
+
+	vaddsubpd(ymm14, ymm12, ymm12)
+
+	/* (alpha_r + i*alpha_i) x AB */
+	mov(var(alpha), rax) // load address of alpha
+	vbroadcastsd(mem(rax), ymm0) // load alpha_r and duplicate
+	vbroadcastsd(mem(rax, 8), ymm1) // load alpha_i and duplicate
+
+	vpermilpd(imm(0x5), ymm4, ymm3)
+	vmulpd(ymm0, ymm4, ymm4)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm4, ymm4)
+
+	vpermilpd(imm(0x5), ymm8, ymm3)
+	vmulpd(ymm0, ymm8, ymm8)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm8, ymm8)
+
+	vpermilpd(imm(0x5), ymm12, ymm3)
+	vmulpd(ymm0, ymm12, ymm12)
+	vmulpd(ymm1, ymm3, ymm3)
+	vaddsubpd(ymm3, ymm12, ymm12)
+
+	/* (beta_r + i*beta_i) x C + ((alpha_r + i*alpha_i) x AB) */
+	mov(var(beta), rbx) // load address of beta
+	vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate
+	vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate
+
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 4), rsi)    // rsi = 4*cs_c
+
+	lea(mem(rcx, rdi, 4), rdx)         // load address of c +  4*rs_c;
+	lea(mem(rsi, rsi, 2), rax)         // rax = 3*rsi;
+
+	 // now avoid loading C if beta == 0
+	vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
+	vucomisd(xmm0, xmm1) // set ZF if beta_r == 0.
+	sete(r13b) // r13b = ( ZF == 1 ? 1 : 0 );
+	vucomisd(xmm0, xmm2) // set ZF if beta_i == 0.
+	sete(r15b) // r15b = ( ZF == 1 ? 1 : 0 );
+	and(r13b, r15b) // r15b &= r13b; ZF is cleared only when both flags are 1.
+	jne(.SBETAZERO) // if ZF == 0 (beta_r == 0 and beta_i == 0), jump to beta == 0 case
+
+	lea(mem(r8, r8, 2), r13)           // r13 = 3*rs_a
+	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+	jz(.SCOLSTORED)                    // jump to column storage case
+
+	label(.SROWSTORED)                 // the ZGEMM_* macros used below are defined outside this function.
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ
+	vaddpd(ymm4, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS
+
+	add(rdi, rcx) // rcx = c + 1*rs_c
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ
+	vaddpd(ymm8, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS
+
+	add(rdi, rcx) // rcx = c + 2*rs_c
+
+	ZGEMM_INPUT_SCALE_RS_BETA_NZ
+	vaddpd(ymm12, ymm0, ymm0)
+	ZGEMM_OUTPUT_RS
+
+	jmp(.SDONE)                        // jump to end.
+
+	label(.SCOLSTORED)
+	/*|--------|           |-------|
+	  |        |           |       |
+	  |    3x2 |           |  2x3  |
+	  |--------|           |-------|
+	*/
+
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
+	
+	lea(mem(rsi, rsi, 2), r13)           // r13 = 3*cs_c (in bytes)
+
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm4, ymm0, ymm4)
+
+	add(rdi, rcx)              // rcx += rs_c (next row of c)
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm8, ymm0, ymm8)
+	add(rdi, rcx)              // rcx += rs_c
+	
+	ZGEMM_INPUT_SCALE_CS_BETA_NZ
+	vaddpd(ymm12, ymm0, ymm12)
+
+	mov(r12, rcx)                      // reset rcx to current utile of c.
+
+	/****3x2 tile going to save into 2x3 tile in C*****/
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
+
+	/******************Transpose top tile 2x3***************************/
+	vmovups(xmm4, mem(rcx))            // column 0: low 128 bits of the three row accumulators
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	add(rsi, rcx)                      // advance to column 1
+
+	vextractf128(imm(0x1), ymm4, xmm4) // column 1: high 128 bits of the same accumulators
+	vextractf128(imm(0x1), ymm8, xmm8)
+	vextractf128(imm(0x1), ymm12, xmm12)
+	vmovups(xmm4, mem(rcx))
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	jmp(.SDONE)                        // jump to end.
+
+	label(.SBETAZERO)
+
+	cmp(imm(16), rdi)                   // set ZF if (16*rs_c) == 16.
+	jz(.SCOLSTORBZ)                    // jump to column storage case
+
+	label(.SROWSTORBZ)
+
+	vmovupd(ymm4, mem(rcx))
+	add(rdi, rcx)
+	
+	vmovupd(ymm8, mem(rcx))
+	add(rdi, rcx)
+	
+	vmovupd(ymm12, mem(rcx))
+
+	jmp(.SDONE)                        // jump to end.
+
+	label(.SCOLSTORBZ)
+
+	/****3x2 tile going to save into 2x3 tile in C*****/
+	mov(var(cs_c), rsi)        // load cs_c
+	lea(mem(, rsi, 8), rsi)    // rsi = cs_c * sizeof(real dt)
+	lea(mem(, rsi, 2), rsi)    // rsi = cs_c * sizeof((real+imag) dt)
+
+	/******************Transpose tile 3x2***************************/
+	vmovups(xmm4, mem(rcx))
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	add(rsi, rcx)
+
+	vextractf128(imm(0x1), ymm4, xmm4)
+	vextractf128(imm(0x1), ymm8, xmm8)
+	vextractf128(imm(0x1), ymm12, xmm12)
+	vmovups(xmm4, mem(rcx))
+	vmovups(xmm8, mem(rcx, 16))
+	vmovups(xmm12, mem(rcx,32))
+
+	label(.SDONE)
+
+	lea(mem(r12, rdi, 2), r12)
+	lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+	lea(mem(r14, r8,  2), r14)
+	lea(mem(r14, r8,  1), r14)         //a_ii = r14 += 3*rs_a
+
+	dec(r11)                           // ii -= 1;
+	jne(.SLOOP3X8I)                    // iterate again if ii != 0.
+
+	label(.SRETURN)
+
+	end_asm(
+	: // output operands (none)
+	: // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+	: // register clobber list
+	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+	  "xmm0", "xmm1", "xmm2", "xmm3",
+	  "xmm4", "xmm5", "xmm6", "xmm7",
+	  "xmm8", "xmm9", "xmm10", "xmm11",
+	  "xmm12", "xmm13", "xmm14", "xmm15",
+	  "memory"
+	)
+
+	consider_edge_cases:
+
+	// Handle edge cases in the m dimension, if they exist.
+	if ( m_left )
+	{
+		const dim_t      nr_cur = 2; // this kernel's tile is 2 columns wide; the ?x2 kernels get n = 2.
+		const dim_t      i_edge = m0 - ( dim_t )m_left; // index of the first leftover row.
+
+		dcomplex*  cij = c + i_edge*rs_c;
+		dcomplex*  ai  = a + i_edge*rs_a;
+		dcomplex*  bj  = b;
+
+		zgemmsup_ker_ft ker_fps[3] = // indexed by m_left (1 or 2); slot 0 is never selected here.
+		{
+		  NULL,
+		  bli_zgemmsup_rv_zen_asm_1x2,
+		  bli_zgemmsup_rv_zen_asm_2x2,
+		};
+
+		zgemmsup_ker_ft ker_fp = ker_fps[ m_left ];
+
+		ker_fp
+		(
+		  conja, conjb, m_left, nr_cur, k0,
+		  alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+		  beta, cij, rs_c0, cs_c0, data, cntx
+		);
+		return;
+	}
+}
diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
index 4eebb2b0a5..00773b3b58 100644
--- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16m.c
@@ -1,1965 +1,1965 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-
-   NOTE: These kernels implicitly support column-oriented IO, implemented
-   via an a high-level transposition of the entire operation. A and B will
-   effectively remain row- and column-stored, respectively, but C will then
-   effectively appear column-stored. Thus, this kernel may be used for both
-   rrc and crc cases.
-*/
-
-// Prototype reference microkernels.
-
-void bli_sgemmsup_rd_zen_asm_6x16m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-
-    uint64_t n_left = n0 % 16;
-
-    // First check whether this is a edge case in the n dimension. If so,
-    // dispatch other 6x?m kernels, as needed.
-    if ( n_left )
-    {
-        float* restrict cij = c;
-        float* restrict bj  = b;
-        float* restrict ai  = a;
-
-        if ( 8 <= n_left )
-        {
-            const dim_t nr_cur = 8;
-
-            bli_sgemmsup_rd_zen_asm_6x8m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 4 <= n_left )
-        {
-            const dim_t nr_cur = 4;
-
-            bli_sgemmsup_rd_zen_asm_6x4m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_6x2m
-            (
-              conja, conjb, m0, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, m0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-        return;
-    }
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 3
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------              :
+     --------        ------              :
+
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
+*/
+
+// Prototype reference microkernels.
+
+void bli_sgemmsup_rd_zen_asm_6x16m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+
+    uint64_t n_left = n0 % 16;
+
+    // First check whether this is an edge case in the n dimension. If so,
+    // dispatch other 6x?m kernels, as needed.
+    if ( n_left )
+    {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+
+            bli_sgemmsup_rd_zen_asm_6x8m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+
+            bli_sgemmsup_rd_zen_asm_6x4m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_6x2m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, m0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(r11, rsi)                     // rsi *= cs_b;
+    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 3
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15 
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(16), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 16;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x16
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x16
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_6x8m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 1
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 3
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(8), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 8;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x8
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x8
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
-
-
-void bli_sgemmsup_rd_zen_asm_6x4m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 3
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(16), r15)                   // compare jj to 16
+    jl(.SLOOP3X4J)                    // if jj < 16, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 16;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x16
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x16
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_6x8m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(imm(1*4), rsi)                // rsi *= cs_c*sizeof(float) = 1*4
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(r11, rsi)                     // rsi *= cs_b;
+    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 1
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 3
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here because
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15 
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(4), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 4;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x4
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x4
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_6x2m
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t m_iter = m0 / 3;
-    uint64_t m_left = m0 % 3;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( m_iter == 0 ) goto consider_edge_cases;
- 
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), rdx)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
-
-    // r12 = rcx = c
-    // r14 = rax = a
-    // rdx = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r15)                   // jj = 0;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
-
-
-
-    mov(var(a), r14)                   // load address of a
-    mov(var(c), r12)                   // load address of c
-    mov(var(b), rdx)
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
-
-    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
-    imul(r11, rsi)                     // rsi *= cs_b;
-    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
-
-
-
-    mov(var(m_iter), r9)               // ii = m_iter;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
-
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(r14), rax)                 // rax = a_ii;
-    lea(mem(rdx), rbx)                 // rbx = b_jj;
-
-    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    // ---------------------------------- iteration 1
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    // ---------------------------------- iteration 3
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
-    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
-    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  
-                                       // ymm5  ymm8 
-                                       // ymm6  ymm9 
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-    vhaddps(xmm0,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
     vaddps( xmm0, xmm1, xmm0 )
-    vhaddps(xmm0,xmm0,xmm5)
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-    vhaddps(xmm0,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7)
-                                       // ymm5 = sum(ymm5) sum(ymm8)
-                                       // ymm6 = sum(ymm6) sum(ymm9)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                           // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vmovsd(mem(rcx), xmm0)////a0a1
-    vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1)
-    vmovsd(xmm4, mem(rcx))//a0a1
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
     add(rdi, rcx)
-    vmovsd(mem(rcx), xmm0)
-    vfmadd231ps(xmm0, xmm3, xmm5)
-    vmovsd(xmm5, mem(rcx))
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
     add(rdi, rcx)
-    vmovsd(mem(rcx), xmm0)
-    vfmadd231ps(xmm0, xmm3, xmm6)
-    vmovsd(xmm6, mem(rcx))
 
-    jmp(.SDONE)                        // jump to end.
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+    jmp(.SDONE)                        // jump to end.
 
     label(.SBETAZERO)
-    label(.SROWSTORBZ)
 
-    vmovsd(xmm4, mem(rcx))
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
     add(rdi, rcx)
-    vmovsd(xmm5, mem(rcx))
+
+    vmovups(xmm5, mem(rcx))
     add(rdi, rcx)
-    vmovsd(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    lea(mem(r12, rdi, 2), r12)         //
-    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
-
-    lea(mem(r14, r8,  2), r14)         //
-    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
-
-    dec(r9)                            // ii -= 1;
-    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
-
-    add(imm(4), r15)                   // jj += 4;
-    cmp(imm(4), r15)                   // compare jj to 4
-    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
-                                       // of jj loop; otherwise, loop ends.
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [m_iter] "m" (m_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( m_left )
-    {
-        const dim_t      nr_cur = 2;
-        const dim_t      i_edge = m0 - ( dim_t )m_left;
-
-        float* restrict cij = c + i_edge*rs_c;
-        float* restrict bj  = b;
-        float* restrict ai  = a + i_edge*rs_a;
-
-        if ( 2 == m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            const dim_t mr_cur = 1;
-
-            bli_sgemmsup_rd_zen_asm_1x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-        }
-    }
-}
-
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(8), r15)                   // compare jj to 4
+    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 8;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x8
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x8
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+
+
+void bli_sgemmsup_rd_zen_asm_6x4m
+     (
+       conj_t              conja,  // not read here; forwarded to the edge-case kernels
+       conj_t              conjb,  // not read here; forwarded to the edge-case kernels
+       dim_t               m0,     // number of rows of c to compute
+       dim_t               n0,     // not read in this function; 4 columns are always processed
+       dim_t               k0,     // length of the dot products (inner dimension)
+       float*    restrict alpha,   // pointer to the scalar applied to a*b
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,  // a and its strides in elements; the asm reads only rs_a
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,  // b and its strides in elements; the asm reads only cs_b
+       float*    restrict beta,    // pointer to the scalar applied to the incoming c
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,  // c (updated in place) and its strides; the asm reads only rs_c
+       auxinfo_t* restrict data,   // not read here; forwarded to the edge-case kernels
+       cntx_t*    restrict cntx    // not read here; forwarded to the edge-case kernels
+     )
+{
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+    // Computes c := beta*c + alpha*a*b in single precision for an m0 x 4 block of c, three rows at a time, each entry as a dot product along k.
+
+    // Typecast local copies of integers in case dim_t and inc_t are a different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;       // trips of the main loop (4 unrolled steps of 8 floats each)
+    uint64_t k_left32 = k0 % 32;       // k elements left after the main loop
+    uint64_t k_iter8  = k_left32 / 8;  // trips of the 8-float edge loop
+    uint64_t k_left1  = k_left32 % 8;  // trips of the one-element edge loop
+
+    uint64_t m_iter = m0 / 3;          // number of full 3-row blocks handled by the asm
+    uint64_t m_left = m0 % 3;          // leftover rows (0, 1 or 2) handled after the asm
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;           // passed as an asm operand but never loaded by the asm
+    uint64_t rs_b   = rs_b0;           // passed as an asm operand but never loaded by the asm
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;           // passed as an asm operand but never loaded by the asm
+
+    if ( m_iter == 0 ) goto consider_edge_cases;  // fewer than 3 rows: skip the asm entirely
+
+    // -------------------------------------------------------------------------
+    // NOTE: the asm below addresses memory as if cs_a == rs_b == cs_c == 1 (contiguous k in a and b, contiguous columns in c).
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)                   // load address of b
+
+    lea(mem(   , r15, 1), rsi)         // rsi = jj;
+    imul(imm(1*4), rsi)                // rsi *= 1*sizeof(float), i.e. cs_c is taken as 1
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + jj*cs_c (byte offset);
+
+    lea(mem(   , r15, 1), rsi)         // rsi = jj;
+    imul(r11, rsi)                     // rsi *= cs_b (already in bytes);
+    lea(mem(rdx, rsi, 1), rdx)         // rdx = b + jj*cs_b (byte offset);
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)         // zero the 12 accumulators ymm4..ymm15
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)     // ymm0 = 8 floats of a, row ii+0
+    vmovups(mem(rax, r8, 1), ymm1)     // ymm1 = 8 floats of a, row ii+1
+    vmovups(mem(rax, r8, 2), ymm2)     // ymm2 = 8 floats of a, row ii+2
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)    // ymm3 = 8 floats of b, column jj+0
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)    // ymm3 = 8 floats of b, column jj+1
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)    // ymm3 = 8 floats of b, column jj+2
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)    // ymm3 = 8 floats of b, column jj+3
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 3
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*rs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*rs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats (8*4 bytes);
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (8*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: The FMAs below must use ymm registers because
+                                       // an xmm-destination form would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)      // one float of a, row ii+0 (rest of the register is zeroed)
+    vmovss(mem(rax, r8, 1), xmm1)      // one float of a, row ii+1
+    vmovss(mem(rax, r8, 2), xmm2)      // one float of a, row ii+2
+    add(imm(1*4), rax)                 // a += 1 float (1*4 bytes);
+
+    vmovss(mem(rbx        ), xmm3)     // one float of b, column jj+0
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)     // one float of b, column jj+1
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)     // one float of b, column jj+2
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)     // one float of b, column jj+3
+    add(imm(1*4), rbx)                 // b += 1 float (1*4 bytes);
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0 = [ two partial sums of ymm4 | two partial sums of ymm7 ]
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2 = [ two partial sums of ymm10 | two partial sums of ymm13 ]
+
+    vhaddps(xmm2,xmm0,xmm4)            // xmm4 = [ sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13) ]
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0 = [ two partial sums of ymm5 | two partial sums of ymm8 ]
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2 = [ two partial sums of ymm11 | two partial sums of ymm14 ]
+
+    vhaddps(xmm2,xmm0,xmm5)            // xmm5 = [ sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14) ]
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0 = [ two partial sums of ymm6 | two partial sums of ymm9 ]
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2 = [ two partial sums of ymm12 | two partial sums of ymm15 ]
+
+    vhaddps(xmm2,xmm0,xmm6)            // xmm6 = [ sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15) ]
+                                       // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)   (row ii+0 of the result)
+                                       // xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)   (row ii+1 of the result)
+                                       // xmm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)   (row ii+2 of the result)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0. NOTE(review): this is a double-precision compare of the low 64 bits (two copies of the float beta); vucomiss looks intended - confirm.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)  // xmm4 += beta * (4 floats of c, row ii+0)
+    vmovups(xmm4, mem(rcx))            // store row ii+0
+    add(rdi, rcx)                      // rcx += rs_c
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)  // xmm5 += beta * (4 floats of c, row ii+1)
+    vmovups(xmm5, mem(rcx))            // store row ii+1
+    add(rdi, rcx)                      // rcx += rs_c
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)  // xmm6 += beta * (4 floats of c, row ii+2)
+    vmovups(xmm6, mem(rcx))            // store row ii+2
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))            // store row ii+0 without reading c
+    add(rdi, rcx)                      // rcx += rs_c
+
+    vmovups(xmm5, mem(rcx))            // store row ii+1 without reading c
+    add(rdi, rcx)                      // rcx += rs_c
+
+    vmovups(xmm6, mem(rcx))            // store row ii+2 without reading c
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(4), r15)                   // compare jj to 4
+    jl(.SLOOP3X4J)                    // if jj < 4, jump to beginning of jj loop;
+                                       // jj is 4 after the first pass, so the loop ends here.
+    label(.SRETURN)
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 4;                         // panel width passed to the edge kernels
+        const dim_t      i_edge = m0 - ( dim_t )m_left;      // index of the first leftover row
+
+        float* restrict cij = c + i_edge*rs_c;               // first leftover row of c
+        float* restrict bj  = b;                             // b is reused from its start
+        float* restrict ai  = a + i_edge*rs_a;               // first leftover row of a
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x4
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x4
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_6x2m
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 3;
+    uint64_t m_left = m0 % 3;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+ 
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), rdx)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+    lea(mem(r8,  r8,  2), r10)         // r10 = 3*rs_a
+
+    // r12 = rcx = c
+    // r14 = rax = a
+    // rdx = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r15)                   // jj = 0;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ 0 1 ... ]
+
+
+
+    mov(var(a), r14)                   // load address of a
+    mov(var(c), r12)                   // load address of c
+    mov(var(b), rdx)
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(imm(1*4), rsi)                // rsi *= cs_c = 1*8
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 4*jj*cs_c;
+
+    lea(mem(   , r15, 1), rsi)         // rsi = r15 = 4*jj;
+    imul(r11, rsi)                     // rsi *= cs_b;
+    lea(mem(rdx, rsi, 1), rdx)         // rbx = b + 4*jj*cs_b;
+
+
+
+    mov(var(m_iter), r9)               // ii = m_iter;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(r14), rax)                 // rax = a_ii;
+    lea(mem(rdx), rbx)                 // rbx = b_jj;
+
+    lea(mem(r8,  r8,  4), rdi)         // rdi = 5*rs_a
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+
+    // ---------------------------------- iteration 1
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    // ---------------------------------- iteration 3
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a
+    prefetch(0, mem(rax, r8,  4, 0*8)) // prefetch rax + 4*cs_a
+    prefetch(0, mem(rax, rdi, 1, 0*8)) // prefetch rax + 5*cs_a
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*4;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*4;
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  
+                                       // ymm5  ymm8 
+                                       // ymm6  ymm9 
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+    vhaddps(xmm0,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+    vhaddps(xmm0,xmm0,xmm5)
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+    vhaddps(xmm0,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7)
+                                       // ymm5 = sum(ymm5) sum(ymm8)
+                                       // ymm6 = sum(ymm6) sum(ymm9)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), xmm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), xmm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
+                                           // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vmovsd(mem(rcx), xmm0)////a0a1
+    vfmadd231ps(xmm0, xmm3, xmm4)//c*beta+(a0a1)
+    vmovsd(xmm4, mem(rcx))//a0a1
+    add(rdi, rcx)
+    vmovsd(mem(rcx), xmm0)
+    vfmadd231ps(xmm0, xmm3, xmm5)
+    vmovsd(xmm5, mem(rcx))
+    add(rdi, rcx)
+    vmovsd(mem(rcx), xmm0)
+    vfmadd231ps(xmm0, xmm3, xmm6)
+    vmovsd(xmm6, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+    label(.SROWSTORBZ)
+
+    vmovsd(xmm4, mem(rcx))
+    add(rdi, rcx)
+    vmovsd(xmm5, mem(rcx))
+    add(rdi, rcx)
+    vmovsd(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    lea(mem(r12, rdi, 2), r12)         //
+    lea(mem(r12, rdi, 1), r12)         // c_ii = r12 += 3*rs_c
+
+    lea(mem(r14, r8,  2), r14)         //
+    lea(mem(r14, r8,  1), r14)         // a_ii = r14 += 3*rs_a
+
+    dec(r9)                            // ii -= 1;
+    jne(.SLOOP3X4I)                    // iterate again if ii != 0.
+
+    add(imm(4), r15)                   // jj += 4;
+    cmp(imm(4), r15)                   // compare jj to 4
+    jl(.SLOOP3X4J)                    // if jj <= 4, jump to beginning
+                                       // of jj loop; otherwise, loop ends.
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [m_iter] "m" (m_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 2;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            //cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
diff --git a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
index 7f0c856130..dfe5ca28af 100644
--- a/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
+++ b/kernels/zen/3/sup/other/bli_gemmsup_rd_zen_asm_s6x16n.c
@@ -1,1869 +1,1869 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "blis.h"
-
-#define BLIS_ASM_SYNTAX_ATT
-#include "bli_x86_asm_macros.h"
-
-/*
-   rrc:
-     --------        ------        | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------   +=   ------ ...    | | | | | | | |
-     --------        ------        | | | | | | | |
-     --------        ------              :
-     --------        ------              :
-
-   Assumptions:
-   - C is row-stored and B is column-stored;
-   - A is row-stored;
-   - m0 and n0 are at most MR and NR, respectively.
-   Therefore, this (r)ow-preferential microkernel is well-suited for
-   a dot-product-based accumulation that performs vector loads from
-   both A and B.
-
-   NOTE: These kernels implicitly support column-oriented IO, implemented
-   via an a high-level transposition of the entire operation. A and B will
-   effectively remain row- and column-stored, respectively, but C will then
-   effectively appear column-stored. Thus, this kernel may be used for both
-   rrc and crc cases.
-*/
-
-void bli_sgemmsup_rd_zen_asm_6x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2020, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/*
+   rrc:
+     --------        ------        | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------   +=   ------ ...    | | | | | | | |
+     --------        ------        | | | | | | | |
+     --------        ------              :
+     --------        ------              :
+
+   Assumptions:
+   - C is row-stored and B is column-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR and NR, respectively.
+   Therefore, this (r)ow-preferential microkernel is well-suited for
+   a dot-product-based accumulation that performs vector loads from
+   both A and B.
+
+   NOTE: These kernels implicitly support column-oriented IO, implemented
+   via a high-level transposition of the entire operation. A and B will
+   effectively remain row- and column-stored, respectively, but C will then
+   effectively appear column-stored. Thus, this kernel may be used for both
+   rrc and crc cases.
+*/
+
+void bli_sgemmsup_rd_zen_asm_6x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
 {
-    uint64_t m_left = m0 % 6;
-
-    // First check whether this is a edge case in the n dimension. If so,
-    // dispatch other ?x8m kernels, as needed.
-    if ( m_left )
-    {
-        float* restrict cij = c;
-        float* restrict bj  = b;
-        float* restrict ai  = a;
-
-        // We add special handling for slightly inflated MR blocksizes
-        // at edge cases, up to a maximum of 9.
-        if ( 6 < m0 )
-        {
-            sgemmsup_ker_ft ker_fp1 = NULL;
-            sgemmsup_ker_ft ker_fp2 = NULL;
-            dim_t           mr1, mr2;
-
-            if ( m0 == 7 )
-            {
-                mr1 = 6; mr2 = 1;
-                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
-                ker_fp2 = bli_sgemmsup_rd_zen_asm_1x16n;
-            }
-            else if ( m0 == 8 )
-            {
-                mr1 = 6; mr2 = 2;
-                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
-                ker_fp2 = bli_sgemmsup_rd_zen_asm_2x16n;
-            }
-            else // if ( m0 == 9 )
-            {
-                mr1 = 6; mr2 = 3;
-                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
-                ker_fp2 = bli_sgemmsup_rd_zen_asm_3x16n;
-            }
-
-            ker_fp1
-            (
-              conja, conjb, mr1, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += mr1*rs_c0; ai += mr1*rs_a0;
-
-            ker_fp2
-            (
-              conja, conjb, mr2, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-
-            return;
-        }
-
-        if ( 3 <= m_left )
-        {
-            const dim_t mr_cur = 3;
-
-            bli_sgemmsup_rd_zen_asm_3x16n
-            (
-              conja, conjb, mr_cur, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 2 <= m_left )
-        {
-            const dim_t mr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x16n
-            (
-              conja, conjb, mr_cur, n0, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
-        }
-        if ( 1 == m_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_TRANSPOSE, conja, k0, n0,
-              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
-              beta, cij, cs_c0, cntx, NULL
-            );
-        }
-        return;
-    }
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = m dim index ii
-    // r15 = n dim index jj
-
-    mov(imm(0), r9)                    // ii = 0;
-
-    label(.SLOOP3X4I)                  // LOOP OVER ii = [ 0 1 ... ]
-
-    mov(var(b), r14)                   // load address of b
-    mov(var(c), r12)                   // load address of c
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-    imul(rdi, rsi)                     // rsi *= rs_c
-    lea(mem(r12, rsi, 1), r12)         // r12 = c + 3*ii*rs_c;
-
-    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
-    imul(r8,  rsi)                     // rsi *= rs_a;
-    lea(mem(rdx, rsi, 1), rdx)         // rax = a + 3*ii*rs_a;
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-                                       // now avoid loading C if beta == 0
-
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    add(imm(3), r9)                    // ii += 3;
-    cmp(imm(3), r9)                    // compare ii to 3
-    jle(.SLOOP3X4I)                    // if ii <= 3, jump to beginning
-                                       // of ii loop; otherwise, loop ends.
-
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 6;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_6x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_3x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-
-    mov(var(b), r14)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    mov(var(c), r12)                   // load address of c
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = unused
-    // r15 = n dim index jj
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm6,  ymm6,  ymm6)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm9,  ymm9,  ymm9)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm12, ymm12, ymm12)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-    vxorps(ymm15, ymm15, ymm15)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    vmovups(mem(rax, r8, 2), ymm2)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    vmovss(mem(rax, r8, 2), xmm2)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-    vfmadd231ps(ymm2, ymm3, ymm6)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-    vfmadd231ps(ymm2, ymm3, ymm9)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-    vfmadd231ps(ymm2, ymm3, ymm12)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-    vfmadd231ps(ymm2, ymm3, ymm15)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-
-    vhaddps( ymm9, ymm6, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm15, ymm12, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm6)
-
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
+    uint64_t m_left = m0 % 6;
+
+    // First check whether this is an edge case in the m dimension. If so,
+    // dispatch other ?x16n kernels, as needed.
+    if ( m_left )
+    {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        // We add special handling for slightly inflated MR blocksizes
+        // at edge cases, up to a maximum of 9.
+        if ( 6 < m0 )
+        {
+            sgemmsup_ker_ft ker_fp1 = NULL;
+            sgemmsup_ker_ft ker_fp2 = NULL;
+            dim_t           mr1, mr2;
+
+            if ( m0 == 7 )
+            {
+                mr1 = 6; mr2 = 1;
+                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
+                ker_fp2 = bli_sgemmsup_rd_zen_asm_1x16n;
+            }
+            else if ( m0 == 8 )
+            {
+                mr1 = 6; mr2 = 2;
+                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
+                ker_fp2 = bli_sgemmsup_rd_zen_asm_2x16n;
+            }
+            else // if ( m0 == 9 )
+            {
+                mr1 = 6; mr2 = 3;
+                ker_fp1 = bli_sgemmsup_rd_zen_asm_6x16n;
+                ker_fp2 = bli_sgemmsup_rd_zen_asm_3x16n;
+            }
+
+            ker_fp1
+            (
+              conja, conjb, mr1, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr1*rs_c0; ai += mr1*rs_a0;
+
+            ker_fp2
+            (
+              conja, conjb, mr2, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+
+            return;
+        }
+
+        if ( 3 <= m_left )
+        {
+            const dim_t mr_cur = 3;
+
+            bli_sgemmsup_rd_zen_asm_3x16n
+            (
+              conja, conjb, mr_cur, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x16n
+            (
+              conja, conjb, mr_cur, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur;
+        }
+        if ( 1 == m_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_TRANSPOSE, conja, k0, n0,
+              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+              beta, cij, cs_c0, cntx, NULL
+            );
+        }
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = m dim index ii
+    // r15 = n dim index jj
+
+    mov(imm(0), r9)                    // ii = 0;
+
+    label(.SLOOP3X4I)                  // LOOP OVER ii = [ 0 1 ... ]
+
+    mov(var(b), r14)                   // load address of b
+    mov(var(c), r12)                   // load address of c
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
+    imul(rdi, rsi)                     // rsi *= rs_c
+    lea(mem(r12, rsi, 1), r12)         // r12 = c + 3*ii*rs_c;
+
+    lea(mem(   , r9,  1), rsi)         // rsi = r9 = 3*ii;
+    imul(r8,  rsi)                     // rsi *= rs_a;
+    lea(mem(rdx, rsi, 1), rdx)         // rdx = a + 3*ii*rs_a;
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
+    add(imm(16*8), r10)                 // r10 += 32 floats = 16*8 bytes;
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats = 8*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here
+                                       // because using the xmm registers would zero
+                                       // out the high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1 float = 1*4 bytes;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1 float = 1*4 bytes;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
                                        // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm6)
-    vmovups(xmm6, mem(rcx))
-
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm6, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    label(.SRETURN)
-
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 3;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_3x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_2x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-    mov(var(rs_a), r8)                 // load rs_a
-    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
-
-    mov(var(b), r14)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
-    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    mov(var(c), r12)                   // load address of c
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = unused
-    // r15 = n dim index jj
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm5,  ymm5,  ymm5)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm8,  ymm8,  ymm8)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm11, ymm11, ymm11)
-    vxorps(ymm13, ymm13, ymm13)
-    vxorps(ymm14, ymm14, ymm14)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
-    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    vmovups(mem(rax, r8, 1), ymm1)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    vmovss(mem(rax, r8, 1), xmm1)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-    vfmadd231ps(ymm1, ymm3, ymm5)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-    vfmadd231ps(ymm1, ymm3, ymm8)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-    vfmadd231ps(ymm1, ymm3, ymm11)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-    vfmadd231ps(ymm1, ymm3, ymm14)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-    vhaddps( ymm8, ymm5, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )
-
-    vhaddps( ymm14, ymm11, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )
-
-    vhaddps(xmm2,xmm0,xmm5)
-
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
-    vmulps(xmm0, xmm5, xmm5)
-    vmulps(xmm0, xmm6, xmm6)
-
-                                       // now avoid loading C if beta == 0
-
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm5)
-    vmovups(xmm5, mem(rcx))
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-    add(rdi, rcx)
-
-    vmovups(xmm5, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 2;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_2x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sgemv_ex
-            (
-              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
-              beta, cij, rs_c0, cntx, NULL
-            );
-        }
-    }
-}
-
-void bli_sgemmsup_rd_zen_asm_1x16n
-     (
-       conj_t              conja,
-       conj_t              conjb,
-       dim_t               m0,
-       dim_t               n0,
-       dim_t               k0,
-       float*    restrict alpha,
-       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
-       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
-       float*    restrict beta,
-       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
-       auxinfo_t* restrict data,
-       cntx_t*    restrict cntx
-     )
-{
-    //void*    a_next = bli_auxinfo_next_a( data );
-    //void*    b_next = bli_auxinfo_next_b( data );
-
-    // Typecast local copies of integers in case dim_t and inc_t are a
-    // different size than is expected by load instructions.
-    uint64_t k_iter32 = k0 / 32;
-    uint64_t k_left32 = k0 % 32;
-    uint64_t k_iter8  = k_left32 / 8;
-    uint64_t k_left1  = k_left32 % 8;
-
-    uint64_t n_iter = n0 / 4;
-    uint64_t n_left = n0 % 4;
-
-    uint64_t rs_a   = rs_a0;
-    uint64_t cs_a   = cs_a0;
-    uint64_t rs_b   = rs_b0;
-    uint64_t cs_b   = cs_b0;
-    uint64_t rs_c   = rs_c0;
-    uint64_t cs_c   = cs_c0;
-
-    if ( n_iter == 0 ) goto consider_edge_cases;
-
-    // -------------------------------------------------------------------------
-    begin_asm()
-
-    mov(var(a), rdx)                   // load address of a.
-
-    mov(var(b), r14)                   // load address of b.
-    mov(var(cs_b), r11)                // load cs_b
-    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
-
+
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
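+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0. NOTE(review): vucomisd is a double-precision compare but xmm3 is used as packed floats below; confirm vucomiss is not intended.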
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    add(imm(3), r9)                    // ii += 3;
+    cmp(imm(3), r9)                    // compare ii to 3
+    jle(.SLOOP3X4I)                    // if ii <= 3, jump to beginning
+                                       // of ii loop; otherwise, loop ends.
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension (n_left columns), if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 6;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_6x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_3x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+
+    mov(var(b), r14)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
     lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
-
-    mov(var(c), r12)                   // load address of c
-
-    // r12 = rcx = c
-    // rdx = rax = a
-    // r14 = rbx = b
-    // r9  = unused
-    // r15 = n dim index jj
-
-    mov(var(n_iter), r15)              // jj = n_iter;
-
-    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
-
-                                       // zen2 can execute 4 vxorpd ipc with
-                                       // a latency of 1 cycle
-
-    vxorps(ymm4,  ymm4,  ymm4)
-    vxorps(ymm7,  ymm7,  ymm7)
-    vxorps(ymm10, ymm10, ymm10)
-    vxorps(ymm13, ymm13, ymm13)
-
-    lea(mem(r12), rcx)                 // rcx = c_iijj;
-    lea(mem(rdx), rax)                 // rax = a_ii;
-    lea(mem(r14), rbx)                 // rbx = b_jj;
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
-    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
-
-    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+
+    mov(var(c), r12)                   // load address of c
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = unused
+    // r15 = n dim index jj
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm6,  ymm6,  ymm6)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm9,  ymm9,  ymm9)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm12, ymm12, ymm12)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+    vxorps(ymm15, ymm15, ymm15)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+    prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
     lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
-
-
-    mov(var(k_iter32), rsi)            // i = k_iter32;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
-                                       // contains the k_iter8 loop.
-
-    label(.SLOOPKITER32)               // MAIN LOOP
-
-    // ---------------------------------- iteration 0
-    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
-    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    // ---------------------------------- iteration 1
-    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
-    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    // ---------------------------------- iteration 2
-    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    // ---------------------------------- iteration 3
-    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
-    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
-    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER32)                 // iterate again if i != 0.
-
-    label(.SCONSIDKITER8)
-
-    mov(var(k_iter8), rsi)             // i = k_iter8;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
-                                       // considers k_left1 loop.
-                                       // else, we prepare to enter k_iter8 loop.
-
-    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
-
-    vmovups(mem(rax       ), ymm0)
-    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
-
-    vmovups(mem(rbx        ), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovups(mem(rbx, r11, 1), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovups(mem(rbx, r11, 2), ymm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovups(mem(rbx, r13, 1), ymm3)
-    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKITER8)                  // iterate again if i != 0.
-
-    label(.SCONSIDKLEFT1)
-
-    mov(var(k_left1), rsi)             // i = k_left1;
-    test(rsi, rsi)                     // check i via logical AND.
-    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
-                                       // else, we prepare to enter k_left1 loop.
-
-    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
-                                       // NOTE: We must use ymm registers here bc
-                                       // using the xmm registers would zero out the
-                                       // high bits of the destination registers,
-                                       // which would destory intermediate results.
-
-    vmovss(mem(rax       ), xmm0)
-    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
-
-    vmovss(mem(rbx        ), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm4)
-
-    vmovss(mem(rbx, r11, 1), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm7)
-
-    vmovss(mem(rbx, r11, 2), xmm3)
-    vfmadd231ps(ymm0, ymm3, ymm10)
-
-    vmovss(mem(rbx, r13, 1), xmm3)
-    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
-    vfmadd231ps(ymm0, ymm3, ymm13)
-
-
-    dec(rsi)                           // i -= 1;
-    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
-
-    label(.SPOSTACCUM)
-                                       // ymm4  ymm7  ymm10 ymm13  
-                                       // ymm5  ymm8  ymm11 ymm14
-                                       // ymm6  ymm9  ymm12 ymm15
-
-    vhaddps( ymm7, ymm4, ymm0 )
-    vextractf128(imm(1), ymm0, xmm1 )
-    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
-
-    vhaddps( ymm13, ymm10, ymm2 )
-    vextractf128(imm(1), ymm2, xmm1 )
-    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
-
-    vhaddps(xmm2,xmm0,xmm4)
-
-                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
-
-    mov(var(rs_c), rdi)                // load rs_c
-    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
-
-    mov(var(alpha), rax)               // load address of alpha
-    mov(var(beta), rbx)                // load address of beta
-    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
-    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
-
-    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
 
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 8 floats = 8*4 bytes;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
+    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    vmovups(mem(rax, r8, 2), ymm2)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destory intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    vmovss(mem(rax, r8, 1), xmm1)
+    vmovss(mem(rax, r8, 2), xmm2)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+    vfmadd231ps(ymm2, ymm3, ymm6)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+    vfmadd231ps(ymm2, ymm3, ymm9)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+    vfmadd231ps(ymm2, ymm3, ymm12)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+    vfmadd231ps(ymm2, ymm3, ymm15)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)
+
+
+    vhaddps( ymm9, ymm6, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm15, ymm12, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm6)
+
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // ymm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+                                       // ymm6 = sum(ymm6) sum(ymm9) sum(ymm12) sum(ymm15)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+    vmulps(xmm0, xmm6, xmm6)
                                        // now avoid loading C if beta == 0
-    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
-    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
-    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
-
-    label(.SROWSTORED)
-
-    vfmadd231ps(mem(rcx), xmm3, xmm4)
-    vmovups(xmm4, mem(rcx))
-
-    jmp(.SDONE)                        // jump to end.
-
-    label(.SBETAZERO)
-
-    label(.SROWSTORBZ)
-
-    vmovups(xmm4, mem(rcx))
-
-    label(.SDONE)
-
-    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
-
-    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
-
-    dec(r15)                           // jj -= 1;
-    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
-
-    label(.SRETURN)
-
-    end_asm(
-    : // output operands (none)
-    : // input operands
-      [n_iter] "m" (n_iter),
-      [k_iter32] "m" (k_iter32),
-      [k_iter8] "m" (k_iter8),
-      [k_left1] "m" (k_left1),
-      [a]      "m" (a),
-      [rs_a]   "m" (rs_a),
-      [cs_a]   "m" (cs_a),
-      [b]      "m" (b),
-      [rs_b]   "m" (rs_b),
-      [cs_b]   "m" (cs_b),
-      [alpha]  "m" (alpha),
-      [beta]   "m" (beta),
-      [c]      "m" (c),
-      [rs_c]   "m" (rs_c),
-      [cs_c]   "m" (cs_c)/*,
-      [a_next] "m" (a_next),
-      [b_next] "m" (b_next)*/
-    : // register clobber list
-      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
-      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
-      "xmm0", "xmm1", "xmm2", "xmm3",
-      "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11",
-      "xmm12", "xmm13", "xmm14", "xmm15",
-      "memory"
-    )
-
-    consider_edge_cases:
-
-    // Handle edge cases in the m dimension, if they exist.
-    if ( n_left )
-    {
-        const dim_t      mr_cur = 1;
-        const dim_t      j_edge = n0 - ( dim_t )n_left;
-
-        float* restrict cij = c + j_edge*cs_c;
-        float* restrict ai  = a;
-        float* restrict bj  = b + j_edge*cs_b;
-
-        if ( 2 <= n_left )
-        {
-            const dim_t nr_cur = 2;
-
-            bli_sgemmsup_rd_zen_asm_1x2
-            (
-              conja, conjb, mr_cur, nr_cur, k0,
-              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
-              beta, cij, rs_c0, cs_c0, data, cntx
-            );
-            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
-        }
-        if ( 1 == n_left )
-        {
-            bli_sdotxv_ex
-            (
-              conja, conjb, k0,
-              alpha, ai, cs_a0, bj, rs_b0,
-              beta, cij, cntx, NULL
-            );
-        }
-    }
-}
-
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm6)
+    vmovups(xmm6, mem(rcx))
+
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm5, mem(rcx))
+    add(rdi, rcx)
+
+    vmovups(xmm6, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    label(.SRETURN)
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 3;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_3x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_2x16n     // C := beta*C + alpha*A*B for a 2-row panel of C
+     (
+       conj_t              conja,      // not read by the asm; forwarded to the 2x2 edge kernel
+       conj_t              conjb,      // not read by the asm; forwarded to both edge-case calls
+       dim_t               m0,         // not read: this kernel always handles exactly 2 rows
+       dim_t               n0,         // number of columns of C (and of B) to compute
+       dim_t               k0,         // length of each dot product (shared dim of A and B)
+       float*    restrict alpha,       // pointer to the scalar applied to A*B
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0, // asm reads rs_a only; k is walked contiguously
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0, // asm reads cs_b only; k is walked contiguously
+       float*    restrict beta,        // pointer to the scalar applied to the existing C
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0, // asm stores 4 contiguous floats per row of C
+       auxinfo_t* restrict data,       // forwarded to the 2x2 edge kernel
+       cntx_t*    restrict cntx        // forwarded to both edge-case calls
+     )
+{
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;       // passes of the main loop (4 unrolled steps of 8 floats)
+    uint64_t k_left32 = k0 % 32;       // k elements left after the main loop
+    uint64_t k_iter8  = k_left32 / 8;  // passes of the 8-wide (ymm) edge loop
+    uint64_t k_left1  = k_left32 % 8;  // passes of the scalar edge loop
+
+    uint64_t n_iter = n0 / 4;          // number of 4-column panels handled by the asm
+    uint64_t n_left = n0 % 4;          // remaining columns, handled after the asm
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases; // fewer than 4 columns: skip the asm entirely
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+    mov(var(rs_a), r8)                 // load rs_a
+    lea(mem(, r8, 4), r8)              // rs_a *= sizeof(float)
+
+    mov(var(b), r14)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+
+    mov(var(c), r12)                   // load address of c
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = unused
+    // r15 = n dim index jj
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)         // zero the 2x4 grid of accumulators
+    vxorps(ymm5,  ymm5,  ymm5)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm8,  ymm8,  ymm8)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm11, ymm11, ymm11)
+    vxorps(ymm13, ymm13, ymm13)
+    vxorps(ymm14, ymm14, ymm14)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b (not read below; r13 is used instead)
+    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch r10 + 0*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch r10 + 1*cs_b
+
+    vmovups(mem(rax       ), ymm0)     // 8 floats of row 0 of a
+    vmovups(mem(rax, r8, 1), ymm1)     // 8 floats of row 1 of a
+    add(imm(8*4), rax)                 // a += 8 floats (32 bytes) along k
+
+    vmovups(mem(rbx        ), ymm3)    // 8 floats of column 0 of b
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)    // 8 floats of column 1 of b
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)    // 8 floats of column 2 of b
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)    // 8 floats of column 3 of b
+    add(imm(8*4), rbx)                 // b += 8 floats (32 bytes) along k
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch r10 + 2*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch r10 + 3*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (32 bytes) along k
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (32 bytes) along k
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch r10 + 0*cs_b + 64 bytes
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch r10 + 1*cs_b + 64 bytes
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (32 bytes) along k
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (32 bytes) along k
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch r10 + 2*cs_b + 64 bytes
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch r10 + 3*cs_b + 64 bytes
+    add(imm(16*8), r10)                 // r10 += 128 bytes (the 32 floats of one main-loop pass)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (32 bytes) along k
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (32 bytes) along k
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    vmovups(mem(rax, r8, 1), ymm1)
+    add(imm(8*4), rax)                 // a += 8 floats (32 bytes) along k
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 8 floats (32 bytes) along k
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)      // one float of row 0 of a (upper lanes zeroed)
+    vmovss(mem(rax, r8, 1), xmm1)      // one float of row 1 of a (upper lanes zeroed)
+    add(imm(1*4), rax)                 // a += 1 float (4 bytes) along k
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+    vfmadd231ps(ymm1, ymm3, ymm5)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+    vfmadd231ps(ymm1, ymm3, ymm8)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+    vfmadd231ps(ymm1, ymm3, ymm11)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1 float (4 bytes) along k
+    vfmadd231ps(ymm0, ymm3, ymm13)
+    vfmadd231ps(ymm1, ymm3, ymm14)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // accumulators (row of a x column of b):
+                                       // ymm4  ymm7  ymm10 ymm13
+                                       // ymm5  ymm8  ymm11 ymm14
+
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0]+xmm0[1] = sum(ymm4); xmm0[2]+xmm0[3] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0]+xmm2[1] = sum(ymm10); xmm2[2]+xmm2[3] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)            // finish the pairwise sums for row 0
+
+    vhaddps( ymm8, ymm5, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )
+
+    vhaddps( ymm14, ymm11, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )
+
+    vhaddps(xmm2,xmm0,xmm5)            // finish the pairwise sums for row 1
+
+                                       // xmm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+                                       // xmm5 = sum(ymm5) sum(ymm8) sum(ymm11) sum(ymm14)
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+    vmulps(xmm0, xmm5, xmm5)
+                                       // only two rows to scale: ymm6 is not an accumulator here
+
+                                       // now avoid loading C if beta == 0
+
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0 (tests the low 64 bits of xmm3).
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)  // xmm4 += beta * (4 floats of row 0 of c)
+    vmovups(xmm4, mem(rcx))
+    add(rdi, rcx)                      // advance to row 1 of c
+
+    vfmadd231ps(mem(rcx), xmm3, xmm5)  // xmm5 += beta * (4 floats of row 1 of c)
+    vmovups(xmm5, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))            // store row 0 without reading c
+    add(rdi, rcx)                      // advance to row 1 of c
+
+    vmovups(xmm5, mem(rcx))            // store row 1 without reading c
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4 floats (16 bytes)
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension (n0 % 4 leftover columns), if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 2;
+        const dim_t      j_edge = n0 - ( dim_t )n_left; // index of the first leftover column
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )             // one column left: matrix-vector product on the 2 x k0 block of a
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_1x16n
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*    restrict alpha,
+       float*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       float*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       float*    restrict beta,
+       float*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    //void*    a_next = bli_auxinfo_next_a( data );
+    //void*    b_next = bli_auxinfo_next_b( data );
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter32 = k0 / 32;
+    uint64_t k_left32 = k0 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rdx)                   // load address of a.
+
+    mov(var(b), r14)                   // load address of b.
+    mov(var(cs_b), r11)                // load cs_b
+    lea(mem(, r11, 4), r11)            // cs_b *= sizeof(float)
+
+    lea(mem(r11, r11, 2), r13)         // r13 = 3*cs_b
+
+    mov(var(c), r12)                   // load address of c
+
+    // r12 = rcx = c
+    // rdx = rax = a
+    // r14 = rbx = b
+    // r9  = unused
+    // r15 = n dim index jj
+
+    mov(var(n_iter), r15)              // jj = n_iter;
+
+    label(.SLOOP3X4J)                  // LOOP OVER jj = [ n_iter ... 1 0 ]
+
+                                       // zen2 can execute 4 vxorpd ipc with
+                                       // a latency of 1 cycle
+
+    vxorps(ymm4,  ymm4,  ymm4)
+    vxorps(ymm7,  ymm7,  ymm7)
+    vxorps(ymm10, ymm10, ymm10)
+    vxorps(ymm13, ymm13, ymm13)
+
+    lea(mem(r12), rcx)                 // rcx = c_iijj;
+    lea(mem(rdx), rax)                 // rax = a_ii;
+    lea(mem(r14), rbx)                 // rbx = b_jj;
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+    prefetch(0, mem(rcx, 3*8))         // prefetch c + 0*rs_c
+    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c
+
+    lea(mem(r11, r11, 2), rdi)         // rdi = 3*cs_b
+    lea(mem(rbx, r11, 4), r10)         // r10 = rbx + 4*cs_b
+
+
+    mov(var(k_iter32), rsi)            // i = k_iter32;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKITER8)                 // if i == 0, jump to code that
+                                       // contains the k_iter8 loop.
+
+    label(.SLOOPKITER32)               // MAIN LOOP
+
+    // ---------------------------------- iteration 0
+    prefetch(0, mem(r10,         0*8)) // prefetch rbx + 4*cs_b
+    prefetch(0, mem(r10, r11, 1, 0*8)) // prefetch rbx + 5*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    // ---------------------------------- iteration 1
+    prefetch(0, mem(r10, r11, 2, 0*8)) // prefetch rbx + 6*cs_b
+    prefetch(0, mem(r10, r13, 1, 0*8)) // prefetch rbx + 7*cs_b
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    // ---------------------------------- iteration 2
+    prefetch(0, mem(r10,         8*8)) // prefetch rbx + 4*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    // ---------------------------------- iteration 3
+    prefetch(0, mem(r10, r11, 2, 8*8)) // prefetch rbx + 6*cs_b + 8*rs_b
+    prefetch(0, mem(r10, r13, 1, 8*8)) // prefetch rbx + 7*cs_b + 8*rs_b
+    add(imm(16*8), r10)                 // r10 += 8*rs_b = 8*8;
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER32)                 // iterate again if i != 0.
+
+    label(.SCONSIDKITER8)
+
+    mov(var(k_iter8), rsi)             // i = k_iter8;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SCONSIDKLEFT1)                 // if i == 0, jump to code that
+                                       // considers k_left1 loop.
+                                       // else, we prepare to enter k_iter8 loop.
+
+    label(.SLOOPKITER8)                // EDGE LOOP (ymm)
+
+    vmovups(mem(rax       ), ymm0)
+    add(imm(8*4), rax)                 // a += 4*cs_b = 4*8;
+
+    vmovups(mem(rbx        ), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovups(mem(rbx, r11, 1), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovups(mem(rbx, r11, 2), ymm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovups(mem(rbx, r13, 1), ymm3)
+    add(imm(8*4), rbx)                 // b += 4*rs_b = 4*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKITER8)                  // iterate again if i != 0.
+
+    label(.SCONSIDKLEFT1)
+
+    mov(var(k_left1), rsi)             // i = k_left1;
+    test(rsi, rsi)                     // check i via logical AND.
+    je(.SPOSTACCUM)                    // if i == 0, we're done; jump to end.
+                                       // else, we prepare to enter k_left1 loop.
+
+    label(.SLOOPKLEFT1)                // EDGE LOOP (scalar)
+                                       // NOTE: We must use ymm registers here bc
+                                       // using the xmm registers would zero out the
+                                       // high bits of the destination registers,
+                                       // which would destroy intermediate results.
+
+    vmovss(mem(rax       ), xmm0)
+    add(imm(1*4), rax)                 // a += 1*cs_b = 1*8;
+
+    vmovss(mem(rbx        ), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm4)
+
+    vmovss(mem(rbx, r11, 1), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm7)
+
+    vmovss(mem(rbx, r11, 2), xmm3)
+    vfmadd231ps(ymm0, ymm3, ymm10)
+
+    vmovss(mem(rbx, r13, 1), xmm3)
+    add(imm(1*4), rbx)                 // b += 1*rs_b = 1*8;
+    vfmadd231ps(ymm0, ymm3, ymm13)
+
+
+    dec(rsi)                           // i -= 1;
+    jne(.SLOOPKLEFT1)                  // iterate again if i != 0.
+
+    label(.SPOSTACCUM)
+                                       // ymm4  ymm7  ymm10 ymm13  
+                                       // ymm5  ymm8  ymm11 ymm14
+                                       // ymm6  ymm9  ymm12 ymm15
+
+    vhaddps( ymm7, ymm4, ymm0 )
+    vextractf128(imm(1), ymm0, xmm1 )
+    vaddps( xmm0, xmm1, xmm0 )         // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm7)
+
+    vhaddps( ymm13, ymm10, ymm2 )
+    vextractf128(imm(1), ymm2, xmm1 )
+    vaddps( xmm2, xmm1, xmm2 )         // xmm2[0] = sum(ymm10); xmm2[1] = sum(ymm13)
+
+    vhaddps(xmm2,xmm0,xmm4)
+
+                                       // ymm4 = sum(ymm4) sum(ymm7) sum(ymm10) sum(ymm13)
+
+    mov(var(rs_c), rdi)                // load rs_c
+    lea(mem(, rdi, 4), rdi)            // rs_c *= sizeof(float)
+
+    mov(var(alpha), rax)               // load address of alpha
+    mov(var(beta), rbx)                // load address of beta
+    vbroadcastss(mem(rax), ymm0)       // load alpha and duplicate
+    vbroadcastss(mem(rbx), ymm3)       // load beta and duplicate
+
+    vmulps(xmm0, xmm4, xmm4)           // scale by alpha
+
+                                       // now avoid loading C if beta == 0
+    vxorps(ymm0, ymm0, ymm0)           // set ymm0 to zero.
+    vucomisd(xmm0, xmm3)               // set ZF if beta == 0.
+    je(.SBETAZERO)                     // if ZF = 1, jump to beta == 0 case
+
+    label(.SROWSTORED)
+
+    vfmadd231ps(mem(rcx), xmm3, xmm4)
+    vmovups(xmm4, mem(rcx))
+
+    jmp(.SDONE)                        // jump to end.
+
+    label(.SBETAZERO)
+
+    label(.SROWSTORBZ)
+
+    vmovups(xmm4, mem(rcx))
+
+    label(.SDONE)
+
+    add(imm(4*4), r12)                 // c_jj = r12 += 4*cs_c
+
+    lea(mem(r14, r11, 4), r14)         // b_jj = r14 += 4*cs_b
+
+    dec(r15)                           // jj -= 1;
+    jne(.SLOOP3X4J)                    // iterate again if jj != 0.
+
+    label(.SRETURN)
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [n_iter] "m" (n_iter),
+      [k_iter32] "m" (k_iter32),
+      [k_iter8] "m" (k_iter8),
+      [k_left1] "m" (k_left1),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)/*,
+      [a_next] "m" (a_next),
+      [b_next] "m" (b_next)*/
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3",
+      "xmm4", "xmm5", "xmm6", "xmm7",
+      "xmm8", "xmm9", "xmm10", "xmm11",
+      "xmm12", "xmm13", "xmm14", "xmm15",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 1;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_1x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0; bj += nr_cur*cs_b0; n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sdotxv_ex
+            (
+              conja, conjb, k0,
+              alpha, ai, cs_a0, bj, rs_b0,
+              beta, cij, cntx, NULL
+            );
+        }
+    }
+}
+
diff --git a/kernels/zen/bli_kernels_zen.h b/kernels/zen/bli_kernels_zen.h
index f16aa5cc98..e6a2f33f92 100644
--- a/kernels/zen/bli_kernels_zen.h
+++ b/kernels/zen/bli_kernels_zen.h
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2020 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,10 +33,7 @@
 
 */
 // -- level-1m --
-PACKM_KER_PROT(double, d, packm_8xk_gen_zen)
-PACKM_KER_PROT(double, d, packm_6xk_gen_zen)
-PACKM_KER_PROT(double, d, packm_8xk_nn_zen)
-PACKM_KER_PROT(double, d, packm_6xk_nn_zen)
+// Removed - reference packm kernels are used
 
 
 // -- level-1v --
@@ -84,20 +81,26 @@ DOTXV_KER_PROT( scomplex, c, dotxv_zen_int )
 // scalv (intrinsics)
 SCALV_KER_PROT( float,    s, scalv_zen_int )
 SCALV_KER_PROT( double,   d, scalv_zen_int )
+SCALV_KER_PROT( dcomplex, z, scalv_zen_int )
 
 // scalv (intrinsics unrolled x10)
-SCALV_KER_PROT( float,    s, scalv_zen_int10 )
-SCALV_KER_PROT( double,   d, scalv_zen_int10 )
+SCALV_KER_PROT( float,      s, scalv_zen_int10 )
+SCALV_KER_PROT( double,     d, scalv_zen_int10 )
+SCALV_KER_PROT( dcomplex,   z, dscalv_zen_int10 )
 
 // swapv (intrinsics)
 SWAPV_KER_PROT(float,   s, swapv_zen_int8 )
 SWAPV_KER_PROT(double,  d, swapv_zen_int8 )
 
 // copyv (intrinsics)
-COPYV_KER_PROT( float,    s, copyv_zen_int )
-COPYV_KER_PROT( double,   d, copyv_zen_int )
+COPYV_KER_PROT( float,      s, copyv_zen_int )
+COPYV_KER_PROT( double,     d, copyv_zen_int )
+COPYV_KER_PROT( dcomplex,   z, copyv_zen_int )
 
-//
+// scal2v (intrinsics)
+SCAL2V_KER_PROT(dcomplex, z, scal2v_zen_int)
+
+// setv (intrinsics)
 SETV_KER_PROT(float,    s, setv_zen_int)
 SETV_KER_PROT(double,   d, setv_zen_int)
 
@@ -231,6 +234,17 @@ GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_1x4 )
 GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_2x2 )
 GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rv_zen_asm_1x2 )
 
+//gemmsup_rd
+
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_3x4m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_3x2m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_2x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_1x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_2x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_1x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_3x4n )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_rd_zen_asm_2x4n )
+
 // gemmsup_rv (mkernel in n dim)
 
 
@@ -289,7 +303,7 @@ err_t bli_zgemm_small_At
       cntl_t* cntl
     );
 
-void bli_dgemm_ref_k1_nn
+void bli_dgemm_8x6_avx2_k1_nn
     (
       dim_t m,
       dim_t n,
@@ -301,7 +315,7 @@ void bli_dgemm_ref_k1_nn
       double* c, const inc_t ldc
      );
 
-void bli_zgemm_ref_k1_nn
+void bli_zgemm_4x6_avx2_k1_nn
     (
       dim_t m,
       dim_t n,
@@ -320,7 +334,8 @@ err_t bli_trsm_small
        obj_t*  a,
        obj_t*  b,
        cntx_t* cntx,
-       cntl_t* cntl
+       cntl_t* cntl,
+       bool is_parallel
      );
 
 #ifdef BLIS_ENABLE_OPENMP
@@ -331,7 +346,8 @@ err_t bli_trsm_small_mt
        obj_t*  a,
        obj_t*  b,
        cntx_t* cntx,
-       cntl_t* cntl
+       cntl_t* cntl,
+      bool     is_parallel
      );
 
 void bli_multi_sgemv_4x2
@@ -379,6 +395,14 @@ bool bli_cntx_trsm_small_thresh_is_met_zen
         dim_t n
     );
 
+void bli_snorm2fv_unb_var1_avx2
+     (
+       dim_t    n,
+       float*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+     );
+
 void bli_dnorm2fv_unb_var1_avx2
      (
        dim_t    n,
@@ -387,6 +411,14 @@ void bli_dnorm2fv_unb_var1_avx2
        cntx_t*  cntx
      );
 
+void bli_scnorm2fv_unb_var1_avx2
+     (
+       dim_t    n,
+       scomplex*   x, inc_t incx,
+       float* norm,
+       cntx_t*  cntx
+     );
+
 void bli_dznorm2fv_unb_var1_avx2
      (
        dim_t    n,
@@ -394,11 +426,3 @@ void bli_dznorm2fv_unb_var1_avx2
        double* norm,
        cntx_t*  cntx
      );
-void bli_zdscalv_zen_int10
-     (
-       conj_t           conjalpha,
-       dim_t            n,
-       double* restrict alpha,
-       dcomplex* restrict x, inc_t incx,
-       cntx_t* restrict cntx
-     );
\ No newline at end of file
diff --git a/kernels/zen/lpgemm/f32f32f32/lpgemm_fringe_f32_avx2.c b/kernels/zen/lpgemm/f32f32f32/lpgemm_fringe_f32_avx2.c
new file mode 100644
index 0000000000..ae0862d6a7
--- /dev/null
+++ b/kernels/zen/lpgemm/f32f32f32/lpgemm_fringe_f32_avx2.c
@@ -0,0 +1,5484 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+  Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "xmmintrin.h"
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_kernel_macros_f32_avx2.h"
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x16)
+{ // Fringe kernel for a 5-row x 16-column f32 tile of C (10 ymm accumulators).
+    static void* post_ops_labels[] = // GNU labels-as-values table consumed by the jump macros below
+            {
+              &&POST_OPS_5x16F_DISABLE,
+              &&POST_OPS_BIAS_5x16F,
+              &&POST_OPS_RELU_5x16F,
+              &&POST_OPS_RELU_SCALE_5x16F,
+              &&POST_OPS_GELU_TANH_5x16F,
+              &&POST_OPS_GELU_ERF_5x16F,
+              &&POST_OPS_CLIP_5x16F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* Vector temporaries: ymm0-ymm3 hold operands/scratch, ymm4-ymm13 accumulate C */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5, ymm6, ymm7;
+    __m256 ymm8, ymm9, ymm10, ymm11;
+    __m256 ymm12, ymm13;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm5, ymm6, ymm7);
+    ZERO_ACC_YMM_4_REG(ymm8,  ymm9,  ymm10, ymm11);
+    ymm12 = _mm256_setzero_ps();
+    ymm13 = _mm256_setzero_ps();
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* Prefetch (T0 hint) the start of each of the 5 rows of C */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 16 floats from the current row of B (bbuf advances by rs_b per k) */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      ymm1 = _mm256_loadu_ps(bbuf + 8);
+      bbuf += rs_b;  //move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); //broadcast A element of row 0, current k
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); //broadcast A element of row 1, current k
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm5 = _mm256_fmadd_ps(ymm1, ymm2, ymm5);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+      ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7);
+
+      ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); //broadcast A element of row 2, current k
+      ymm3 = _mm256_broadcast_ss((abuf + 3*rs_a)); //broadcast A element of row 3, current k
+
+      ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+      ymm9 = _mm256_fmadd_ps(ymm1, ymm2, ymm9);
+      ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+      ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11);
+
+      ymm2 = _mm256_broadcast_ss((abuf + 4*rs_a)); //broadcast A element of row 4, current k
+      abuf += cs_a;  //move a pointer to next col
+    
+      ymm12 = _mm256_fmadd_ps(ymm0, ymm2, ymm12);
+      ymm13 = _mm256_fmadd_ps(ymm1, ymm2, ymm13);
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha)); // replicate alpha into all 8 lanes
+    ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm5,ymm6,ymm7,ymm0)
+    ALPHA_MUL_ACC_YMM_4_REG(ymm8,ymm9,ymm10,ymm11,ymm0)
+    ALPHA_MUL_ACC_YMM_4_REG(ymm12,ymm13,ymm2,ymm3,ymm0) // NOTE(review): ymm2/ymm3 are scratch here, not accumulators (and unset if k_iter == 0) - confirm the macro's use of them is harmless
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      //F32_C_BNZ_8 (defined elsewhere) presumably loads 8 floats of C, scales them
+      //by beta (ymm3) and adds them into the given accumulator - TODO confirm
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm5)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm7)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm9)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm10)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm11)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm12)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm13)
+    }//beta != 0
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels - TODO confirm
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x16F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      { // 'r'/'R': 16 bias values read at offset post_op_c_j, the same pair of vectors added to every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        ymm1 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm1 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_add_ps( ymm9, ymm1 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm0 );
+
+        // c[3,8-15]
+        ymm11 = _mm256_add_ps( ymm11, ymm1 );
+
+        // c[4,0-7]
+        ymm12 = _mm256_add_ps( ymm12, ymm0 );
+
+        // c[4,8-15]
+        ymm13 = _mm256_add_ps( ymm13, ymm1 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        ymm3 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_add_ps( ymm9, ymm2 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm3 );
+
+        // c[3,8-15]
+        ymm11 = _mm256_add_ps( ymm11, ymm3 );
+
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 );
+
+        // c[4,0-7]
+        ymm12 = _mm256_add_ps( ymm12, ymm0 );
+
+        // c[4,8-15]
+        ymm13 = _mm256_add_ps( ymm13, ymm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x16F:
+    { // element-wise max(x, 0) on every accumulator
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[0,8-15]
+      ymm5 = _mm256_max_ps( ymm5, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[1,8-15]
+      ymm7 = _mm256_max_ps( ymm7, ymm0 );
+
+      // c[2,0-7]
+      ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+      // c[2,8-15]
+      ymm9 = _mm256_max_ps( ymm9, ymm0 );
+
+      // c[3,0-7]
+      ymm10 = _mm256_max_ps( ymm10, ymm0 );
+
+      // c[3,8-15]
+      ymm11 = _mm256_max_ps( ymm11, ymm0 );
+
+      // c[4,0-7]
+      ymm12 = _mm256_max_ps( ymm12, ymm0 );
+
+      // c[4,8-15]
+      ymm13 = _mm256_max_ps( ymm13, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x16F:
+    {
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 ); // presumably the scale applied to negative values - TODO confirm
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[2,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      // c[3,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm11, ymm0, ymm1, ymm2)
+
+      // c[4,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+      // c[4,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm13, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x16F:
+    { // dn, x_tanh and q are extra temporaries handed to the GELU macro
+      __m256 dn, x_tanh;
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[0,8-15]
+      GELU_TANH_F32S_AVX2(ymm5, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,8-15]
+      GELU_TANH_F32S_AVX2(ymm7, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,0-7]
+      GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,8-15]
+      GELU_TANH_F32S_AVX2(ymm9, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[3,0-7]
+      GELU_TANH_F32S_AVX2(ymm10, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[3,8-15]
+      GELU_TANH_F32S_AVX2(ymm11, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[4,0-7]
+      GELU_TANH_F32S_AVX2(ymm12, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[4,8-15]
+      GELU_TANH_F32S_AVX2(ymm13, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x16F:
+    {
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      GELU_ERF_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      GELU_ERF_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[2,8-15]
+      GELU_ERF_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      GELU_ERF_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      // c[3,8-15]
+      GELU_ERF_F32S_AVX2(ymm11, ymm0, ymm1, ymm2)
+
+      // c[4,0-7]
+      GELU_ERF_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+      // c[4,8-15]
+      GELU_ERF_F32S_AVX2(ymm13, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x16F:
+    {
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 ); // presumably the lower clip bound - TODO confirm
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 ); // presumably the upper clip bound - TODO confirm
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[0,8-15]
+      CLIP_F32S_AVX2(ymm5, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[1,8-15]
+      CLIP_F32S_AVX2(ymm7, ymm0, ymm1)
+
+      // c[2,0-7]
+      CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+      // c[2,8-15]
+      CLIP_F32S_AVX2(ymm9, ymm0, ymm1)
+
+      // c[3,0-7]
+      CLIP_F32S_AVX2(ymm10, ymm0, ymm1)
+
+      // c[3,8-15]
+      CLIP_F32S_AVX2(ymm11, ymm0, ymm1)
+
+      // c[4,0-7]
+      CLIP_F32S_AVX2(ymm12, ymm0, ymm1)
+
+      // c[4,8-15]
+      CLIP_F32S_AVX2(ymm13, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x16F_DISABLE:
+    ;
+    // Store the 5x16 result tile to C, one row (two 8-float vectors) at a time.
+    _mm256_storeu_ps(cbuf, ymm4); 
+    _mm256_storeu_ps(cbuf + 8, ymm5);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); 
+    _mm256_storeu_ps(cbuf + 8, ymm7);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm8); 
+    _mm256_storeu_ps(cbuf + 8, ymm9);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm10); 
+    _mm256_storeu_ps(cbuf + 8, ymm11);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm12); 
+    _mm256_storeu_ps(cbuf + 8, ymm13);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x16)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x16F_DISABLE,
+              &&POST_OPS_BIAS_4x16F,
+              &&POST_OPS_RELU_4x16F,
+              &&POST_OPS_RELU_SCALE_4x16F,
+              &&POST_OPS_GELU_TANH_4x16F,
+              &&POST_OPS_GELU_ERF_4x16F,
+              &&POST_OPS_CLIP_4x16F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /*Declare the registers*/
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5, ymm6, ymm7;
+    __m256 ymm8, ymm9, ymm10, ymm11;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm5, ymm6, ymm7);
+    ZERO_ACC_YMM_4_REG(ymm8, ymm9,  ymm10, ymm11);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /*_mm_prefetch( (MR X NR) from C*/
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /*Load 16 elements from row0 of B*/
+      ymm0 = _mm256_loadu_ps(bbuf );
+      ymm1 = _mm256_loadu_ps(bbuf + 8);
+      bbuf += rs_b;  //move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); //broadcast c0r0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); //broadcast c0r1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm5 = _mm256_fmadd_ps(ymm1, ymm2, ymm5);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+      ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7);
+
+      ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); //broadcast c0r2 
+      ymm3 = _mm256_broadcast_ss((abuf + 3*rs_a)); //broadcast c0r3
+
+      ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+      ymm9 = _mm256_fmadd_ps(ymm1, ymm2, ymm9);
+      ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+      ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11);
+
+      abuf += cs_a;  //move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm5,ymm6,ymm7,ymm0)
+    ALPHA_MUL_ACC_YMM_4_REG(ymm8,ymm9,ymm10,ymm11,ymm0)
+
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      //load c and multiply with beta and 
+      //add to accumulator and store back
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm5)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm7)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm9)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm10)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm11)
+    }//betazero
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x16F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        ymm1 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm1 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_add_ps( ymm9, ymm1 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm0 );
+
+        // c[3,8-15]
+        ymm11 = _mm256_add_ps( ymm11, ymm1 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        ymm3 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_add_ps( ymm9, ymm2 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm3 );
+
+        // c[3,8-15]
+        ymm11 = _mm256_add_ps( ymm11, ymm3 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x16F:
+    {
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[0,8-15]
+      ymm5 = _mm256_max_ps( ymm5, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[1,8-15]
+      ymm7 = _mm256_max_ps( ymm7, ymm0 );
+
+      // c[2,0-7]
+      ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+      // c[2,8-15]
+      ymm9 = _mm256_max_ps( ymm9, ymm0 );
+
+      // c[3,0-7]
+      ymm10 = _mm256_max_ps( ymm10, ymm0 );
+
+      // c[3,8-15]
+      ymm11 = _mm256_max_ps( ymm11, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x16F:
+    {
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[2,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      // c[3,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm11, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x16F:
+    {
+      __m256 dn, x_tanh;
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[0,8-15]
+      GELU_TANH_F32S_AVX2(ymm5, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,8-15]
+      GELU_TANH_F32S_AVX2(ymm7, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,0-7]
+      GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,8-15]
+      GELU_TANH_F32S_AVX2(ymm9, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[3,0-7]
+      GELU_TANH_F32S_AVX2(ymm10, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[3,8-15]
+      GELU_TANH_F32S_AVX2(ymm11, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x16F:
+    {
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      GELU_ERF_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      GELU_ERF_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[2,8-15]
+      GELU_ERF_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      GELU_ERF_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      // c[3,8-15]
+      GELU_ERF_F32S_AVX2(ymm11, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x16F:
+    {
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[0,8-15]
+      CLIP_F32S_AVX2(ymm5, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[1,8-15]
+      CLIP_F32S_AVX2(ymm7, ymm0, ymm1)
+
+      // c[2,0-7]
+      CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+      // c[2,8-15]
+      CLIP_F32S_AVX2(ymm9, ymm0, ymm1)
+
+      // c[3,0-7]
+      CLIP_F32S_AVX2(ymm10, ymm0, ymm1)
+
+      // c[3,8-15]
+      CLIP_F32S_AVX2(ymm11, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x16F_DISABLE:
+    ;
+
+    _mm256_storeu_ps(cbuf, ymm4); 
+    _mm256_storeu_ps(cbuf + 8, ymm5);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); 
+    _mm256_storeu_ps(cbuf + 8, ymm7);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm8); 
+    _mm256_storeu_ps(cbuf + 8, ymm9);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm10); 
+    _mm256_storeu_ps(cbuf + 8, ymm11);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x16)
+{   /* M-fringe kernel for a 3-row x 16-column f32 tile of C (two 8-float vectors per row). */
+    static void* post_ops_labels[] = /* addresses of the local post-op labels (GNU "labels as values") */
+            {
+              &&POST_OPS_3x16F_DISABLE,
+              &&POST_OPS_BIAS_3x16F,
+              &&POST_OPS_RELU_3x16F,
+              &&POST_OPS_RELU_SCALE_3x16F,
+              &&POST_OPS_GELU_TANH_3x16F,
+              &&POST_OPS_GELU_ERF_3x16F,
+              &&POST_OPS_CLIP_3x16F
+            }; /* presumably indexed by the POST_OP_LABEL_* macros (defined elsewhere) -- confirm */
+    // Unsigned 64-bit copy of the k0 trip count. k0 (like a, b, c, the strides, alpha, beta and
+    // the post-op arguments) is not declared in this body; presumably it comes from the signature macro.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3 are scratch registers; ymm4-ymm9 accumulate the C tile. */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5, ymm6, ymm7;
+    __m256 ymm8, ymm9;
+
+    /* Zero the six accumulators: row 0 -> ymm4/5, row 1 -> ymm6/7, row 2 -> ymm8/9. */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm5, ymm6, ymm7);
+    ymm8 = _mm256_setzero_ps();
+    ymm9 = _mm256_setzero_ps();
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* Prefetch the start of each of the 3 C rows used by this kernel (T0 hint). */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 16 consecutive floats (two 8-wide vectors) from the current row of B. */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      ymm1 = _mm256_loadu_ps(bbuf + 8);
+      bbuf += rs_b;  // move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm5 = _mm256_fmadd_ps(ymm1, ymm2, ymm5);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+      ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7);
+
+      ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); // broadcast the A element of row 2
+      ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+      ymm9 = _mm256_fmadd_ps(ymm1, ymm2, ymm9);
+
+      abuf += cs_a;  // move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm5,ymm6,ymm7,ymm0)
+    ymm8 = _mm256_mul_ps(ymm8,ymm0); // row 2 is scaled directly: ymm2/ymm3 are scratch (unset
+    ymm9 = _mm256_mul_ps(ymm9,ymm0); // when k_iter == 0) and must not be fed to the 4-register macro
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Fold the existing C tile in, scaled by beta. F32_C_BNZ_8 is defined elsewhere;
+      // presumably acc += beta * (8 floats loaded from C), with no store here -- confirm.
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm5)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm7)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm9)
+    }// beta != 0
+
+    // Post-ops: dispatch on post_ops_list through the jump macros (defined elsewhere).
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x16F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {   // 'r'/'R' tag: 16 bias values read from op_args1 at offset post_op_c_j, same for every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        ymm1 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm1 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_add_ps( ymm9, ymm1 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is accessed by
+        // the ic index (post_op_c_i), and each bias element corresponds
+        // to an entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_add_ps( ymm9, ymm2 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x16F:
+    {   // ReLU: lane-wise max(x, 0) on every accumulator
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[0,8-15]
+      ymm5 = _mm256_max_ps( ymm5, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[1,8-15]
+      ymm7 = _mm256_max_ps( ymm7, ymm0 );
+
+      // c[2,0-7]
+      ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+      // c[2,8-15]
+      ymm9 = _mm256_max_ps( ymm9, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x16F:
+    {   // ymm0 = scale broadcast from op_args2, ymm1 = 0, ymm2 = scratch for the macro
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[2,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x16F:
+    {
+      __m256 dn, x_tanh; // temporaries handed to GELU_TANH_F32S_AVX2
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[0,8-15]
+      GELU_TANH_F32S_AVX2(ymm5, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,8-15]
+      GELU_TANH_F32S_AVX2(ymm7, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,0-7]
+      GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,8-15]
+      GELU_TANH_F32S_AVX2(ymm9, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x16F:
+    {   // ymm0-ymm2 are passed to the macro, presumably as scratch -- confirm
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      GELU_ERF_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      GELU_ERF_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[2,8-15]
+      GELU_ERF_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x16F:
+    {   // op_args2/op_args3 are broadcast as the two clip bounds (order per CLIP_F32S_AVX2 -- confirm)
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[0,8-15]
+      CLIP_F32S_AVX2(ymm5, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[1,8-15]
+      CLIP_F32S_AVX2(ymm7, ymm0, ymm1)
+
+      // c[2,0-7]
+      CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+      // c[2,8-15]
+      CLIP_F32S_AVX2(ymm9, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x16F_DISABLE:
+    ; // empty statement so the label is valid; the stores below write the tile to C
+
+    _mm256_storeu_ps(cbuf, ymm4); // row 0, columns 0-7
+    _mm256_storeu_ps(cbuf + 8, ymm5);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); // row 1, columns 0-7
+    _mm256_storeu_ps(cbuf + 8, ymm7);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm8); // row 2, columns 0-7
+    _mm256_storeu_ps(cbuf + 8, ymm9);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x16)
+{   /* M-fringe kernel for a 2-row x 16-column f32 tile of C (two 8-float vectors per row). */
+    static void* post_ops_labels[] = /* addresses of the local post-op labels (GNU "labels as values") */
+            {
+              &&POST_OPS_2x16F_DISABLE,
+              &&POST_OPS_BIAS_2x16F,
+              &&POST_OPS_RELU_2x16F,
+              &&POST_OPS_RELU_SCALE_2x16F,
+              &&POST_OPS_GELU_TANH_2x16F,
+              &&POST_OPS_GELU_ERF_2x16F,
+              &&POST_OPS_CLIP_2x16F
+            }; /* presumably indexed by the POST_OP_LABEL_* macros (defined elsewhere) -- confirm */
+    // Unsigned 64-bit copy of the k0 trip count. k0 (like a, b, c, the strides, alpha, beta and
+    // the post-op arguments) is not declared in this body; presumably it comes from the signature macro.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3 are scratch registers; ymm4-ymm7 accumulate the C tile. */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5, ymm6, ymm7;
+
+    /* Zero the four accumulators: row 0 -> ymm4/5, row 1 -> ymm6/7. */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm5, ymm6, ymm7);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* Prefetch the start of each of the 2 C rows used by this kernel (T0 hint). */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 16 consecutive floats (two 8-wide vectors) from the current row of B. */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      ymm1 = _mm256_loadu_ps(bbuf + 8);
+      bbuf += rs_b;  // move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm5 = _mm256_fmadd_ps(ymm1, ymm2, ymm5);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+      ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7);
+
+      abuf += cs_a;  // move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm5,ymm6,ymm7,ymm0)
+
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Fold the existing C tile in, scaled by beta. F32_C_BNZ_8 is defined elsewhere;
+      // presumably acc += beta * (8 floats loaded from C), with no store here -- confirm.
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm5)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm7)
+    }// beta != 0
+
+    // Post-ops: dispatch on post_ops_list through the jump macros (defined elsewhere).
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x16F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {   // 'r'/'R' tag: 16 bias values read from op_args1 at offset post_op_c_j, same for every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        ymm1 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm1 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is accessed by
+        // the ic index (post_op_c_i), and each bias element corresponds
+        // to an entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_add_ps( ymm7, ymm1 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x16F:
+    {   // ReLU: lane-wise max(x, 0) on every accumulator
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[0,8-15]
+      ymm5 = _mm256_max_ps( ymm5, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[1,8-15]
+      ymm7 = _mm256_max_ps( ymm7, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x16F:
+    {   // ymm0 = scale broadcast from op_args2, ymm1 = 0, ymm2 = scratch for the macro
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x16F:
+    {
+      __m256 dn, x_tanh; // temporaries handed to GELU_TANH_F32S_AVX2
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[0,8-15]
+      GELU_TANH_F32S_AVX2(ymm5, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,8-15]
+      GELU_TANH_F32S_AVX2(ymm7, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x16F:
+    {   // ymm0-ymm2 are passed to the macro, presumably as scratch -- confirm
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      GELU_ERF_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[1,8-15]
+      GELU_ERF_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x16F:
+    {   // op_args2/op_args3 are broadcast as the two clip bounds (order per CLIP_F32S_AVX2 -- confirm)
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[0,8-15]
+      CLIP_F32S_AVX2(ymm5, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[1,8-15]
+      CLIP_F32S_AVX2(ymm7, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x16F_DISABLE:
+    ; // empty statement so the label is valid; the stores below write the tile to C
+
+    _mm256_storeu_ps(cbuf, ymm4); // row 0, columns 0-7
+    _mm256_storeu_ps(cbuf + 8, ymm5);
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); // row 1, columns 0-7
+    _mm256_storeu_ps(cbuf + 8, ymm7);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x16)
+{   /* M-fringe kernel for a 1-row x 16-column f32 tile of C (two 8-float vectors). */
+    static void* post_ops_labels[] = /* addresses of the local post-op labels (GNU "labels as values") */
+            {
+              &&POST_OPS_1x16F_DISABLE,
+              &&POST_OPS_BIAS_1x16F,
+              &&POST_OPS_RELU_1x16F,
+              &&POST_OPS_RELU_SCALE_1x16F,
+              &&POST_OPS_GELU_TANH_1x16F,
+              &&POST_OPS_GELU_ERF_1x16F,
+              &&POST_OPS_CLIP_1x16F
+            }; /* presumably indexed by the POST_OP_LABEL_* macros (defined elsewhere) -- confirm */
+    // Unsigned 64-bit copy of the k0 trip count. k0 (like a, b, c, the strides, alpha, beta and
+    // the post-op arguments) is not declared in this body; presumably it comes from the signature macro.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3 are scratch registers; ymm4/ymm5 accumulate the single C row. */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5;
+
+    /* Zero the two accumulators (columns 0-7 and 8-15). */
+    ymm4 = _mm256_setzero_ps();
+    ymm5 = _mm256_setzero_ps();
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    /* Prefetch the start of the single C row used by this kernel (T0 hint). */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 16 consecutive floats (two 8-wide vectors) from the current row of B. */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      ymm1 = _mm256_loadu_ps(bbuf + 8);
+      bbuf += rs_b;  // move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm5 = _mm256_fmadd_ps(ymm1, ymm2, ymm5);
+
+      abuf += cs_a;  // move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha));
+    ymm4 = _mm256_mul_ps(ymm4,ymm0);
+    ymm5 = _mm256_mul_ps(ymm5,ymm0);
+
+
+    if ( beta != 0.0 )
+    {
+      // Fold the existing C row in, scaled by beta. F32_C_BNZ_8 is defined elsewhere;
+      // presumably acc += beta * (8 floats loaded from C), with no store here -- confirm.
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(cbuf,rs_c,ymm0,ymm3,ymm4)
+      F32_C_BNZ_8(cbuf+8,rs_c,ymm1,ymm3,ymm5)
+    }// beta != 0
+
+    // Post-ops: dispatch on post_ops_list through the jump macros (defined elsewhere).
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x16F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {   // 'r'/'R' tag: 16 bias values read from op_args1 at offset post_op_c_j
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        ymm1 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm1 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is accessed by
+        // the ic index (post_op_c_i), and each bias element corresponds
+        // to an entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_add_ps( ymm5, ymm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x16F:
+    {   // ReLU: lane-wise max(x, 0) on both accumulators
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[0,8-15]
+      ymm5 = _mm256_max_ps( ymm5, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x16F:
+    {   // ymm0 = scale broadcast from op_args2, ymm1 = 0, ymm2 = scratch for the macro
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      RELU_SCALE_OP_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x16F:
+    {
+      __m256 dn, x_tanh; // temporaries handed to GELU_TANH_F32S_AVX2
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[0,8-15]
+      GELU_TANH_F32S_AVX2(ymm5, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x16F:
+    {   // ymm0-ymm2 are passed to the macro, presumably as scratch -- confirm
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[0,8-15]
+      GELU_ERF_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x16F:
+    {   // op_args2/op_args3 are broadcast as the two clip bounds (order per CLIP_F32S_AVX2 -- confirm)
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[0,8-15]
+      CLIP_F32S_AVX2(ymm5, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x16F_DISABLE:
+    ; // empty statement so the label is valid; the stores below write the tile to C
+
+    _mm256_storeu_ps(cbuf, ymm4); // row 0, columns 0-7
+    _mm256_storeu_ps(cbuf + 8, ymm5);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x8)
+{   /* M-fringe kernel for a 5-row x 8-column f32 tile of C (one 8-float vector per row). */
+    static void* post_ops_labels[] = /* addresses of the local post-op labels (GNU "labels as values") */
+            {
+              &&POST_OPS_5x8F_DISABLE,
+              &&POST_OPS_BIAS_5x8F,
+              &&POST_OPS_RELU_5x8F,
+              &&POST_OPS_RELU_SCALE_5x8F,
+              &&POST_OPS_GELU_TANH_5x8F,
+              &&POST_OPS_GELU_ERF_5x8F,
+              &&POST_OPS_CLIP_5x8F
+            }; /* presumably indexed by the POST_OP_LABEL_* macros (defined elsewhere) -- confirm */
+    // Unsigned 64-bit copy of the k0 trip count. k0 (like a, b, c, the strides, alpha, beta and
+    // the post-op arguments) is not declared in this body; presumably it comes from the signature macro.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3 are scratch (ymm1 only in post-ops); ymm4/6/8/10/12 accumulate rows 0-4. */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm6, ymm8, ymm10;
+    __m256 ymm12;
+    
+    /* Zero the five accumulators, four registers per macro call. */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm6, ymm2, ymm3); // ymm2/ymm3 are scratch, listed only to fill the macro
+    ZERO_ACC_YMM_4_REG(ymm8, ymm10, ymm12, ymm0); // ymm0 is scratch, listed only to fill the macro
+      
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+      
+    /* Prefetch the start of each of the 5 C rows used by this kernel (T0 hint). */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 8 consecutive floats (one 8-wide vector) from the current row of B. */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      bbuf += rs_b;  // move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+
+      ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); // broadcast the A element of row 2
+      ymm3 = _mm256_broadcast_ss((abuf + 3*rs_a)); // broadcast the A element of row 3
+
+      ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+      ymm2 = _mm256_broadcast_ss((abuf + 4*rs_a)); // broadcast the A element of row 4 (ymm2 is free again)
+      
+      ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+      ymm12 = _mm256_fmadd_ps(ymm0, ymm2, ymm12);
+
+      abuf += cs_a;  // move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm6,ymm8,ymm10,ymm0)
+    ymm12 = _mm256_mul_ps(ymm12,ymm0); // fifth accumulator scaled directly
+
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Fold the existing C tile in, scaled by beta. F32_C_BNZ_8 is defined elsewhere;
+      // presumably acc += beta * (8 floats loaded from C), with no store here -- confirm.
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm10)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm12)
+    }// beta != 0
+
+    // Post-ops: dispatch on post_ops_list through the jump macros (defined elsewhere).
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x8F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {   // 'r'/'R' tag: 8 bias values read from op_args1 at offset post_op_c_j, same for every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm0 );
+
+        // c[4,0-7]
+        ymm12 = _mm256_add_ps( ymm12, ymm0 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is accessed by
+        // the ic index (post_op_c_i), and each bias element corresponds
+        // to an entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        ymm3 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm3 );
+
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 );
+
+        // c[4,0-7] (ymm0 reused for the fifth row's bias)
+        ymm12 = _mm256_add_ps( ymm12, ymm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x8F:
+    {   // ReLU: lane-wise max(x, 0) on every accumulator
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[2,0-7]
+      ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+      // c[3,0-7]
+      ymm10 = _mm256_max_ps( ymm10, ymm0 );
+
+      // c[4,0-7]
+      ymm12 = _mm256_max_ps( ymm12, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x8F:
+    {   // ymm0 = scale broadcast from op_args2, ymm1 = 0, ymm2 = scratch for the macro
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      // c[4,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x8F:
+    {
+      __m256 dn, x_tanh; // temporaries handed to GELU_TANH_F32S_AVX2
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,0-7]
+      GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[3,0-7]
+      GELU_TANH_F32S_AVX2(ymm10, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[4,0-7]
+      GELU_TANH_F32S_AVX2(ymm12, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x8F:
+    {   // ymm0-ymm2 are passed to the macro, presumably as scratch -- confirm
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      GELU_ERF_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      // c[4,0-7]
+      GELU_ERF_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x8F:
+    {   // op_args2/op_args3 are broadcast as the two clip bounds (order per CLIP_F32S_AVX2 -- confirm)
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[2,0-7]
+      CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+      // c[3,0-7]
+      CLIP_F32S_AVX2(ymm10, ymm0, ymm1)
+
+      // c[4,0-7]
+      CLIP_F32S_AVX2(ymm12, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x8F_DISABLE:
+    ; // empty statement so the label is valid; the stores below write the tile to C
+
+    _mm256_storeu_ps(cbuf, ymm4); // row 0
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); // row 1
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm8); // row 2
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm10); // row 3
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm12);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x8)
+{ /* 4-row x 8-column f32 fringe kernel: accumulate A*B over k0 steps, scale by alpha, add the beta term when beta != 0, run the post-op chain, store 4 rows of C. */
+    static void* post_ops_labels[] = // computed-goto targets; presumably indexed by post-op code inside the POST_OP_LABEL_* macros - TODO confirm
+            {
+              &&POST_OPS_4x8F_DISABLE,
+              &&POST_OPS_BIAS_4x8F,
+              &&POST_OPS_RELU_4x8F,
+              &&POST_OPS_RELU_SCALE_4x8F,
+              &&POST_OPS_GELU_TANH_4x8F,
+              &&POST_OPS_GELU_ERF_4x8F,
+              &&POST_OPS_CLIP_4x8F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3: scratch (B row, A broadcasts, post-op temporaries); ymm4/6/8/10: one accumulator per C row */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm6, ymm8, ymm10;
+    
+    /* zero the accumulator registers */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm6, ymm8, ymm10);
+      
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+      
+    /* Prefetch the start of each of the 4 C rows (T0 hint) */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 8 floats from the current row of B */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      bbuf += rs_b;  //move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+
+      ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); // broadcast the A element of row 2
+      ymm3 = _mm256_broadcast_ss((abuf + 3*rs_a)); // broadcast the A element of row 3
+
+      ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+      ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+
+      abuf += cs_a;  //move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha)); // alpha in all 8 lanes
+    ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm6,ymm8,ymm10,ymm0)
+
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Fold the existing C rows, scaled by beta, into the accumulators.
+      // Nothing is stored here; the store follows the post-ops. (F32_C_BNZ_8 body not visible - presumably load + fma.)
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm10)
+    }// beta != 0
+
+    // Post-ops: walk the list starting at post_ops_list; the macro picks the first handler label.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x8F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      { // 'r'/'R': 8 bias values read at column offset post_op_c_j, the same vector added to every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        ymm3 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_add_ps( ymm10, ymm3 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x8F:
+    { // element-wise max(x, 0)
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[2,0-7]
+      ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+      // c[3,0-7]
+      ymm10 = _mm256_max_ps( ymm10, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x8F:
+    { // ymm0 = scale read from op_args2, ymm1 = 0, ymm2 = temporary; presumably scales the negative lanes - macro body not visible
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x8F:
+    { // dn, x_tanh, q and ymm0-ymm3 are handed to the macro, presumably as temporaries - macro body not visible
+      __m256 dn, x_tanh;
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,0-7]
+      GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[3,0-7]
+      GELU_TANH_F32S_AVX2(ymm10, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x8F:
+    { // ymm0-ymm2 are handed to the macro, presumably as temporaries - macro body not visible
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      // c[3,0-7]
+      GELU_ERF_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x8F:
+    { // bounds are read from op_args2 and op_args3; presumably lower and upper respectively - TODO confirm against CLIP_F32S_AVX2
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[2,0-7]
+      CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+      // c[3,0-7]
+      CLIP_F32S_AVX2(ymm10, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x8F_DISABLE:
+    ; // first entry of the label table; falls through to the stores: 4 rows of 8 floats, rs_c apart
+
+    _mm256_storeu_ps(cbuf, ymm4); 
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); 
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm8); 
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm10);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x8)
+{ /* 3-row x 8-column f32 fringe kernel: accumulate A*B over k0 steps, scale by alpha, add the beta term when beta != 0, run the post-op chain, store 3 rows of C. */
+    static void* post_ops_labels[] = // computed-goto targets; presumably indexed by post-op code inside the POST_OP_LABEL_* macros - TODO confirm
+            {
+              &&POST_OPS_3x8F_DISABLE,
+              &&POST_OPS_BIAS_3x8F,
+              &&POST_OPS_RELU_3x8F,
+              &&POST_OPS_RELU_SCALE_3x8F,
+              &&POST_OPS_GELU_TANH_3x8F,
+              &&POST_OPS_GELU_ERF_3x8F,
+              &&POST_OPS_CLIP_3x8F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3: scratch (B row, A broadcasts, post-op temporaries); ymm4/6/8: one accumulator per C row */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm6, ymm8;
+
+    /* zero the accumulators; ymm2 is scratch and only fills the macro's fourth slot */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm6, ymm2, ymm8);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* Prefetch the start of each of the 3 C rows (T0 hint) */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 8 floats from the current row of B */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      bbuf += rs_b;  //move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); // broadcast the A element of row 2 (ymm2 is free again after the fma above)
+
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+      ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+
+      abuf += cs_a;  //move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha)); // alpha in all 8 lanes
+    ymm4 = _mm256_mul_ps(ymm4,ymm0);
+    ymm6 = _mm256_mul_ps(ymm6,ymm0);
+    ymm8 = _mm256_mul_ps(ymm8,ymm0);
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Fold the existing C rows, scaled by beta, into the accumulators.
+      // Nothing is stored here; the store follows the post-ops. (F32_C_BNZ_8 body not visible - presumably load + fma.)
+      ymm3 = _mm256_broadcast_ss(&(beta));
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+    }// beta != 0
+
+    // Post-ops: walk the list starting at post_ops_list; the macro picks the first handler label.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x8F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      { // 'r'/'R': 8 bias values read at column offset post_op_c_j, the same vector added to every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_add_ps( ymm8, ymm2 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x8F:
+    { // element-wise max(x, 0)
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      // c[2,0-7]
+      ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x8F:
+    { // ymm0 = scale read from op_args2, ymm1 = 0, ymm2 = temporary; presumably scales the negative lanes - macro body not visible
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x8F:
+    { // dn, x_tanh, q and ymm0-ymm3 are handed to the macro, presumably as temporaries - macro body not visible
+      __m256 dn, x_tanh;
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[2,0-7]
+      GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x8F:
+    { // ymm0-ymm2 are handed to the macro, presumably as temporaries - macro body not visible
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      // c[2,0-7]
+      GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x8F:
+    { // bounds are read from op_args2 and op_args3; presumably lower and upper respectively - TODO confirm against CLIP_F32S_AVX2
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      // c[2,0-7]
+      CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x8F_DISABLE:
+    ; // first entry of the label table; falls through to the stores: 3 rows of 8 floats, rs_c apart
+
+    _mm256_storeu_ps(cbuf, ymm4); 
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6); 
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm8);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x8)
+{ /* 2-row x 8-column f32 fringe kernel: accumulate A*B over k0 steps, scale by alpha, add the beta term when beta != 0, run the post-op chain, store 2 rows of C. */
+    static void* post_ops_labels[] = // computed-goto targets; presumably indexed by post-op code inside the POST_OP_LABEL_* macros - TODO confirm
+            {
+              &&POST_OPS_2x8F_DISABLE,
+              &&POST_OPS_BIAS_2x8F,
+              &&POST_OPS_RELU_2x8F,
+              &&POST_OPS_RELU_SCALE_2x8F,
+              &&POST_OPS_GELU_TANH_2x8F,
+              &&POST_OPS_GELU_ERF_2x8F,
+              &&POST_OPS_CLIP_2x8F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3: scratch (B row, A broadcasts, post-op temporaries); ymm4/6: one accumulator per C row */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm6;
+
+    /* zero the accumulators; ymm2 and ymm3 are scratch and only fill the macro's remaining slots */
+    ZERO_ACC_YMM_4_REG(ymm4, ymm6, ymm2, ymm3);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* Prefetch the start of each of the 2 C rows (T0 hint) */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 8 floats from the current row of B */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      bbuf += rs_b;  //move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+      ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+        
+      abuf += cs_a;  //move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha)); // alpha in all 8 lanes
+    ymm4 = _mm256_mul_ps(ymm4,ymm0);
+    ymm6 = _mm256_mul_ps(ymm6,ymm0);
+
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Fold the existing C rows, scaled by beta, into the accumulators.
+      // Nothing is stored here; the store follows the post-ops. (F32_C_BNZ_8 body not visible - presumably load + fma.)
+      ymm3 = _mm256_broadcast_ss(&(beta));
+
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+      _cbuf += rs_c;
+      F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+    }// beta != 0
+
+    // Post-ops: walk the list starting at post_ops_list; the macro picks the first handler label.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x8F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      { // 'r'/'R': 8 bias values read at column offset post_op_c_j, the same vector added to every row
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_add_ps( ymm6, ymm1 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x8F:
+    { // element-wise max(x, 0)
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      // c[1,0-7]
+      ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x8F:
+    { // ymm0 = scale read from op_args2, ymm1 = 0, ymm2 = temporary; presumably scales the negative lanes - macro body not visible
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x8F:
+    { // dn, x_tanh, q and ymm0-ymm3 are handed to the macro, presumably as temporaries - macro body not visible
+      __m256 dn, x_tanh;
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      // c[1,0-7]
+      GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x8F:
+    { // ymm0-ymm2 are handed to the macro, presumably as temporaries - macro body not visible
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      // c[1,0-7]
+      GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x8F:
+    { // bounds are read from op_args2 and op_args3; presumably lower and upper respectively - TODO confirm against CLIP_F32S_AVX2
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      // c[1,0-7]
+      CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x8F_DISABLE:
+    ; // first entry of the label table; falls through to the stores: 2 rows of 8 floats, rs_c apart
+
+    _mm256_storeu_ps(cbuf, ymm4); 
+    cbuf += rs_c;
+    _mm256_storeu_ps(cbuf, ymm6);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x8)
+{ /* 1-row x 8-column f32 fringe kernel: accumulate A*B over k0 steps, scale by alpha, add the beta term when beta != 0, run the post-op chain, store 1 row of C. */
+    static void* post_ops_labels[] = // computed-goto targets; presumably indexed by post-op code inside the POST_OP_LABEL_* macros - TODO confirm
+            {
+              &&POST_OPS_1x8F_DISABLE,
+              &&POST_OPS_BIAS_1x8F,
+              &&POST_OPS_RELU_1x8F,
+              &&POST_OPS_RELU_SCALE_1x8F,
+              &&POST_OPS_GELU_TANH_1x8F,
+              &&POST_OPS_GELU_ERF_1x8F,
+              &&POST_OPS_CLIP_1x8F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    /* ymm0-ymm3: scratch (B row, A broadcast, post-op temporaries); ymm4: the single accumulator */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4;
+
+    /* zero the accumulator register */
+    ymm4 = _mm256_setzero_ps();
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    /* Prefetch the start of the C row (T0 hint) */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+      /* Load 8 floats from the current row of B */
+      ymm0 = _mm256_loadu_ps(bbuf );
+      bbuf += rs_b;  //move b pointer to next row
+
+      ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+      ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+
+      abuf += cs_a;  //move a pointer to next col
+    }//kloop
+
+    ymm0 = _mm256_broadcast_ss(&(alpha)); // alpha in all 8 lanes
+    ymm4 = _mm256_mul_ps(ymm4,ymm0);
+
+
+    if ( beta != 0.0 )
+    {
+      // Fold the existing C row, scaled by beta, into the accumulator.
+      // Nothing is stored here; the store follows the post-ops. (F32_C_BNZ_8 body not visible - presumably load + fma.)
+      ymm3 = _mm256_broadcast_ss(&(beta));
+      F32_C_BNZ_8(cbuf,rs_c,ymm0,ymm3,ymm4)
+    }// beta != 0
+
+    // Post-ops: walk the list starting at post_ops_list; the macro picks the first handler label.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x8F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      { // 'r'/'R': 8 bias values read at column offset post_op_c_j
+        ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+
+        // c[0,0-7]
+        ymm4 = _mm256_add_ps( ymm4, ymm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x8F:
+    { // element-wise max(x, 0)
+      ymm0 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x8F:
+    { // ymm0 = scale read from op_args2, ymm1 = 0, ymm2 = temporary; presumably scales the negative lanes - macro body not visible
+      ymm0 =
+        _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_setzero_ps();
+
+      // c[0,0-7]
+      RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x8F:
+    { // dn, x_tanh, q and ymm0-ymm3 are handed to the macro, presumably as temporaries - macro body not visible
+      __m256 dn, x_tanh;
+      __m256i q;
+
+      // c[0,0-7]
+      GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x8F:
+    { // ymm0-ymm2 are handed to the macro, presumably as temporaries - macro body not visible
+      // c[0,0-7]
+      GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x8F:
+    { // bounds are read from op_args2 and op_args3; presumably lower and upper respectively - TODO confirm against CLIP_F32S_AVX2
+      ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-7]
+      CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x8F_DISABLE:
+    ; // first entry of the label table; falls through to the store of one row of 8 floats
+
+    _mm256_storeu_ps(cbuf, ymm4); 
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x4)
+{ /* 5-row x 4-column f32 fringe kernel: accumulate A*B over k0 steps, scale by alpha, add the beta term when beta != 0, run the post-op chain, store 5 rows of C. */
+    static void* post_ops_labels[] = // computed-goto targets; presumably indexed by post-op code inside the POST_OP_LABEL_* macros - TODO confirm
+            {
+              &&POST_OPS_5x4F_DISABLE,
+              &&POST_OPS_BIAS_5x4F,
+              &&POST_OPS_RELU_5x4F,
+              &&POST_OPS_RELU_SCALE_5x4F,
+              &&POST_OPS_GELU_TANH_5x4F,
+              &&POST_OPS_GELU_ERF_5x4F,
+              &&POST_OPS_CLIP_5x4F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+    
+    /* xmm0-xmm3: scratch (B row, A broadcasts, post-op temporaries); xmm4-xmm8: one accumulator per C row */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+    __m128 xmm8;
+    
+    /* zero the five accumulators; the fifth is cleared directly so no scratch register is dragged into the 4-register macro */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) 
+    xmm8 = _mm_setzero_ps();
+    
+    /* Prefetch the start of each of the 5 C rows (T0 hint) */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 4 floats from the current row of B */
+        xmm0 = _mm_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); // broadcast the A element of row 0
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); // broadcast the A element of row 1
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); // broadcast the A element of row 2
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); // broadcast the A element of row 3
+        xmm2 = _mm_broadcast_ss((abuf + 4*rs_a)); // broadcast the A element of row 4
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+        xmm8 = _mm_fmadd_ps(xmm0, xmm2, xmm8);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha)); // alpha in all 4 lanes
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0) 
+    xmm8 = _mm_mul_ps(xmm8, xmm0); // fifth row scaled directly: the scratch xmm2/xmm3 are unset when k0 == 0 and must not be read
+
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // Fold the existing C rows, scaled by beta, into the accumulators.
+        // Nothing is stored here; the store follows the post-ops. (F32_C_BNZ_4 body not visible - presumably load + fma.)
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm8)
+    }// beta != 0
+
+    // Post-ops: walk the list starting at post_ops_list; the macro picks the first handler label.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x4F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      { // 'r'/'R': 4 bias values read at column offset post_op_c_j, the same vector added to every row
+        xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0-3]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+        // c[3,0-3]
+        xmm7 = _mm_add_ps( xmm7, xmm0 );
+
+        // c[4,0-3]
+        xmm8 = _mm_add_ps( xmm8, xmm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0-3]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+        // c[3,0-3]
+        xmm7 = _mm_add_ps( xmm7, xmm3 );
+
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 );
+
+        // c[4,0-3]
+        xmm8 = _mm_add_ps( xmm8, xmm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x4F:
+    { // element-wise max(x, 0)
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-3]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0-3]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      // c[3,0-3]
+      xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+      // c[4,0-3]
+      xmm8 = _mm_max_ps( xmm8, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x4F:
+    { // xmm0 = scale read from op_args2, xmm1 = 0, xmm2 = temporary; presumably scales the negative lanes - macro body not visible
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      // c[4,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x4F:
+    { // dn, x_tanh, q and xmm0-xmm3 are handed to the macro, presumably as temporaries - macro body not visible
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-3]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-3]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0-3]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[3,0-3]
+      GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[4,0-3]
+      GELU_TANH_F32S_SSE(xmm8, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x4F:
+    { // xmm0-xmm2 are handed to the macro, presumably as temporaries - macro body not visible
+      // c[0,0-3]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-3]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-3]
+      GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      // c[4,0-3]
+      GELU_ERF_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x4F:
+    { // bounds are read from op_args2 and op_args3; presumably lower and upper respectively - TODO confirm against CLIP_F32S_SSE
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-3]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-3]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0-3]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      // c[3,0-3]
+      CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+      // c[4,0-3]
+      CLIP_F32S_SSE(xmm8, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x4F_DISABLE:
+    ; // first entry of the label table; falls through to the stores: 5 rows of 4 floats, rs_c apart
+
+    _mm_storeu_ps(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm5);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm6);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm7);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm8);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x4)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x4F_DISABLE,
+              &&POST_OPS_BIAS_4x4F,
+              &&POST_OPS_RELU_4x4F,
+              &&POST_OPS_RELU_SCALE_4x4F,
+              &&POST_OPS_GELU_TANH_4x4F,
+              &&POST_OPS_GELU_ERF_4x4F,
+              &&POST_OPS_CLIP_4x4F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /*Declare the registers*/
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+    
+    /* zero the accumulator registers */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) 
+    
+    /*_mm_prefetch( (MR X NR) from C*/
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 16 elements from row0 of B*/
+        xmm0 = _mm_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast c0r0
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast c0r0
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast c0r0
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast c0r0
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm7)
+    }//betazero
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x4F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0-3]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+        // c[3,0-3]
+        xmm7 = _mm_add_ps( xmm7, xmm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0-3]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+        // c[3,0-3]
+        xmm7 = _mm_add_ps( xmm7, xmm3 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x4F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-3]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0-3]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      // c[3,0-3]
+      xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x4F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x4F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-3]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-3]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0-3]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[3,0-3]
+      GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x4F:
+    {
+      // c[0,0-3]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-3]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-3]
+      GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x4F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-3]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-3]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0-3]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      // c[3,0-3]
+      CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x4F_DISABLE:
+    ;
+
+    _mm_storeu_ps(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm5);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm6);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm7);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x4)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_3x4F_DISABLE,
+              &&POST_OPS_BIAS_3x4F,
+              &&POST_OPS_RELU_3x4F,
+              &&POST_OPS_RELU_SCALE_3x4F,
+              &&POST_OPS_GELU_TANH_3x4F,
+              &&POST_OPS_GELU_ERF_3x4F,
+              &&POST_OPS_CLIP_3x4F
+            };
+    // 3-row x 4-column f32 fringe tile: accumulate alpha*(A*B), optionally
+    // fold in beta*C, run the post-op chain, then store. k0 bounds the k loop.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: operands/scratch; xmm4-xmm6: one accumulator per C row */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+
+    /* zero the accumulators (xmm7 only fills the 4-register macro; never stored) */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) 
+
+    /* Prefetch the start of each of the 3 C rows */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 4 floats from the current row of B */
+        xmm0 = _mm_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A row 0, current col
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast A row 1, current col
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast A row 2, current col
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+
+    // Only fold in the existing C values when beta is non-zero.
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // F32_C_BNZ_4 presumably loads one row of C, scales it by beta (xmm3)
+        // and adds it to the accumulator; the store happens after the post-ops.
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm6)
+    }//beta != 0
+
+    // Post Ops (dispatch presumably happens inside the POST_OP_LABEL_* macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x4F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0-3]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is indexed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0-3]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x4F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-3]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0-3]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x4F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x4F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-3]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-3]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0-3]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x4F:
+    {
+      // c[0,0-3]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-3]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x4F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-3]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-3]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0-3]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x4F_DISABLE:
+    ;
+    // Write the 3 finished rows (4 floats each) back to C.
+    _mm_storeu_ps(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm5);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm6);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x4)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_2x4F_DISABLE,
+              &&POST_OPS_BIAS_2x4F,
+              &&POST_OPS_RELU_2x4F,
+              &&POST_OPS_RELU_SCALE_2x4F,
+              &&POST_OPS_GELU_TANH_2x4F,
+              &&POST_OPS_GELU_ERF_2x4F,
+              &&POST_OPS_CLIP_2x4F
+            };
+    // 2-row x 4-column f32 fringe tile: accumulate alpha*(A*B), optionally
+    // fold in beta*C, run the post-op chain, then store. k0 bounds the k loop.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: operands/scratch; xmm4-xmm5: one accumulator per C row */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5;
+
+    /* zero the accumulator registers */
+    xmm4 = _mm_setzero_ps();
+    xmm5 = _mm_setzero_ps();
+
+    /* Prefetch the start of each of the 2 C rows */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 4 floats from the current row of B */
+        xmm0 = _mm_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A row 0, current col
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast A row 1, current col
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    xmm4 = _mm_mul_ps(xmm4,xmm0);
+    xmm5 = _mm_mul_ps(xmm5,xmm0);
+
+    // Only fold in the existing C values when beta is non-zero.
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // F32_C_BNZ_4 presumably loads one row of C, scales it by beta (xmm3)
+        // and adds it to the accumulator; the store happens after the post-ops.
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_4(_cbuf,rs_c,xmm0,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm0,xmm3,xmm5)
+    }//beta != 0
+
+    // Post Ops (dispatch presumably happens inside the POST_OP_LABEL_* macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x4F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is indexed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x4F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-3]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x4F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x4F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-3]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-3]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x4F:
+    {
+      // c[0,0-3]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-3]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x4F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-3]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-3]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x4F_DISABLE:
+    ;
+    // Write the 2 finished rows (4 floats each) back to C.
+    _mm_storeu_ps(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_storeu_ps(cbuf, xmm5);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x4)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_1x4F_DISABLE,
+              &&POST_OPS_BIAS_1x4F,
+              &&POST_OPS_RELU_1x4F,
+              &&POST_OPS_RELU_SCALE_1x4F,
+              &&POST_OPS_GELU_TANH_1x4F,
+              &&POST_OPS_GELU_ERF_1x4F,
+              &&POST_OPS_CLIP_1x4F
+            };
+    // 1-row x 4-column f32 fringe tile: accumulate alpha*(A*B), optionally
+    // fold in beta*C, run the post-op chain, then store. k0 bounds the k loop.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    /* xmm0-xmm3: operands/scratch; xmm4: accumulator for the single C row */
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4;
+
+    /* zero the accumulator register */
+    xmm4 = _mm_setzero_ps();
+
+    /* Prefetch the start of the C row */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 4 floats from the current row of B */
+        xmm0 = _mm_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A row 0, current col
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    xmm4 = _mm_mul_ps(xmm4,xmm0);
+
+    // Only fold in the existing C values when beta is non-zero.
+    if ( beta != 0.0 )
+    {
+        // F32_C_BNZ_4 presumably loads the row of C, scales it by beta (xmm3)
+        // and adds it to the accumulator; the store happens after the post-ops.
+        xmm3 = _mm_broadcast_ss(&(beta));
+        F32_C_BNZ_4(cbuf,rs_c,xmm0,xmm3,xmm4)
+    }//beta != 0
+
+    // Post Ops (dispatch presumably happens inside the POST_OP_LABEL_* macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x4F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is indexed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+
+        // c[0,0-3]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x4F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x4F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-3]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x4F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-3]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x4F:
+    {
+      // c[0,0-3]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x4F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-3]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x4F_DISABLE:
+    ;
+    // Write the finished row (4 floats) back to C.
+    _mm_storeu_ps(cbuf, xmm4);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x2)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_5x2F_DISABLE,
+              &&POST_OPS_BIAS_5x2F,
+              &&POST_OPS_RELU_5x2F,
+              &&POST_OPS_RELU_SCALE_5x2F,
+              &&POST_OPS_GELU_TANH_5x2F,
+              &&POST_OPS_GELU_ERF_5x2F,
+              &&POST_OPS_CLIP_5x2F
+            };
+    // 5-row x 2-column f32 fringe tile: accumulate alpha*(A*B), optionally
+    // fold in beta*C, run the post-op chain, then store. k0 bounds the k loop.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: operands/scratch; xmm4-xmm8: per-row accumulators (low 2 lanes) */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+    __m128 xmm8;
+
+    /* zero the accumulator registers; the fifth row is zeroed directly */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) 
+    xmm8 = _mm_setzero_ps();
+
+    /* Prefetch the start of each of the 5 C rows */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 2 floats (64 bits) from the current row of B; upper lanes are 0 */
+        xmm0 = ( __m128 )_mm_load_sd((const double*)bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A row 0, current col
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast A row 1, current col
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast A row 2, current col
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast A row 3, current col
+        xmm2 = _mm_broadcast_ss((abuf + 4*rs_a)); //broadcast A row 4, current col
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+        xmm8 = _mm_fmadd_ps(xmm0, xmm2, xmm8);
+    }//kloop
+
+    // Scale by alpha. Row 4 is scaled directly so that no scratch register is
+    xmm0 = _mm_broadcast_ss(&(alpha)); // read before it has been written (k0 == 0).
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0) 
+    xmm8 = _mm_mul_ps(xmm8, xmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // F32_C_BNZ_2 presumably loads 2 floats of a C row, scales them by beta
+        // (xmm3) and adds them to the accumulator; the store happens later.
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm8)
+    }//beta != 0
+
+    // Post Ops (dispatch presumably happens inside the POST_OP_LABEL_* macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x2F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load just the 2 bias values of this tile (4-wide would over-read).
+        xmm0 = ( __m128 )_mm_load_sd( ( const double* )( ( float* )
+            post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ) );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0-1]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+        // c[3,0-1]
+        xmm7 = _mm_add_ps( xmm7, xmm0 );
+
+        // c[4,0-1]
+        xmm8 = _mm_add_ps( xmm8, xmm0 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is indexed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0-1]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+        // c[3,0-1]
+        xmm7 = _mm_add_ps( xmm7, xmm3 );
+
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 );
+
+        // c[4,0-1]
+        xmm8 = _mm_add_ps( xmm8, xmm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x2F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-1]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0-1]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      // c[3,0-1]
+      xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+      // c[4,0-1]
+      xmm8 = _mm_max_ps( xmm8, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x2F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      // c[4,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x2F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-1]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-1]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0-1]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[3,0-1]
+      GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[4,0-1]
+      GELU_TANH_F32S_SSE(xmm8, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x2F:
+    {
+      // c[0,0-1]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-1]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-1]
+      GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      // c[4,0-1]
+      GELU_ERF_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x2F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-1]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-1]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0-1]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      // c[3,0-1]
+      CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+      // c[4,0-1]
+      CLIP_F32S_SSE(xmm8, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x2F_DISABLE:
+    ;
+    // Write the low 64 bits (2 floats) of each of the 5 rows back to C.
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm4);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm5);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm6);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm7);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm8);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x2)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x2F_DISABLE,
+              &&POST_OPS_BIAS_4x2F,
+              &&POST_OPS_RELU_4x2F,
+              &&POST_OPS_RELU_SCALE_4x2F,
+              &&POST_OPS_GELU_TANH_4x2F,
+              &&POST_OPS_GELU_ERF_4x2F,
+              &&POST_OPS_CLIP_4x2F
+            };
+    // 4-row x 2-column f32 fringe tile: accumulate alpha*(A*B), optionally
+    // fold in beta*C, run the post-op chain, then store. k0 bounds the k loop.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: operands/scratch; xmm4-xmm7: per-row accumulators (low 2 lanes) */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) 
+
+    /* Prefetch the start of each of the 4 C rows */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 2 floats (64 bits) from the current row of B; upper lanes are 0 */
+        xmm0 = ( __m128 )_mm_load_sd((const double*)bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A row 0, current col
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast A row 1, current col
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast A row 2, current col
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast A row 3, current col
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // F32_C_BNZ_2 presumably loads 2 floats of a C row, scales them by beta
+        // (xmm3) and adds them to the accumulator; the store happens later.
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm7)
+    }//beta != 0
+
+    // Post Ops (dispatch presumably happens inside the POST_OP_LABEL_* macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x2F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load just the 2 bias values of this tile (4-wide would over-read).
+        xmm0 = ( __m128 )_mm_load_sd( ( const double* )( ( float* )
+            post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ) );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0-1]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+        // c[3,0-1]
+        xmm7 = _mm_add_ps( xmm7, xmm0 );
+      }
+      else
+      {
+        // If the original output was column major, then by the time the
+        // kernel sees it, the matrix is accessed as if it were
+        // transposed. Due to this the bias array is indexed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0-1]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+        // c[3,0-1]
+        xmm7 = _mm_add_ps( xmm7, xmm3 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x2F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-1]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0-1]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      // c[3,0-1]
+      xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x2F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x2F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-1]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-1]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0-1]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[3,0-1]
+      GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x2F:
+    {
+      // c[0,0-1]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-1]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0-1]
+      GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x2F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-1]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-1]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0-1]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      // c[3,0-1]
+      CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x2F_DISABLE:
+    ;
+    // Write the low 64 bits (2 floats) of each of the 4 rows back to C.
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm4);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm5);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm6);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm7);
+}
+
+// 3x2 fringe kernel: forms a 3-row by 2-column f32 tile of alpha*A*B (plus the beta*C
+// term when beta != 0), applies the post-op chain and writes 2 floats to each of 3 C rows.
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x2)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_3x2F_DISABLE,
+              &&POST_OPS_BIAS_3x2F,
+              &&POST_OPS_RELU_3x2F,
+              &&POST_OPS_RELU_SCALE_3x2F,
+              &&POST_OPS_GELU_TANH_3x2F,
+              &&POST_OPS_GELU_ERF_3x2F,
+              &&POST_OPS_CLIP_3x2F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: scratch; xmm4-xmm6: rows 0-2 of the tile; xmm7 only pads the 4-reg macros. */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7)
+    /* Prefetch the start of each of the 3 rows of C this kernel touches. */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 2 floats (64 bits) of the current row of B; the upper lanes are zeroed. */
+        xmm0 = ( __m128 )_mm_load_sd((const double*)bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast a[0,k]
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast a[1,k]
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast a[2,k]
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //F32_C_BNZ_2 is defined elsewhere; presumably it loads 2 floats of C,
+        //scales them by beta (xmm3) and adds them to the accumulator (xmm1 is scratch).
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm6)
+    }//betazero
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x2F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load only this tile's 2 bias values; a 128-bit load could read past the bias vector.
+        xmm0 = ( __m128 )_mm_load_sd( ( const double* )( ( float* )
+            post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ) );
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0-1]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0-1]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x2F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-1]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0-1]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x2F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x2F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-1]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-1]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0-1]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x2F:
+    {
+      // c[0,0-1]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0-1]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x2F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-1]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-1]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0-1]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x2F_DISABLE:
+    ;
+    // Store the low 2 floats (64 bits) of each accumulator to its row of C.
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm4);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm5);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm6);
+}
+
+// 2x2 fringe kernel: forms a 2-row by 2-column f32 tile of alpha*A*B (plus the beta*C
+// term when beta != 0), applies the post-op chain and writes 2 floats to each of 2 C rows.
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x2)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_2x2F_DISABLE,
+              &&POST_OPS_BIAS_2x2F,
+              &&POST_OPS_RELU_2x2F,
+              &&POST_OPS_RELU_SCALE_2x2F,
+              &&POST_OPS_GELU_TANH_2x2F,
+              &&POST_OPS_GELU_ERF_2x2F,
+              &&POST_OPS_CLIP_2x2F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: scratch; xmm4-xmm5: rows 0-1 of the tile. */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5;
+
+    /* zero the accumulator registers */
+    xmm4 = _mm_setzero_ps();
+    xmm5 = _mm_setzero_ps();
+    /* Prefetch the start of each of the 2 rows of C this kernel touches. */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 2 floats (64 bits) of the current row of B; the upper lanes are zeroed. */
+        xmm0 = ( __m128 )_mm_load_sd((const double*)bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast a[0,k]
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast a[1,k]
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    xmm4 = _mm_mul_ps(xmm4,xmm0);
+    xmm5 = _mm_mul_ps(xmm5,xmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //F32_C_BNZ_2 is defined elsewhere; presumably it loads 2 floats of C,
+        //scales them by beta (xmm3) and adds them to the accumulator (xmm0 is scratch).
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_2(_cbuf,rs_c,xmm0,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm0,xmm3,xmm5)
+    }//betazero
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x2F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load only this tile's 2 bias values; a 128-bit load could read past the bias vector.
+        xmm0 = ( __m128 )_mm_load_sd( ( const double* )( ( float* )
+            post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ) );
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0-1]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x2F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0-1]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x2F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x2F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-1]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0-1]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x2F:
+    {
+      // c[0,0-1]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0-1]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x2F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-1]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0-1]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x2F_DISABLE:
+    ;
+    // Store the low 2 floats (64 bits) of each accumulator to its row of C.
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm4);
+    cbuf += rs_c;
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm5);
+}
+
+// 1x2 fringe kernel: forms a 1-row by 2-column f32 tile of alpha*A*B (plus the beta*C
+// term when beta != 0), applies the post-op chain and writes 2 floats to one row of C.
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x2)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_1x2F_DISABLE,
+              &&POST_OPS_BIAS_1x2F,
+              &&POST_OPS_RELU_1x2F,
+              &&POST_OPS_RELU_SCALE_1x2F,
+              &&POST_OPS_GELU_TANH_1x2F,
+              &&POST_OPS_GELU_ERF_1x2F,
+              &&POST_OPS_CLIP_1x2F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    /* xmm0-xmm3: scratch; xmm4: the single row of the tile. */
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4;
+
+    /* zero the accumulator registers */
+    xmm4 = _mm_setzero_ps();
+    /* Prefetch the start of the row of C this kernel touches. */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 2 floats (64 bits) of the current row of B; the upper lanes are zeroed. */
+        xmm0 = ( __m128 )_mm_load_sd((const double*)bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast a[0,k]
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    xmm4 = _mm_mul_ps(xmm4,xmm0);
+
+    if ( beta != 0.0 )
+    {
+        //F32_C_BNZ_2 is defined elsewhere; presumably it loads 2 floats of C,
+        //scales them by beta (xmm3) and adds them to the accumulator (xmm0 is scratch).
+        xmm3 = _mm_broadcast_ss(&(beta));
+        F32_C_BNZ_2(cbuf,rs_c,xmm0,xmm3,xmm4)
+    }//betazero
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x2F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load only this tile's 2 bias values; a 128-bit load could read past the bias vector.
+        xmm0 = ( __m128 )_mm_load_sd( ( const double* )( ( float* )
+            post_ops_list_temp->op_args1 + post_ops_attr.post_op_c_j ) );
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+
+        // c[0,0-1]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x2F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x2F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0-1]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x2F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0-1]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x2F:
+    {
+      // c[0,0-1]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x2F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0-1]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x2F_DISABLE:
+    ;
+    // Store the low 2 floats (64 bits) of the accumulator to the row of C.
+    _mm_store_sd((double*)cbuf, ( __m128d )xmm4);
+}
+
+// 5x1 fringe kernel: forms a 5-row by 1-column f32 tile of alpha*A*B (plus the beta*C
+// term when beta != 0), applies the post-op chain and writes 1 float to each of 5 C rows.
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_5x1)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_5x1F_DISABLE,
+              &&POST_OPS_BIAS_5x1F,
+              &&POST_OPS_RELU_5x1F,
+              &&POST_OPS_RELU_SCALE_5x1F,
+              &&POST_OPS_GELU_TANH_5x1F,
+              &&POST_OPS_GELU_ERF_5x1F,
+              &&POST_OPS_CLIP_5x1F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: scratch; xmm4-xmm8: rows 0-4 of the tile; xmm9 is never stored. */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+    __m128 xmm8, xmm9;
+
+    /* zero the accumulators (xmm9, xmm0, xmm1 only pad the 4-register macro) */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7)
+    ZERO_ACC_XMM_4_REG(xmm8,xmm9,xmm0,xmm1)
+    /* Prefetch the start of each of the 5 rows of C this kernel touches. */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 1 float of the current row of B into lane 0; the other lanes are zeroed. */
+        xmm0 = _mm_load_ss( bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast a[0,k]
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast a[1,k]
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast a[2,k]
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast a[3,k]
+        xmm2 = _mm_broadcast_ss((abuf + 4*rs_a)); //broadcast a[4,k]
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+        xmm8 = _mm_fmadd_ps(xmm0, xmm2, xmm8);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+    ALPHA_MUL_ACC_XMM_4_REG(xmm8,xmm9,xmm2,xmm3,xmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //F32_C_BNZ_1 is defined elsewhere; presumably it loads 1 float of C,
+        //scales it by beta (xmm3) and adds it to the accumulator (xmm1 is scratch).
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm8)
+    }//betazero
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x1F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load only this tile's single bias value; a 128-bit load could read past the bias vector.
+        xmm0 = _mm_load_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+        // c[3,0]
+        xmm7 = _mm_add_ps( xmm7, xmm0 );
+
+        // c[4,0]
+        xmm8 = _mm_add_ps( xmm8, xmm0 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+        // c[3,0]
+        xmm7 = _mm_add_ps( xmm7, xmm3 );
+
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 );
+
+        // c[4,0]
+        xmm8 = _mm_add_ps( xmm8, xmm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x1F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      // c[3,0]
+      xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+      // c[4,0]
+      xmm8 = _mm_max_ps( xmm8, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x1F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0]
+      RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      // c[4,0]
+      RELU_SCALE_OP_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x1F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[3,0]
+      GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[4,0]
+      GELU_TANH_F32S_SSE(xmm8, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x1F:
+    {
+      // c[0,0]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0]
+      GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      // c[4,0]
+      GELU_ERF_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x1F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      // c[3,0]
+      CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+      // c[4,0]
+      CLIP_F32S_SSE(xmm8, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x1F_DISABLE:
+    ;
+    // Store lane 0 of each accumulator to its row of C.
+    _mm_store_ss(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm5);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm6);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm7);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm8);
+}
+
+// 4x1 fringe kernel: forms a 4-row by 1-column f32 tile of alpha*A*B (plus the beta*C
+// term when beta != 0), applies the post-op chain and writes 1 float to each of 4 C rows.
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_4x1)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x1F_DISABLE,
+              &&POST_OPS_BIAS_4x1F,
+              &&POST_OPS_RELU_4x1F,
+              &&POST_OPS_RELU_SCALE_4x1F,
+              &&POST_OPS_GELU_TANH_4x1F,
+              &&POST_OPS_GELU_ERF_4x1F,
+              &&POST_OPS_CLIP_4x1F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /* xmm0-xmm3: scratch; xmm4-xmm7: rows 0-3 of the tile. */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7)
+    /* Prefetch the start of each of the 4 rows of C this kernel touches. */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load 1 float of the current row of B into lane 0; the other lanes are zeroed. */
+        xmm0 = _mm_load_ss( bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast a[0,k]
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast a[1,k]
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast a[2,k]
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast a[3,k]
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //F32_C_BNZ_1 is defined elsewhere; presumably it loads 1 float of C,
+        //scales it by beta (xmm3) and adds it to the accumulator (xmm1 is scratch).
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm7)
+    }//betazero
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x1F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Load only this tile's single bias value; a 128-bit load could read past the bias vector.
+        xmm0 = _mm_load_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+        // c[3,0]
+        xmm7 = _mm_add_ps( xmm7, xmm0 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+        xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+        // c[3,0]
+        xmm7 = _mm_add_ps( xmm7, xmm3 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x1F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      // c[3,0]
+      xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x1F:
+    {
+      xmm0 =
+        _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0]
+      RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x1F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[3,0]
+      GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x1F:
+    {
+      // c[0,0]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      // c[3,0]
+      GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x1F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      // c[3,0]
+      CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x1F_DISABLE:
+    ;
+    // Store lane 0 of each accumulator to its row of C.
+    _mm_store_ss(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm5);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm6);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm7);
+}
+
+/* 3x1 fringe kernel: computes C[0:3,0] = alpha*(A*B) + beta*C for one column
+   of C, then runs the post-op chain. Only lane 0 of each accumulator is
+   stored, so lanes 1-3 carry scratch values throughout. */
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_3x1)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_3x1F_DISABLE,
+              &&POST_OPS_BIAS_3x1F,
+              &&POST_OPS_RELU_3x1F,
+              &&POST_OPS_RELU_SCALE_3x1F,
+              &&POST_OPS_GELU_TANH_3x1F,
+              &&POST_OPS_GELU_ERF_3x1F,
+              &&POST_OPS_CLIP_3x1F
+            };
+    // Fixed-width local copy of k0, in case dim_t differs in size from uint64_t.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /*Declare the registers*/
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+
+    /* zero the accumulator registers (xmm7 is cleared but never stored) */
+    ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7)
+
+    /* Prefetch the three rows of C this kernel updates */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+
+    for(uint64_t k = 0; k < k_iter; k++)
+    {
+        /* Load one element from the current row of B (lanes 1-3 are zeroed) */
+        xmm0 = _mm_load_ss( bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A element of row 0
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast A element of row 1
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast A element of row 2
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // accumulate beta * C into each row's accumulator (stored at the end)
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm6)
+    }//betazero
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x1F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Single column: read exactly one bias value, never past bias[j].
+        xmm0 = _mm_load_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+        // c[2,0]
+        xmm6 = _mm_add_ps( xmm6, xmm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+        xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+        // c[2,0]
+        xmm6 = _mm_add_ps( xmm6, xmm2 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x1F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      // c[2,0]
+      xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x1F:
+    {
+      xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0]
+      RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x1F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[2,0]
+      GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x1F:
+    {
+      // c[0,0]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      // c[2,0]
+      GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x1F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      // c[2,0]
+      CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x1F_DISABLE:
+    ;
+
+    _mm_store_ss(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm5);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm6);
+}
+
+/* 2x1 fringe kernel: computes C[0:2,0] = alpha*(A*B) + beta*C for one column
+   of C, then runs the post-op chain. Only lane 0 of each accumulator is
+   stored, so lanes 1-3 carry scratch values throughout. */
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_2x1)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_2x1F_DISABLE,
+              &&POST_OPS_BIAS_2x1F,
+              &&POST_OPS_RELU_2x1F,
+              &&POST_OPS_RELU_SCALE_2x1F,
+              &&POST_OPS_GELU_TANH_2x1F,
+              &&POST_OPS_GELU_ERF_2x1F,
+              &&POST_OPS_CLIP_2x1F
+            };
+    // Fixed-width local copy of k0, in case dim_t differs in size from uint64_t.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    /*Declare the registers*/
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5;
+
+    /* zero the accumulator registers */
+    xmm4 = _mm_setzero_ps();
+    xmm5 = _mm_setzero_ps();
+
+    /* Prefetch the two rows of C this kernel updates */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+    _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+
+    for(uint64_t k = 0; k < k_iter; k++)
+    {
+        /* Load one element from the current row of B (lanes 1-3 are zeroed) */
+        xmm0 = _mm_load_ss( bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A element of row 0
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast A element of row 1
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    xmm4 = _mm_mul_ps(xmm4,xmm0);
+    xmm5 = _mm_mul_ps(xmm5,xmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // accumulate beta * C into each row's accumulator (stored at the end)
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_1(_cbuf,rs_c,xmm0,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm0,xmm3,xmm5)
+    }//betazero
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x1F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Single column: read exactly one bias value, never past bias[j].
+        xmm0 = _mm_load_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+        xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_add_ps( xmm5, xmm1 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x1F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      // c[1,0]
+      xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x1F:
+    {
+      xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x1F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      // c[1,0]
+      GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x1F:
+    {
+      // c[0,0]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      // c[1,0]
+      GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x1F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      // c[1,0]
+      CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x1F_DISABLE:
+    ;
+
+    _mm_store_ss(cbuf, xmm4);
+    cbuf += rs_c;
+    _mm_store_ss(cbuf, xmm5);
+}
+
+/* 1x1 fringe kernel: computes C[0,0] = alpha*(A*B) + beta*C for a single
+   element of C, then runs the post-op chain. Only lane 0 of the accumulator
+   is stored, so lanes 1-3 carry scratch values throughout. */
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_1x1)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_1x1F_DISABLE,
+              &&POST_OPS_BIAS_1x1F,
+              &&POST_OPS_RELU_1x1F,
+              &&POST_OPS_RELU_SCALE_1x1F,
+              &&POST_OPS_GELU_TANH_1x1F,
+              &&POST_OPS_GELU_ERF_1x1F,
+              &&POST_OPS_CLIP_1x1F
+            };
+    // Fixed-width local copy of k0, in case dim_t differs in size from uint64_t.
+    uint64_t k_iter = (uint64_t)k0;
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    /*Declare the registers*/
+    __m128 xmm0, xmm1, xmm2, xmm3, xmm4;
+
+    /* zero the accumulator register */
+    xmm4 = _mm_setzero_ps();
+
+    /* Prefetch the row of C this kernel updates */
+    _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+
+    for(uint64_t k = 0; k < k_iter; k++)
+    {
+        /* Load one element from the current row of B (lanes 1-3 are zeroed) */
+        xmm0 = _mm_load_ss( bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast A element of row 0
+        abuf += cs_a;  //move a pointer to next col
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+    }//kloop
+
+    xmm0 = _mm_broadcast_ss(&(alpha));
+    xmm4 = _mm_mul_ps(xmm4,xmm0);
+
+    if ( beta != 0.0 )
+    {
+        // accumulate beta * C into the accumulator (stored at the end)
+        xmm3 = _mm_broadcast_ss(&(beta));
+        F32_C_BNZ_1(cbuf,rs_c,xmm0,xmm3,xmm4)
+    }//betazero
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x1F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        // Single column: read exactly one bias value, never past bias[j].
+        xmm0 = _mm_load_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 );
+
+        // c[0,0]
+        xmm4 = _mm_add_ps( xmm4, xmm0 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x1F:
+    {
+      xmm0 = _mm_setzero_ps();
+
+      // c[0,0]
+      xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x1F:
+    {
+      xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_setzero_ps();
+
+      // c[0,0]
+      RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x1F:
+    {
+      __m128 dn, x_tanh;
+      __m128i q;
+
+      // c[0,0]
+      GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x1F:
+    {
+      // c[0,0]
+      GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x1F:
+    {
+      xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0,0]
+      CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x1F_DISABLE:
+    ;
+
+    _mm_store_ss(cbuf, xmm4);
+}
+#endif
diff --git a/kernels/zen/lpgemm/f32f32f32/lpgemm_kernel_macros_f32_avx2.h b/kernels/zen/lpgemm/f32f32f32/lpgemm_kernel_macros_f32_avx2.h
new file mode 100644
index 0000000000..8fbdd78a8b
--- /dev/null
+++ b/kernels/zen/lpgemm/f32f32f32/lpgemm_kernel_macros_f32_avx2.h
@@ -0,0 +1,131 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+  Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_F32_SGEMM_AVX2_KERN_MACROS_H
+#define LPGEMM_F32_SGEMM_AVX2_KERN_MACROS_H
+
+#include "../gelu_avx2.h"
+#include "../math_utils_avx2.h"
+
+/* Parametric ReLU: f(x) = x for x > 0, scale*x for x <= 0; zreg is expected to hold zeros */
+#define RELU_SCALE_OP_F32S_AVX2(reg, scale, zreg, scratch2) \
+     scratch2 = _mm256_min_ps( reg, zreg ); /* negative part, 0 elsewhere */\
+     reg = _mm256_max_ps( reg, zreg ); /* positive part, 0 elsewhere */\
+     scratch2 = _mm256_mul_ps( scratch2, scale ); /* 0*scale is -0.0 when scale < 0 */\
+     reg = _mm256_add_ps( reg, scratch2 ); /* add, not OR: a -0.0 must not flip signs */
+
+/* Parametric ReLU: f(x) = x for x > 0, scale*x for x <= 0; zreg is expected to hold zeros */
+#define RELU_SCALE_OP_F32S_SSE(reg, scale, zreg, scratch2) \
+     scratch2 = _mm_min_ps( reg, zreg ); /* negative part, 0 elsewhere */\
+     reg = _mm_max_ps( reg, zreg ); /* positive part, 0 elsewhere */\
+     scratch2 = _mm_mul_ps( scratch2, scale ); /* 0*scale is -0.0 when scale < 0 */\
+     reg = _mm_add_ps( reg, scratch2 ); /* add, not OR: a -0.0 must not flip signs */
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) )
+   Thin alias: every argument is forwarded unchanged to GELU_TANH_F32_AVX2_DEF. */
+#define GELU_TANH_F32S_AVX2(reg, r, r2, x, z, dn, x_tanh, q) \
+      GELU_TANH_F32_AVX2_DEF(reg, r, r2, x, z, dn, x_tanh, q);
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) )
+   Thin alias: every argument is forwarded unchanged to GELU_TANH_F32_SSE_DEF. */
+#define GELU_TANH_F32S_SSE(reg, r, r2, x, z, dn, x_tanh, q) \
+      GELU_TANH_F32_SSE_DEF(reg, r, r2, x, z, dn, x_tanh, q);
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 ))
+   Thin alias: every argument is forwarded unchanged to GELU_ERF_F32_AVX2_DEF. */
+#define GELU_ERF_F32S_AVX2(reg, r, x, x_erf) \
+      GELU_ERF_F32_AVX2_DEF(reg, r, x, x_erf);
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 ))
+   Thin alias: every argument is forwarded unchanged to GELU_ERF_F32_SSE_DEF. */
+#define GELU_ERF_F32S_SSE(reg, r, x, x_erf) \
+      GELU_ERF_F32_SSE_DEF(reg, r, x, x_erf);
+
+/* Clamp reg lane-wise in place: raise to at least lo, then cap at hi */
+#define CLIP_F32S_AVX2(reg, lo, hi) \
+      reg = _mm256_max_ps( reg, lo ); reg = _mm256_min_ps( reg, hi );
+
+/* Clamp reg lane-wise in place: raise to at least lo, then cap at hi */
+#define CLIP_F32S_SSE(reg, lo, hi) \
+      reg = _mm_max_ps( reg, lo ); reg = _mm_min_ps( reg, hi );
+
+/* Reset four YMM accumulators to 0.0f.
+   The expansion already ends in ';', so it is used both with and
+   without a trailing semicolon at the call site. */
+#define ZERO_ACC_YMM_4_REG(acc0,acc1,acc2,acc3) \
+      acc0 = _mm256_setzero_ps(); \
+      acc1 = acc2 = acc3 = acc0;
+
+/* Reset four XMM accumulators to 0.0f.
+   The expansion already ends in ';', so no trailing semicolon is
+   needed at the call site. */
+#define ZERO_ACC_XMM_4_REG(acc0,acc1,acc2,acc3) \
+      acc0 = _mm_setzero_ps(); \
+      acc1 = acc2 = acc3 = acc0;
+ 
+/* Scale four YMM accumulators in place by 'factor' (call sites pass broadcast alpha) */
+#define ALPHA_MUL_ACC_YMM_4_REG(acc0,acc1,acc2,acc3,factor) \
+      acc0 = _mm256_mul_ps(acc0,factor); \
+      acc1 = _mm256_mul_ps(acc1,factor); \
+      acc2 = _mm256_mul_ps(acc2,factor); \
+      acc3 = _mm256_mul_ps(acc3,factor);
+
+/* Scale four XMM accumulators in place by 'factor' (call sites pass broadcast alpha) */
+#define ALPHA_MUL_ACC_XMM_4_REG(acc0,acc1,acc2,acc3,factor) \
+      acc0 = _mm_mul_ps(acc0,factor); \
+      acc1 = _mm_mul_ps(acc1,factor); \
+      acc2 = _mm_mul_ps(acc2,factor); \
+      acc3 = _mm_mul_ps(acc3,factor);
+ 
+/* ymm2 += beta * C: unaligned load of 8 floats from cbuf; nothing is stored, rs_c is unused */
+#define F32_C_BNZ_8(cbuf,rs_c,ymm0,beta,ymm2) \
+      ymm0 = _mm256_loadu_ps(cbuf); /* C rows carry no 32-byte alignment guarantee */ \
+      ymm2 = _mm256_fmadd_ps(ymm0, beta, ymm2);
+
+/* xmm2 += beta * C: unaligned load of 4 floats from cbuf; nothing is stored, rs_c is unused */
+#define F32_C_BNZ_4(cbuf,rs_c,xmm0,beta,xmm2) \
+      xmm0 = _mm_loadu_ps(cbuf); /* C rows carry no 16-byte alignment guarantee */ \
+      xmm2 = _mm_fmadd_ps(xmm0, beta, xmm2);
+
+/* xmm2 += beta * C for 2 floats (read as one 64-bit load); nothing is stored, rs_c is unused */
+#define F32_C_BNZ_2(cbuf,rs_c,xmm0,beta,xmm2) \
+      xmm0 = _mm_castpd_ps(_mm_load_sd((const double*)(cbuf))); /* (cbuf): cast after any pointer math */ \
+      xmm2 = _mm_fmadd_ps(xmm0, beta, xmm2);
+
+/* acc += beta * C for one float: c_val receives the element at cbuf; rs_c is unused */
+#define F32_C_BNZ_1(cbuf,rs_c,c_val,beta,acc) \
+      c_val = _mm_load_ss(cbuf); \
+      acc = _mm_fmadd_ps(c_val, beta, acc);
+
+#endif //LPGEMM_F32_SGEMM_AVX2_KERN_MACROS_H
diff --git a/kernels/zen/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx2.c b/kernels/zen/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx2.c
new file mode 100644
index 0000000000..a142a0fb3a
--- /dev/null
+++ b/kernels/zen/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx2.c
@@ -0,0 +1,1992 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+  Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "xmmintrin.h"
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_kernel_macros_f32_avx2.h"
+
+#define MR 6  /* rows of C per main-loop step; the C row offset is m * MR * rs_c */
+#define NR 16 /* full kernel width in columns; a nonzero n0 % NR dispatches the narrower kernels */
+
+LPGEMM_MAIN_KERN(float,float,float,f32f32f32of32_6x16m)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_6x16F_DISABLE,
+              &&POST_OPS_BIAS_6x16F,
+              &&POST_OPS_RELU_6x16F,
+              &&POST_OPS_RELU_SCALE_6x16F,
+              &&POST_OPS_GELU_TANH_6x16F,
+              &&POST_OPS_GELU_ERF_6x16F,
+              &&POST_OPS_CLIP_6x16F
+            };
+    uint64_t n_left = n0 % NR;  //n0 is expected to be n0<=NR
+
+    // First check whether this is a edge case in the n dimension.
+    // If so, dispatch other 6x?m kernels, as needed.
+    if (n_left )
+    {
+        float*  cij = (float* )c;
+        float*  bj  = (float* )b;
+        float*  ai  = (float* )a;
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+
+            lpgemm_rowvar_f32f32f32of32_6x8m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 8;
+        }
+  
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+
+            lpgemm_rowvar_f32f32f32of32_6x4m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 4;
+        }
+        
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+  
+            lpgemm_rowvar_f32f32f32of32_6x2m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 2;
+        }
+        
+        if ( 1 == n_left )
+        {    
+            lpgemm_rowvar_f32f32f32of32_6x1m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+        }
+        
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    uint64_t m_iter = (uint64_t)m0 / 6;
+    uint64_t m_left = (uint64_t)m0 % 6;
+
+    if ( m_iter == 0 ){    goto consider_edge_cases; }
+
+    /*Declare the registers*/
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm5, ymm6, ymm7;
+    __m256 ymm8, ymm9, ymm10, ymm11;
+    __m256 ymm12, ymm13, ymm14, ymm15;
+
+    /*Produce MRxNR outputs */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_YMM_4_REG(ymm4, ymm5, ymm6, ymm7);
+      ZERO_ACC_YMM_4_REG(ymm8,  ymm9,  ymm10, ymm11);
+      ZERO_ACC_YMM_4_REG(ymm12, ymm13, ymm14, ymm15);
+
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // Move to next MRxKC in MCxKC (where MC>=MR)
+      bbuf = (float *)b;  //Same KCxNR panel is used across MCxKC block 
+      cbuf = (float *)c + m * MR * rs_c; // Move to next MRXNR in output
+      
+      /*_mm_prefetch( (MR X NR) from C*/
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 5*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /*Load 16 elements from row0 of B*/
+        ymm0 = _mm256_loadu_ps(bbuf );
+        ymm1 = _mm256_loadu_ps(bbuf + 8);
+        bbuf += rs_b;  //move b pointer to next row
+
+        ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); //broadcast c0r0
+        ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); //broadcast c0r1  
+
+        ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+        ymm5 = _mm256_fmadd_ps(ymm1, ymm2, ymm5);
+        ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+        ymm7 = _mm256_fmadd_ps(ymm1, ymm3, ymm7);
+
+        ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); //broadcast c0r2 
+        ymm3 = _mm256_broadcast_ss((abuf + 3*rs_a)); //broadcast c0r3
+
+        ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+        ymm9 = _mm256_fmadd_ps(ymm1, ymm2, ymm9);
+        ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+        ymm11 = _mm256_fmadd_ps(ymm1, ymm3, ymm11);
+
+        ymm2 = _mm256_broadcast_ss((abuf + 4*rs_a)); //broadcast c0r4
+        ymm3 = _mm256_broadcast_ss((abuf + 5*rs_a)); //broadcast c0r5        
+        abuf += cs_a;  //move a pointer to next col
+        
+        ymm12 = _mm256_fmadd_ps(ymm0, ymm2, ymm12);
+        ymm13 = _mm256_fmadd_ps(ymm1, ymm2, ymm13);
+        ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14);
+        ymm15 = _mm256_fmadd_ps(ymm1, ymm3, ymm15);    
+      }//kloop
+
+      ymm0 = _mm256_broadcast_ss(&(alpha));
+      ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm5,ymm6,ymm7,ymm0)
+      ALPHA_MUL_ACC_YMM_4_REG(ymm8,ymm9,ymm10,ymm11,ymm0)
+      ALPHA_MUL_ACC_YMM_4_REG(ymm12,ymm13,ymm14,ymm15,ymm0)
+
+      if ( beta != 0.0 )
+      {
+        _cbuf = cbuf;
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        ymm3 = _mm256_broadcast_ss(&(beta));
+
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+        F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+        F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+        F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm9)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm10)
+        F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm11)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm12)
+        F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm13)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm14)
+        F32_C_BNZ_8(_cbuf+8,rs_c,ymm1,ymm3,ymm15)
+      }
+
+      // Post Ops
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x16F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+          ymm1 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 1 * 8 ) );
+
+          // c[0,0-7]
+          ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+          // c[0,8-15]
+          ymm5 = _mm256_add_ps( ymm5, ymm1 );
+
+          // c[1,0-7]
+          ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+          // c[1,8-15]
+          ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+          // c[2,0-7]
+          ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+          // c[2,8-15]
+          ymm9 = _mm256_add_ps( ymm9, ymm1 );
+
+          // c[3,0-7]
+          ymm10 = _mm256_add_ps( ymm10, ymm0 );
+
+          // c[3,8-15]
+          ymm11 = _mm256_add_ps( ymm11, ymm1 );
+
+          // c[4,0-7]
+          ymm12 = _mm256_add_ps( ymm12, ymm0 );
+
+          // c[4,8-15]
+          ymm13 = _mm256_add_ps( ymm13, ymm1 );
+
+          // c[5,0-7]
+          ymm14 = _mm256_add_ps( ymm14, ymm0 );
+
+          // c[5,8-15]
+          ymm15 = _mm256_add_ps( ymm15, ymm1 );
+        }
+        else
+        {
+          // If original output was columns major, then by the time
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 );
+          ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 );
+          ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 );
+          ymm3 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 );
+
+          // c[0,0-7]
+          ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+          // c[0,8-15]
+          ymm5 = _mm256_add_ps( ymm5, ymm0 );
+
+          // c[1,0-7]
+          ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+          // c[1,8-15]
+          ymm7 = _mm256_add_ps( ymm7, ymm1 );
+
+          // c[2,0-7]
+          ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+          // c[2,8-15]
+          ymm9 = _mm256_add_ps( ymm9, ymm2 );
+
+          // c[3,0-7]
+          ymm10 = _mm256_add_ps( ymm10, ymm3 );
+
+          // c[3,8-15]
+          ymm11 = _mm256_add_ps( ymm11, ymm3 );
+
+          ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 );
+          ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 );
+
+          // c[4,0-7]
+          ymm12 = _mm256_add_ps( ymm12, ymm0 );
+
+          // c[4,8-15]
+          ymm13 = _mm256_add_ps( ymm13, ymm0 );
+
+          // c[5,0-7]
+          ymm14 = _mm256_add_ps( ymm14, ymm1 );
+
+          // c[5,8-15]
+          ymm15 = _mm256_add_ps( ymm15, ymm1 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x16F:
+      {
+        ymm0 = _mm256_setzero_ps();
+
+        // c[0,0-7]
+        ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+        // c[0,8-15]
+        ymm5 = _mm256_max_ps( ymm5, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+        // c[1,8-15]
+        ymm7 = _mm256_max_ps( ymm7, ymm0 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+        // c[2,8-15]
+        ymm9 = _mm256_max_ps( ymm9, ymm0 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_max_ps( ymm10, ymm0 );
+
+        // c[3,8-15]
+        ymm11 = _mm256_max_ps( ymm11, ymm0 );
+
+        // c[4,0-7]
+        ymm12 = _mm256_max_ps( ymm12, ymm0 );
+
+        // c[4,8-15]
+        ymm13 = _mm256_max_ps( ymm13, ymm0 );
+
+        // c[5,0-7]
+        ymm14 = _mm256_max_ps( ymm14, ymm0 );
+
+        // c[5,8-15]
+        ymm15 = _mm256_max_ps( ymm15, ymm0 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x16F:
+      {
+        ymm0 =
+          _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+        ymm1 = _mm256_setzero_ps();
+
+        // c[0,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+        // c[0,8-15]
+        RELU_SCALE_OP_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+        // c[1,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+        // c[1,8-15]
+        RELU_SCALE_OP_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+        // c[2,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+        // c[2,8-15]
+        RELU_SCALE_OP_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+        // c[3,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+        // c[3,8-15]
+        RELU_SCALE_OP_F32S_AVX2(ymm11, ymm0, ymm1, ymm2)
+
+        // c[4,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+        // c[4,8-15]
+        RELU_SCALE_OP_F32S_AVX2(ymm13, ymm0, ymm1, ymm2)
+
+        // c[5,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm14, ymm0, ymm1, ymm2)
+
+        // c[5,8-15]
+        RELU_SCALE_OP_F32S_AVX2(ymm15, ymm0, ymm1, ymm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x16F:
+      {
+        __m256 dn, x_tanh;
+        __m256i q;
+
+        // c[0,0-7]
+        GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[0,8-15]
+        GELU_TANH_F32S_AVX2(ymm5, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[1,0-7]
+        GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[1,8-15]
+        GELU_TANH_F32S_AVX2(ymm7, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[2,0-7]
+        GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[2,8-15]
+        GELU_TANH_F32S_AVX2(ymm9, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[3,0-7]
+        GELU_TANH_F32S_AVX2(ymm10, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[3,8-15]
+        GELU_TANH_F32S_AVX2(ymm11, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[4,0-7]
+        GELU_TANH_F32S_AVX2(ymm12, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[4,8-15]
+        GELU_TANH_F32S_AVX2(ymm13, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[5,0-7]
+        GELU_TANH_F32S_AVX2(ymm14, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[5,8-15]
+        GELU_TANH_F32S_AVX2(ymm15, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x16F:
+      {
+        // c[0,0-7]
+        GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+        // c[0,8-15]
+        GELU_ERF_F32S_AVX2(ymm5, ymm0, ymm1, ymm2)
+
+        // c[1,0-7]
+        GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+        // c[1,8-15]
+        GELU_ERF_F32S_AVX2(ymm7, ymm0, ymm1, ymm2)
+
+        // c[2,0-7]
+        GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+        // c[2,8-15]
+        GELU_ERF_F32S_AVX2(ymm9, ymm0, ymm1, ymm2)
+
+        // c[3,0-7]
+        GELU_ERF_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+        // c[3,8-15]
+        GELU_ERF_F32S_AVX2(ymm11, ymm0, ymm1, ymm2)
+
+        // c[4,0-7]
+        GELU_ERF_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+        // c[4,8-15]
+        GELU_ERF_F32S_AVX2(ymm13, ymm0, ymm1, ymm2)
+
+        // c[5,0-7]
+        GELU_ERF_F32S_AVX2(ymm14, ymm0, ymm1, ymm2)
+
+        // c[5,8-15]
+        GELU_ERF_F32S_AVX2(ymm15, ymm0, ymm1, ymm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x16F:
+      {
+        ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+        ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+        // c[0,0-7]
+        CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+        // c[0,8-15]
+        CLIP_F32S_AVX2(ymm5, ymm0, ymm1)
+
+        // c[1,0-7]
+        CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+        // c[1,8-15]
+        CLIP_F32S_AVX2(ymm7, ymm0, ymm1)
+
+        // c[2,0-7]
+        CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+        // c[2,8-15]
+        CLIP_F32S_AVX2(ymm9, ymm0, ymm1)
+
+        // c[3,0-7]
+        CLIP_F32S_AVX2(ymm10, ymm0, ymm1)
+
+        // c[3,8-15]
+        CLIP_F32S_AVX2(ymm11, ymm0, ymm1)
+
+        // c[4,0-7]
+        CLIP_F32S_AVX2(ymm12, ymm0, ymm1)
+
+        // c[4,8-15]
+        CLIP_F32S_AVX2(ymm13, ymm0, ymm1)
+
+        // c[5,0-7]
+        CLIP_F32S_AVX2(ymm14, ymm0, ymm1)
+
+        // c[5,8-15]
+        CLIP_F32S_AVX2(ymm15, ymm0, ymm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x16F_DISABLE:
+      ;
+
+      _mm256_storeu_ps(cbuf, ymm4); 
+      _mm256_storeu_ps(cbuf + 8, ymm5);
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm6); 
+      _mm256_storeu_ps(cbuf + 8, ymm7);
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm8); 
+      _mm256_storeu_ps(cbuf + 8, ymm9);
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm10); 
+      _mm256_storeu_ps(cbuf + 8, ymm11);
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm12); 
+      _mm256_storeu_ps(cbuf + 8, ymm13);
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm14); 
+      _mm256_storeu_ps(cbuf + 8, ymm15);
+
+      post_ops_attr.post_op_c_i += MR;
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] =
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_1x16,
+          lpgemm_rowvar_f32f32f32of32_2x16,
+          lpgemm_rowvar_f32f32f32of32_3x16,
+          lpgemm_rowvar_f32f32f32of32_4x16,
+          lpgemm_rowvar_f32f32f32of32_5x16
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij, rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr
+        );
+        return;
+    }
+}
+
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x8m)
+{ // f32 kernel for an 8-column strip of C: full 6-row tiles are computed here, leftover rows by the Mx8 fringe kernels.
+    static void* post_ops_labels[] = // label addresses; not referenced directly below, presumably read by the POST_OP_LABEL_* macros (confirm)
+            {
+              &&POST_OPS_6x8F_DISABLE,
+              &&POST_OPS_BIAS_6x8F,
+              &&POST_OPS_RELU_6x8F,
+              &&POST_OPS_RELU_SCALE_6x8F,
+              &&POST_OPS_GELU_TANH_6x8F,
+              &&POST_OPS_GELU_ERF_6x8F,
+              &&POST_OPS_CLIP_6x8F
+            };
+
+    // Local unsigned 64-bit copies: k_iter is the k extent; m0 is split into
+    // whole 6-row tiles (m_iter) and the remaining rows (m_left, 0..5).
+    uint64_t k_iter = (uint64_t)k0;
+
+    uint64_t m_iter = (uint64_t)m0 / 6;
+    uint64_t m_left = (uint64_t)m0 % 6;
+
+    if ( m_iter == 0 ){    goto consider_edge_cases; } // fewer than 6 rows: nothing for the tile loop to do
+
+    /* ymm0-ymm3: scratch. ymm4/6/8/10/12/14: accumulators for C rows 0-5. ymm13/ymm15 only appear in the ZERO_ACC/ALPHA_MUL macro calls. */
+    __m256 ymm0, ymm1, ymm2, ymm3;
+    __m256 ymm4, ymm6, ymm8, ymm10;
+    __m256 ymm12, ymm13, ymm14, ymm15;
+    
+    /* One loop iteration per full 6-row by 8-column tile of C */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_YMM_4_REG(ymm4, ymm6, ymm8, ymm10);
+      ZERO_ACC_YMM_4_REG(ymm12, ymm13, ymm14, ymm15);
+      
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // A panel of tile m (consecutive panels are ps_a floats apart)
+      bbuf = (float *)b;  // B is rewound: the same B panel is reused for every tile
+      cbuf = (float *)c + m * MR * rs_c; // first C row of tile m (MR is defined elsewhere, presumably 6 -- confirm)
+      
+      /* Prefetch (T0 hint) the start of each of the six C rows of this tile */
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 5*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /* Load 8 consecutive floats from the current row of B */
+        ymm0 = _mm256_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        ymm2 = _mm256_broadcast_ss((abuf + 0*rs_a)); //broadcast a[row 0, col k]
+        ymm3 = _mm256_broadcast_ss((abuf + 1*rs_a)); //broadcast a[row 1, col k]
+
+        ymm4 = _mm256_fmadd_ps(ymm0, ymm2, ymm4);
+        ymm6 = _mm256_fmadd_ps(ymm0, ymm3, ymm6);
+
+        ymm2 = _mm256_broadcast_ss((abuf + 2*rs_a)); //broadcast a[row 2, col k]
+        ymm3 = _mm256_broadcast_ss((abuf + 3*rs_a)); //broadcast a[row 3, col k]
+
+        ymm8 = _mm256_fmadd_ps(ymm0, ymm2, ymm8);
+        ymm10 = _mm256_fmadd_ps(ymm0, ymm3, ymm10);
+
+        ymm2 = _mm256_broadcast_ss((abuf + 4*rs_a)); //broadcast a[row 4, col k]
+        ymm3 = _mm256_broadcast_ss((abuf + 5*rs_a)); //broadcast a[row 5, col k]
+        abuf += cs_a;  //move a pointer to next col
+        
+        ymm12 = _mm256_fmadd_ps(ymm0, ymm2, ymm12);
+        ymm14 = _mm256_fmadd_ps(ymm0, ymm3, ymm14);
+      }//kloop
+
+      ymm0 = _mm256_broadcast_ss(&(alpha)); // alpha in all 8 lanes
+      ALPHA_MUL_ACC_YMM_4_REG(ymm4,ymm6,ymm8,ymm10,ymm0)
+      ALPHA_MUL_ACC_YMM_4_REG(ymm12,ymm13,ymm14,ymm15,ymm0) // NOTE(review): ymm13/ymm15 hold no results; they only pad the 4-register macro
+
+      if ( beta != 0.0 ) // the C-dependent term is skipped when beta is zero
+      {
+        _cbuf = cbuf;
+        // F32_C_BNZ_8 is defined elsewhere; it receives the C row pointer, a scratch
+        // register, beta and the row accumulator -- presumably acc += beta * C row (confirm).
+        ymm3 = _mm256_broadcast_ss(&(beta));
+
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm8)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm10)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm12)
+        _cbuf += rs_c;
+        F32_C_BNZ_8(_cbuf,rs_c,ymm0,ymm3,ymm14)
+      }
+
+      // Post Ops: start from the head of the list; the jump macros used below are defined elsewhere
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x8F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) || // 'r'/'R': one bias value per column, shared by all six rows
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          ymm0 = _mm256_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+          // c[0,0-7]
+          ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+          // c[1,0-7]
+          ymm6 = _mm256_add_ps( ymm6, ymm0 );
+
+          // c[2,0-7]
+          ymm8 = _mm256_add_ps( ymm8, ymm0 );
+
+          // c[3,0-7]
+          ymm10 = _mm256_add_ps( ymm10, ymm0 );
+
+          // c[4,0-7]
+          ymm12 = _mm256_add_ps( ymm12, ymm0 );
+
+          // c[5,0-7]
+          ymm14 = _mm256_add_ps( ymm14, ymm0 );
+        }
+        else
+        {
+          // If the original output was column major, then by the time the
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 );
+          ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 );
+          ymm2 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 );
+          ymm3 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 );
+
+          // c[0,0-7]
+          ymm4 = _mm256_add_ps( ymm4, ymm0 );
+
+          // c[1,0-7]
+          ymm6 = _mm256_add_ps( ymm6, ymm1 );
+
+          // c[2,0-7]
+          ymm8 = _mm256_add_ps( ymm8, ymm2 );
+
+          // c[3,0-7]
+          ymm10 = _mm256_add_ps( ymm10, ymm3 );
+
+          ymm0 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 );
+          ymm1 = _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 );
+
+          // c[4,0-7]
+          ymm12 = _mm256_add_ps( ymm12, ymm0 );
+
+          // c[5,0-7]
+          ymm14 = _mm256_add_ps( ymm14, ymm1 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x8F:
+      {
+        ymm0 = _mm256_setzero_ps(); // zero vector: max(x, 0) below replaces negative lanes with zero
+
+        // c[0,0-7]
+        ymm4 = _mm256_max_ps( ymm4, ymm0 );
+
+        // c[1,0-7]
+        ymm6 = _mm256_max_ps( ymm6, ymm0 );
+
+        // c[2,0-7]
+        ymm8 = _mm256_max_ps( ymm8, ymm0 );
+
+        // c[3,0-7]
+        ymm10 = _mm256_max_ps( ymm10, ymm0 );
+
+        // c[4,0-7]
+        ymm12 = _mm256_max_ps( ymm12, ymm0 );
+
+        // c[5,0-7]
+        ymm14 = _mm256_max_ps( ymm14, ymm0 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x8F:
+      {
+        ymm0 =
+          _mm256_broadcast_ss( ( float* )post_ops_list_temp->op_args2 ); // float read from op_args2, presumably the scale for negative values (confirm)
+        ymm1 = _mm256_setzero_ps(); // zero vector; ymm2 is handed to the macro as a temporary
+
+        // c[0,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+        // c[1,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+        // c[2,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+        // c[3,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+        // c[4,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+        // c[5,0-7]
+        RELU_SCALE_OP_F32S_AVX2(ymm14, ymm0, ymm1, ymm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x8F:
+      {
+        __m256 dn, x_tanh; // extra temporaries handed to GELU_TANH_F32S_AVX2 (macro defined elsewhere)
+        __m256i q;
+
+        // c[0,0-7]
+        GELU_TANH_F32S_AVX2(ymm4, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[1,0-7]
+        GELU_TANH_F32S_AVX2(ymm6, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[2,0-7]
+        GELU_TANH_F32S_AVX2(ymm8, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[3,0-7]
+        GELU_TANH_F32S_AVX2(ymm10, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[4,0-7]
+        GELU_TANH_F32S_AVX2(ymm12, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        // c[5,0-7]
+        GELU_TANH_F32S_AVX2(ymm14, ymm0, ymm1, ymm2, ymm3, dn, x_tanh, q)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x8F:
+      {
+        // c[0,0-7] (ymm0-ymm2 are handed to the macro as temporaries)
+        GELU_ERF_F32S_AVX2(ymm4, ymm0, ymm1, ymm2)
+
+        // c[1,0-7]
+        GELU_ERF_F32S_AVX2(ymm6, ymm0, ymm1, ymm2)
+
+        // c[2,0-7]
+        GELU_ERF_F32S_AVX2(ymm8, ymm0, ymm1, ymm2)
+
+        // c[3,0-7]
+        GELU_ERF_F32S_AVX2(ymm10, ymm0, ymm1, ymm2)
+
+        // c[4,0-7]
+        GELU_ERF_F32S_AVX2(ymm12, ymm0, ymm1, ymm2)
+
+        // c[5,0-7]
+        GELU_ERF_F32S_AVX2(ymm14, ymm0, ymm1, ymm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x8F:
+      {
+        ymm0 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args2 ); // presumably one clip bound (lower?) -- confirm against CLIP_F32S_AVX2
+        ymm1 = _mm256_set1_ps( *( float* )post_ops_list_temp->op_args3 ); // presumably the other clip bound (upper?) -- confirm
+
+        // c[0,0-7]
+        CLIP_F32S_AVX2(ymm4, ymm0, ymm1)
+
+        // c[1,0-7]
+        CLIP_F32S_AVX2(ymm6, ymm0, ymm1)
+
+        // c[2,0-7]
+        CLIP_F32S_AVX2(ymm8, ymm0, ymm1)
+
+        // c[3,0-7]
+        CLIP_F32S_AVX2(ymm10, ymm0, ymm1)
+
+        // c[4,0-7]
+        CLIP_F32S_AVX2(ymm12, ymm0, ymm1)
+
+        // c[5,0-7]
+        CLIP_F32S_AVX2(ymm14, ymm0, ymm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x8F_DISABLE:
+      ; // empty statement so the label has a statement to attach to; execution continues with the stores
+
+      _mm256_storeu_ps(cbuf, ymm4); // row 0, columns 0-7
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm6); // row 1
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm8); // row 2
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm10); // row 3
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm12); // row 4
+      cbuf += rs_c;
+      _mm256_storeu_ps(cbuf, ymm14); // row 5
+
+      post_ops_attr.post_op_c_i += MR; // advance the row index read by the per-row bias branch for the next tile
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left; // index of the first leftover row
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] = // indexed by m_left (1..5); slot 0 is never selected because m_left != 0 here
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_1x8,
+          lpgemm_rowvar_f32f32f32of32_2x8,
+          lpgemm_rowvar_f32f32f32of32_3x8,
+          lpgemm_rowvar_f32f32f32of32_4x8,
+          lpgemm_rowvar_f32f32f32of32_5x8
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij,rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr // post_op_c_i has already been advanced past the full tiles
+        );
+        return;
+    }
+}
+
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x4m)
+{ // f32 kernel for a 4-column strip of C: full 6-row tiles are computed here, leftover rows by the Mx4 fringe kernels.
+    static void* post_ops_labels[] = // label addresses; not referenced directly below, presumably read by the POST_OP_LABEL_* macros (confirm)
+            {
+              &&POST_OPS_6x4F_DISABLE,
+              &&POST_OPS_BIAS_6x4F,
+              &&POST_OPS_RELU_6x4F,
+              &&POST_OPS_RELU_SCALE_6x4F,
+              &&POST_OPS_GELU_TANH_6x4F,
+              &&POST_OPS_GELU_ERF_6x4F,
+              &&POST_OPS_CLIP_6x4F
+            };
+    // Local unsigned 64-bit copies: k_iter is the k extent; m0 is split into
+    // whole 6-row tiles (m_iter) and the remaining rows (m_left, 0..5).
+    uint64_t k_iter = (uint64_t)k0;
+
+    uint64_t m_iter = (uint64_t)m0 / 6;
+    uint64_t m_left = (uint64_t)m0 % 6;
+
+    if ( m_iter == 0 ){    goto consider_edge_cases; } // fewer than 6 rows: nothing for the tile loop to do
+
+    /* xmm0-xmm3: scratch (B row, A broadcasts, alpha/beta, post-op temporaries). xmm4-xmm9: accumulators for C rows 0-5. */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+    __m128 xmm8, xmm9;
+    
+    /* One loop iteration per full 6-row by 4-column tile of C */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) // rows 0-3
+      ZERO_ACC_XMM_4_REG(xmm8,xmm9,xmm0,xmm1) // rows 4-5; xmm0/xmm1 are scratch and only pad the 4-register macro
+      
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // A panel of tile m (consecutive panels are ps_a floats apart)
+      bbuf = (float *)b;  // B is rewound: the same B panel is reused for every tile
+      cbuf = (float *)c + m * MR * rs_c; // first C row of tile m (MR is defined elsewhere, presumably 6 -- confirm)
+      
+      /* Prefetch (T0 hint) the start of each of the six C rows of this tile */
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 5*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /* Load 4 consecutive floats from the current row of B */
+        xmm0 = _mm_loadu_ps(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast a[row 0, col k]
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast a[row 1, col k]
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast a[row 2, col k]
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast a[row 3, col k]
+        xmm2 = _mm_broadcast_ss((abuf + 4*rs_a)); //broadcast a[row 4, col k]
+        xmm3 = _mm_broadcast_ss((abuf + 5*rs_a)); //broadcast a[row 5, col k]
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+        xmm8 = _mm_fmadd_ps(xmm0, xmm2, xmm8);
+        xmm9 = _mm_fmadd_ps(xmm0, xmm3, xmm9);
+      }//kloop
+
+      xmm0 = _mm_broadcast_ss(&(alpha)); // alpha in all 4 lanes
+      ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0) // rows 0-3
+      ALPHA_MUL_ACC_XMM_4_REG(xmm8,xmm9,xmm2,xmm3,xmm0) // NOTE(review): only xmm8/xmm9 matter; xmm2/xmm3 are leftover A broadcasts (unassigned if k_iter is 0) used as padding
+
+      if ( beta != 0.0 ) // the C-dependent term is skipped when beta is zero
+      {
+        _cbuf = cbuf;
+        // F32_C_BNZ_4 is defined elsewhere; it receives the C row pointer, a scratch
+        // register, beta and the row accumulator -- presumably acc += beta * C row (confirm).
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm8)
+        _cbuf += rs_c;
+        F32_C_BNZ_4(_cbuf,rs_c,xmm1,xmm3,xmm9)
+      }
+
+      // Post Ops: start from the head of the list; the jump macros used below are defined elsewhere
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x4F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) || // 'r'/'R': one bias value per column, shared by all six rows
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 8 ) ); // the ( 0 * 8 ) term is zero: 4 floats starting at index post_op_c_j
+
+          // c[0,0-3]
+          xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+          // c[1,0-3]
+          xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+          // c[2,0-3]
+          xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+          // c[3,0-3]
+          xmm7 = _mm_add_ps( xmm7, xmm0 );
+
+          // c[4,0-3]
+          xmm8 = _mm_add_ps( xmm8, xmm0 );
+
+          // c[5,0-3]
+          xmm9 = _mm_add_ps( xmm9, xmm0 );
+        }
+        else
+        {
+          // If the original output was column major, then by the time the
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 );
+          xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 );
+          xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 );
+          xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 );
+
+          // c[0,0-3]
+          xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+          // c[1,0-3]
+          xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+          // c[2,0-3]
+          xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+          // c[3,0-3]
+          xmm7 = _mm_add_ps( xmm7, xmm3 );
+
+          xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 );
+          xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 );
+
+          // c[4,0-3]
+          xmm8 = _mm_add_ps( xmm8, xmm0 );
+
+          // c[5,0-3]
+          xmm9 = _mm_add_ps( xmm9, xmm1 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x4F:
+      {
+        xmm0 = _mm_setzero_ps(); // zero vector: max(x, 0) below replaces negative lanes with zero
+
+        // c[0,0-3]
+        xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+        // c[2,0-3]
+        xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+        // c[3,0-3]
+        xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+        // c[4,0-3]
+        xmm8 = _mm_max_ps( xmm8, xmm0 );
+
+        // c[5,0-3]
+        xmm9 = _mm_max_ps( xmm9, xmm0 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x4F:
+      {
+        xmm0 =
+          _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 ); // float read from op_args2, presumably the scale for negative values (confirm)
+        xmm1 = _mm_setzero_ps(); // zero vector; xmm2 is handed to the macro as a temporary
+
+        // c[0,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+        // c[1,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+        // c[2,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+        // c[3,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+        // c[4,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+        // c[5,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm9, xmm0, xmm1, xmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x4F:
+      {
+        __m128 dn, x_tanh; // extra temporaries handed to GELU_TANH_F32S_SSE (macro defined elsewhere)
+        __m128i q;
+
+        // c[0,0-3]
+        GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[1,0-3]
+        GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[2,0-3]
+        GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[3,0-3]
+        GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[4,0-3]
+        GELU_TANH_F32S_SSE(xmm8, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[5,0-3]
+        GELU_TANH_F32S_SSE(xmm9, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x4F:
+      {
+        // c[0,0-3] (xmm0-xmm2 are handed to the macro as temporaries)
+        GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+        // c[1,0-3]
+        GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+        // c[2,0-3]
+        GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+        // c[3,0-3]
+        GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+        // c[4,0-3]
+        GELU_ERF_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+        // c[5,0-3]
+        GELU_ERF_F32S_SSE(xmm9, xmm0, xmm1, xmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x4F:
+      {
+        xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 ); // presumably one clip bound (lower?) -- confirm against CLIP_F32S_SSE
+        xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 ); // presumably the other clip bound (upper?) -- confirm
+
+        // c[0,0-3]
+        CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+        // c[1,0-3]
+        CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+        // c[2,0-3]
+        CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+        // c[3,0-3]
+        CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+        // c[4,0-3]
+        CLIP_F32S_SSE(xmm8, xmm0, xmm1)
+
+        // c[5,0-3]
+        CLIP_F32S_SSE(xmm9, xmm0, xmm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x4F_DISABLE:
+      ; // empty statement so the label has a statement to attach to; execution continues with the stores
+
+      _mm_storeu_ps(cbuf, xmm4); // row 0, columns 0-3
+      cbuf += rs_c;
+      _mm_storeu_ps(cbuf, xmm5); // row 1
+      cbuf += rs_c;
+      _mm_storeu_ps(cbuf, xmm6); // row 2
+      cbuf += rs_c;
+      _mm_storeu_ps(cbuf, xmm7); // row 3
+      cbuf += rs_c;
+      _mm_storeu_ps(cbuf, xmm8); // row 4
+      cbuf += rs_c;
+      _mm_storeu_ps(cbuf, xmm9); // row 5
+
+      post_ops_attr.post_op_c_i += MR; // advance the row index read by the per-row bias branch for the next tile
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left; // index of the first leftover row
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] = // indexed by m_left (1..5); slot 0 is never selected because m_left != 0 here
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_1x4,
+          lpgemm_rowvar_f32f32f32of32_2x4,
+          lpgemm_rowvar_f32f32f32of32_3x4,
+          lpgemm_rowvar_f32f32f32of32_4x4,
+          lpgemm_rowvar_f32f32f32of32_5x4
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij, rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr // post_op_c_i has already been advanced past the full tiles
+        );
+        return;
+    }
+}
+
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x2m)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_6x2F_DISABLE,
+              &&POST_OPS_BIAS_6x2F,
+              &&POST_OPS_RELU_6x2F,
+              &&POST_OPS_RELU_SCALE_6x2F,
+              &&POST_OPS_GELU_TANH_6x2F,
+              &&POST_OPS_GELU_ERF_6x2F,
+              &&POST_OPS_CLIP_6x2F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    uint64_t m_iter = (uint64_t)m0 / 6;
+    uint64_t m_left = (uint64_t)m0 % 6;
+
+    if ( m_iter == 0 ){    goto consider_edge_cases; }
+
+    /*Declare the registers*/
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7;
+    __m128 xmm8, xmm9;
+    
+    /*Produce MRxNR outputs */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7) 
+      ZERO_ACC_XMM_4_REG(xmm8,xmm9,xmm0,xmm1) 
+      
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // Move to next MRxKC in MCxKC (where MC>=MR)
+      bbuf = (float *)b;  //Same KCxNR panel is used across MCxKC block 
+      cbuf = (float *)c + m * MR * rs_c; // Move to next MRXNR in output
+      
+      /*_mm_prefetch( (MR X NR) from C*/
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 5*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /*Load 2 elements from row0 of B*/
+        xmm0 = ( __m128 )_mm_load_sd((const double*) bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); //broadcast c0r0
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); //broadcast c0r0
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); //broadcast c0r0
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); //broadcast c0r0
+        xmm2 = _mm_broadcast_ss((abuf + 4*rs_a)); //broadcast c0r0
+        xmm3 = _mm_broadcast_ss((abuf + 5*rs_a)); //broadcast c0r0
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+        xmm8 = _mm_fmadd_ps(xmm0, xmm2, xmm8);
+        xmm9 = _mm_fmadd_ps(xmm0, xmm3, xmm9);
+      }//kloop
+
+      xmm0 = _mm_broadcast_ss(&(alpha));
+      ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0) 
+      ALPHA_MUL_ACC_XMM_4_REG(xmm8,xmm9,xmm2,xmm3,xmm0)
+
+      if ( beta != 0.0 )
+      {
+        _cbuf = cbuf;
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm8)
+        _cbuf += rs_c;
+        F32_C_BNZ_2(_cbuf,rs_c,xmm1,xmm3,xmm9)
+      }
+
+      // Post Ops
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x2F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          xmm0 = _mm_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+
+          // c[0,0-3]
+          xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+          // c[1,0-3]
+          xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+          // c[2,0-3]
+          xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+          // c[3,0-3]
+          xmm7 = _mm_add_ps( xmm7, xmm0 );
+
+          // c[4,0-3]
+          xmm8 = _mm_add_ps( xmm8, xmm0 );
+
+          // c[5,0-3]
+          xmm9 = _mm_add_ps( xmm9, xmm0 );
+        }
+        else
+        {
+          // If original output was columns major, then by the time
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 );
+          xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 );
+          xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 );
+          xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 );
+
+          // c[0,0-3]
+          xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+          // c[1,0-3]
+          xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+          // c[2,0-3]
+          xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+          // c[3,0-3]
+          xmm7 = _mm_add_ps( xmm7, xmm3 );
+
+          xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 );
+          xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 );
+
+          // c[4,0-3]
+          xmm8 = _mm_add_ps( xmm8, xmm0 );
+
+          // c[5,0-3]
+          xmm9 = _mm_add_ps( xmm9, xmm1 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x2F:
+      {
+        xmm0 = _mm_setzero_ps();
+
+        // c[0,0-3]
+        xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+        // c[1,0-3]
+        xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+        // c[2,0-3]
+        xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+        // c[3,0-3]
+        xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+        // c[4,0-3]
+        xmm8 = _mm_max_ps( xmm8, xmm0 );
+
+        // c[5,0-3]
+        xmm9 = _mm_max_ps( xmm9, xmm0 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x2F:
+      {
+        xmm0 =
+          _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+        xmm1 = _mm_setzero_ps();
+
+        // c[0,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+        // c[1,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+        // c[2,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+        // c[3,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+        // c[4,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+        // c[5,0-3]
+        RELU_SCALE_OP_F32S_SSE(xmm9, xmm0, xmm1, xmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x2F:
+      {
+        __m128 dn, x_tanh;
+        __m128i q;
+
+        // c[0,0-3]
+        GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[1,0-3]
+        GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[2,0-3]
+        GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[3,0-3]
+        GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[4,0-3]
+        GELU_TANH_F32S_SSE(xmm8, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[5,0-3]
+        GELU_TANH_F32S_SSE(xmm9, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x2F:
+      {
+        // c[0,0-3]
+        GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+        // c[1,0-3]
+        GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+        // c[2,0-3]
+        GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+        // c[3,0-3]
+        GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+        // c[4,0-3]
+        GELU_ERF_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+        // c[5,0-3]
+        GELU_ERF_F32S_SSE(xmm9, xmm0, xmm1, xmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x2F:
+      {
+        xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+        xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+        // c[0,0-3]
+        CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+        // c[1,0-3]
+        CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+        // c[2,0-3]
+        CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+        // c[3,0-3]
+        CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+        // c[4,0-3]
+        CLIP_F32S_SSE(xmm8, xmm0, xmm1)
+
+        // c[5,0-3]
+        CLIP_F32S_SSE(xmm9, xmm0, xmm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x2F_DISABLE:
+      ;
+
+      _mm_store_sd((double*)cbuf, ( __m128d )xmm4);
+      cbuf += rs_c;
+      _mm_store_sd((double*)cbuf, ( __m128d )xmm5);
+      cbuf += rs_c;
+      _mm_store_sd((double*)cbuf, ( __m128d )xmm6);
+      cbuf += rs_c;
+      _mm_store_sd((double*)cbuf, ( __m128d )xmm7);
+      cbuf += rs_c;
+      _mm_store_sd((double*)cbuf, ( __m128d )xmm8);
+      cbuf += rs_c;
+      _mm_store_sd((double*)cbuf, ( __m128d )xmm9);
+
+      post_ops_attr.post_op_c_i += MR;
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] =
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_1x2,
+          lpgemm_rowvar_f32f32f32of32_2x2,
+          lpgemm_rowvar_f32f32f32of32_3x2,
+          lpgemm_rowvar_f32f32f32of32_4x2,
+          lpgemm_rowvar_f32f32f32of32_5x2
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij,rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr
+        );
+        return;
+    }
+}
+
+// 6x1 n-fringe kernel: walks m0 in tiles of 6 rows (m0 / 6 iterations), accumulating
+// a single column of A*B per tile; the m0 % 6 leftover rows go to the Mx1 kernels.
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_6x1m)
+{
+    static void* post_ops_labels[] = {
+              &&POST_OPS_6x1F_DISABLE,
+              &&POST_OPS_BIAS_6x1F,
+              &&POST_OPS_RELU_6x1F,
+              &&POST_OPS_RELU_SCALE_6x1F,
+              &&POST_OPS_GELU_TANH_6x1F,
+              &&POST_OPS_GELU_ERF_6x1F,
+              &&POST_OPS_CLIP_6x1F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = (uint64_t)k0;
+
+    uint64_t m_iter = (uint64_t)m0 / 6;
+    uint64_t m_left = (uint64_t)m0 % 6;
+
+    if ( m_iter == 0 ){    goto consider_edge_cases; }
+
+    /* Registers: xmm4-xmm9 accumulate rows 0-5, xmm0-xmm3 are scratch */
+    __m128 xmm0, xmm1, xmm2, xmm3;
+    __m128 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;
+
+    /*Produce MRxNR outputs */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7)
+      ZERO_ACC_XMM_4_REG(xmm8,xmm9,xmm0,xmm1)
+
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // Move to next MRxKC in MCxKC (where MC>=MR)
+      bbuf = (float *)b;  //Same KCxNR panel is used across MCxKC block
+      cbuf = (float *)c + m * MR * rs_c; // Move to next MRXNR in output
+
+      /* Prefetch the 6 rows of C touched by this tile */
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 5*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /* Load the single B element of this k step (upper lanes are zeroed) */
+        xmm0 = _mm_load_ss(bbuf );
+        bbuf += rs_b;  //move b pointer to next row
+
+        xmm1 = _mm_broadcast_ss((abuf + 0*rs_a)); // broadcast a[0,k]
+        xmm2 = _mm_broadcast_ss((abuf + 1*rs_a)); // broadcast a[1,k]
+        xmm3 = _mm_broadcast_ss((abuf + 2*rs_a)); // broadcast a[2,k]
+
+        xmm4 = _mm_fmadd_ps(xmm0, xmm1, xmm4);
+        xmm5 = _mm_fmadd_ps(xmm0, xmm2, xmm5);
+        xmm6 = _mm_fmadd_ps(xmm0, xmm3, xmm6);
+
+        xmm1 = _mm_broadcast_ss((abuf + 3*rs_a)); // broadcast a[3,k]
+        xmm2 = _mm_broadcast_ss((abuf + 4*rs_a)); // broadcast a[4,k]
+        xmm3 = _mm_broadcast_ss((abuf + 5*rs_a)); // broadcast a[5,k]
+        abuf += cs_a;  //move a pointer to next col
+
+        xmm7 = _mm_fmadd_ps(xmm0, xmm1, xmm7);
+        xmm8 = _mm_fmadd_ps(xmm0, xmm2, xmm8);
+        xmm9 = _mm_fmadd_ps(xmm0, xmm3, xmm9);
+      }//kloop
+
+      xmm0 = _mm_broadcast_ss(&(alpha));
+      ALPHA_MUL_ACC_XMM_4_REG(xmm4,xmm5,xmm6,xmm7,xmm0)
+      ALPHA_MUL_ACC_XMM_4_REG(xmm8,xmm9,xmm2,xmm3,xmm0)
+
+      if ( beta != 0.0 )
+      {
+        _cbuf = cbuf;
+        //load c and multiply with beta and
+        //add to accumulator and store back
+        xmm3 = _mm_broadcast_ss(&(beta));
+
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm4)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm5)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm6)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm7)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm8)
+        _cbuf += rs_c;
+        F32_C_BNZ_1(_cbuf,rs_c,xmm1,xmm3,xmm9)
+      }
+
+      // Post Ops
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x1F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          xmm0 = _mm_load_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 8 ) );
+          // Scalar load: only one column is live; a 4-wide load may read past the bias array.
+          // c[0,0]
+          xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+          // c[1,0]
+          xmm5 = _mm_add_ps( xmm5, xmm0 );
+
+          // c[2,0]
+          xmm6 = _mm_add_ps( xmm6, xmm0 );
+
+          // c[3,0]
+          xmm7 = _mm_add_ps( xmm7, xmm0 );
+
+          // c[4,0]
+          xmm8 = _mm_add_ps( xmm8, xmm0 );
+
+          // c[5,0]
+          xmm9 = _mm_add_ps( xmm9, xmm0 );
+        }
+        else
+        {
+          // If original output was columns major, then by the time
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 );
+          xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 );
+          xmm2 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 );
+          xmm3 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 );
+
+          // c[0,0]
+          xmm4 = _mm_add_ps( xmm4, xmm0 );
+
+          // c[1,0]
+          xmm5 = _mm_add_ps( xmm5, xmm1 );
+
+          // c[2,0]
+          xmm6 = _mm_add_ps( xmm6, xmm2 );
+
+          // c[3,0]
+          xmm7 = _mm_add_ps( xmm7, xmm3 );
+
+          xmm0 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 );
+          xmm1 = _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 );
+
+          // c[4,0]
+          xmm8 = _mm_add_ps( xmm8, xmm0 );
+
+          // c[5,0]
+          xmm9 = _mm_add_ps( xmm9, xmm1 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x1F:
+      {
+        xmm0 = _mm_setzero_ps();
+
+        // c[0,0]
+        xmm4 = _mm_max_ps( xmm4, xmm0 );
+
+        // c[1,0]
+        xmm5 = _mm_max_ps( xmm5, xmm0 );
+
+        // c[2,0]
+        xmm6 = _mm_max_ps( xmm6, xmm0 );
+
+        // c[3,0]
+        xmm7 = _mm_max_ps( xmm7, xmm0 );
+
+        // c[4,0]
+        xmm8 = _mm_max_ps( xmm8, xmm0 );
+
+        // c[5,0]
+        xmm9 = _mm_max_ps( xmm9, xmm0 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x1F:
+      {
+        xmm0 =
+          _mm_broadcast_ss( ( float* )post_ops_list_temp->op_args2 );
+        xmm1 = _mm_setzero_ps();
+
+        // c[0,0]
+        RELU_SCALE_OP_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+        // c[1,0]
+        RELU_SCALE_OP_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+        // c[2,0]
+        RELU_SCALE_OP_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+        // c[3,0]
+        RELU_SCALE_OP_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+        // c[4,0]
+        RELU_SCALE_OP_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+        // c[5,0]
+        RELU_SCALE_OP_F32S_SSE(xmm9, xmm0, xmm1, xmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x1F:
+      {
+        __m128 dn, x_tanh;
+        __m128i q;
+
+        // c[0,0]
+        GELU_TANH_F32S_SSE(xmm4, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[1,0]
+        GELU_TANH_F32S_SSE(xmm5, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[2,0]
+        GELU_TANH_F32S_SSE(xmm6, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[3,0]
+        GELU_TANH_F32S_SSE(xmm7, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[4,0]
+        GELU_TANH_F32S_SSE(xmm8, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        // c[5,0]
+        GELU_TANH_F32S_SSE(xmm9, xmm0, xmm1, xmm2, xmm3, dn, x_tanh, q)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x1F:
+      {
+        // c[0,0]
+        GELU_ERF_F32S_SSE(xmm4, xmm0, xmm1, xmm2)
+
+        // c[1,0]
+        GELU_ERF_F32S_SSE(xmm5, xmm0, xmm1, xmm2)
+
+        // c[2,0]
+        GELU_ERF_F32S_SSE(xmm6, xmm0, xmm1, xmm2)
+
+        // c[3,0]
+        GELU_ERF_F32S_SSE(xmm7, xmm0, xmm1, xmm2)
+
+        // c[4,0]
+        GELU_ERF_F32S_SSE(xmm8, xmm0, xmm1, xmm2)
+
+        // c[5,0]
+        GELU_ERF_F32S_SSE(xmm9, xmm0, xmm1, xmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x1F:
+      {
+        xmm0 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+        xmm1 = _mm_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+        // c[0,0]
+        CLIP_F32S_SSE(xmm4, xmm0, xmm1)
+
+        // c[1,0]
+        CLIP_F32S_SSE(xmm5, xmm0, xmm1)
+
+        // c[2,0]
+        CLIP_F32S_SSE(xmm6, xmm0, xmm1)
+
+        // c[3,0]
+        CLIP_F32S_SSE(xmm7, xmm0, xmm1)
+
+        // c[4,0]
+        CLIP_F32S_SSE(xmm8, xmm0, xmm1)
+
+        // c[5,0]
+        CLIP_F32S_SSE(xmm9, xmm0, xmm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x1F_DISABLE:
+      ;
+
+      _mm_store_ss(cbuf, xmm4);
+      cbuf += rs_c;
+      _mm_store_ss(cbuf, xmm5);
+      cbuf += rs_c;
+      _mm_store_ss(cbuf, xmm6);
+      cbuf += rs_c;
+      _mm_store_ss(cbuf, xmm7);
+      cbuf += rs_c;
+      _mm_store_ss(cbuf, xmm8);
+      cbuf += rs_c;
+      _mm_store_ss(cbuf, xmm9);
+
+      post_ops_attr.post_op_c_i += MR;
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] =
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_1x1,
+          lpgemm_rowvar_f32f32f32of32_2x1,
+          lpgemm_rowvar_f32f32f32of32_3x1,
+          lpgemm_rowvar_f32f32f32of32_4x1,
+          lpgemm_rowvar_f32f32f32of32_5x1
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij,rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr
+        );
+        return;
+    }
+}
+#endif
diff --git a/kernels/zen/lpgemm/gelu_avx2.h b/kernels/zen/lpgemm/gelu_avx2.h
new file mode 100644
index 0000000000..3ee074e917
--- /dev/null
+++ b/kernels/zen/lpgemm/gelu_avx2.h
@@ -0,0 +1,91 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef AOCL_LPGEMM_GELU_DEF_AVX2_H
+#define AOCL_LPGEMM_GELU_DEF_AVX2_H
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ), applied to the packed floats of a 256-bit register  */
+#define GELU_TANH_F32_AVX2_DEF(reg, r, r2, x, z, dn, x_tanh, q) \
+	/* reg: input, overwritten with the result. r2 = reg^3; x_tanh = 0.797884 * ( reg + 0.044715 * reg^3 ) */ \
+	r2 = _mm256_mul_ps (reg, reg); \
+	r2 = _mm256_mul_ps (r2, reg); \
+	x_tanh = _mm256_fmadd_ps (_mm256_set1_ps (0.044715), r2, reg); \
+	x_tanh = _mm256_mul_ps (x_tanh, _mm256_set1_ps (0.797884)); \
+	/* r, r2, x, z, dn, q are forwarded to TANHF_AVX2 (presumably as scratch - confirm against its definition) */ \
+	/*x_tanh = tanhf(x_tanh) */  \
+	TANHF_AVX2(x_tanh, r, r2, x, z, dn, q); \
+	/* reg = 0.5 * reg * ( 1 + x_tanh ) */ \
+	x_tanh = _mm256_add_ps (x_tanh, _mm256_set1_ps (1)); \
+	x_tanh = _mm256_mul_ps (x_tanh, reg); \
+	reg = _mm256_mul_ps (x_tanh, _mm256_set1_ps (0.5));
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ), applied to the packed floats of a 128-bit register  */
+#define GELU_TANH_F32_SSE_DEF(reg, r, r2, x, z, dn, x_tanh, q) \
+	/* reg: input, overwritten with the result. r2 = reg^3; x_tanh = 0.797884 * ( reg + 0.044715 * reg^3 ) */ \
+	r2 = _mm_mul_ps (reg, reg); \
+	r2 = _mm_mul_ps (r2, reg); \
+	x_tanh = _mm_fmadd_ps (_mm_set1_ps (0.044715), r2, reg); \
+	x_tanh = _mm_mul_ps (x_tanh, _mm_set1_ps (0.797884)); \
+	/* r, r2, x, z, dn, q are forwarded to TANHF_SSE (presumably as scratch - confirm against its definition) */ \
+	/*x_tanh = tanhf(x_tanh) */  \
+	TANHF_SSE(x_tanh, r, r2, x, z, dn, q); \
+	/* reg = 0.5 * reg * ( 1 + x_tanh ) */ \
+	x_tanh = _mm_add_ps (x_tanh, _mm_set1_ps (1)); \
+	x_tanh = _mm_mul_ps (x_tanh, reg); \
+	reg = _mm_mul_ps (x_tanh, _mm_set1_ps (0.5));
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 )), applied to the packed floats of a 256-bit register  */
+#define GELU_ERF_F32_AVX2_DEF(reg, r, x, x_erf) \
+  /* reg: input, overwritten with the result. x_erf = reg * 0.707107 */ \
+  x_erf = _mm256_mul_ps (reg, _mm256_set1_ps (0.707107)); \
+  /* r and x are forwarded to ERF_AVX2 (presumably as scratch - confirm against its definition) */ \
+  /*x_erf = erf(x_erf) */  \
+  ERF_AVX2(x_erf, r, x); \
+  /* reg = 0.5 * reg * ( 1 + x_erf ) */ \
+  x_erf = _mm256_add_ps (x_erf, _mm256_set1_ps (1)); \
+  x_erf = _mm256_mul_ps (x_erf, reg); \
+  reg = _mm256_mul_ps (x_erf, _mm256_set1_ps (0.5));
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 )), applied to the packed floats of a 128-bit register  */
+#define GELU_ERF_F32_SSE_DEF(reg, r, x, x_erf) \
+  /* reg: input, overwritten with the result. x_erf = reg * 0.707107 */ \
+  x_erf = _mm_mul_ps (reg, _mm_set1_ps (0.707107)); \
+  /* r and x are forwarded to ERF_SSE (presumably as scratch - confirm against its definition) */ \
+  /*x_erf = erf(x_erf) */  \
+  ERF_SSE(x_erf, r, x); \
+  /* reg = 0.5 * reg * ( 1 + x_erf ) */ \
+  x_erf = _mm_add_ps (x_erf, _mm_set1_ps (1)); \
+  x_erf = _mm_mul_ps (x_erf, reg); \
+  reg = _mm_mul_ps (x_erf, _mm_set1_ps (0.5));
+
+#endif // AOCL_LPGEMM_GELU_DEF_AVX2_H
diff --git a/kernels/zen/lpgemm/lpgemm_util_l1_ops_avx2.c b/kernels/zen/lpgemm/lpgemm_util_l1_ops_avx2.c
new file mode 100644
index 0000000000..2e9a1b5deb
--- /dev/null
+++ b/kernels/zen/lpgemm/lpgemm_util_l1_ops_avx2.c
@@ -0,0 +1,371 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "math_utils_avx2.h"
+#include "gelu_avx2.h"
+
+// TANH GeLU (x) = 0.5 * x * ( 1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) )
+// Scalar fallback: stores the result back into in_val (a modifiable lvalue) and
+// yields it as an expression, so callers supply their own terminating ';'.
+#define GELU_TANH_NONVEC(in_val) \
+	( ( in_val ) = 0.5 * ( ( double )( in_val ) ) * \
+	( \
+	  1 + tanhf \
+	  ( \
+	    0.797884 * \
+	    ( \
+	      ( double )( in_val ) + \
+	      ( 0.044715 * \
+	        ( ( double )( in_val ) * ( double )( in_val ) * ( double )( in_val ) ) ) \
+	    ) \
+	  ) \
+	) )
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 )); scalar fallback that stores into in_val (an lvalue) and yields it as an expression  */
+#define GELU_ERF_NONVEC(in_val) \
+	( ( in_val ) = 0.5 * ( double )( in_val ) * \
+		( 1 + erff( ( double )( in_val ) * 0.707107 ) ) )
+
+// Applies the tanh-based GeLU approximation in place to n floats of x
+// (element stride incx).
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_tanh_avx2)
+{
+	if ( incx != 1 )
+	{
+		// Strided input: evaluate the scalar formula one element at a time.
+		const dim_t span = n * incx;
+		for ( dim_t pos = 0; pos < span; pos += incx )
+		{
+			float cur = x[ pos ];
+			x[ pos ] = GELU_TANH_NONVEC(cur);
+		}
+	}
+	else
+	{
+		// Contiguous input: full 8-float avx2 blocks first, then at most one
+		// 4-float sse block, then the remaining elements as scalars.
+		const dim_t avx2_end = ( n / 8 ) * 8;
+		const dim_t tail_len = n - avx2_end;
+		const dim_t sse_blocks = tail_len / 4;
+		const dim_t scalar_len = tail_len - ( sse_blocks * 4 );
+
+		__m256 ymm0, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
+		__m256i ymm10i;
+		dim_t pos = 0;
+
+		// avx2 blocks.
+		while ( pos < avx2_end )
+		{
+			ymm0 = _mm256_loadu_ps( x + pos );
+			GELU_TANH_F32_AVX2_DEF(ymm0, ymm10, ymm11, ymm12,
+							ymm13, ymm14, ymm15, ymm10i);
+			_mm256_storeu_ps( x + pos, ymm0 );
+
+			pos += 8;
+		}
+
+		// sse block.
+		if ( sse_blocks == 1 )
+		{
+			__m128 xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+			__m128i xmm10i;
+
+			xmm0 = _mm_loadu_ps( x + pos );
+			GELU_TANH_F32_SSE_DEF(xmm0, xmm10, xmm11, xmm12,
+							xmm13, xmm14, xmm15, xmm10i);
+			_mm_storeu_ps( x + pos, xmm0 );
+
+			pos += 4;
+		}
+
+		// scalar leftovers.
+		for ( dim_t left = scalar_len; left > 0; --left, ++pos )
+		{
+			float cur = x[ pos ];
+			x[ pos ] = GELU_TANH_NONVEC(cur);
+		}
+	}
+}
+
+// Applies the erf-based GeLU in place to n floats of x
+// (element stride incx).
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_erf_avx2)
+{
+	if ( incx != 1 )
+	{
+		// Strided input: evaluate the scalar formula one element at a time.
+		const dim_t span = n * incx;
+		for ( dim_t pos = 0; pos < span; pos += incx )
+		{
+			float cur = x[ pos ];
+			x[ pos ] = GELU_ERF_NONVEC(cur);
+		}
+	}
+	else
+	{
+		// Contiguous input: full 8-float avx2 blocks first, then at most one
+		// 4-float sse block, then the remaining elements as scalars.
+		const dim_t avx2_end = ( n / 8 ) * 8;
+		const dim_t tail_len = n - avx2_end;
+		const dim_t sse_blocks = tail_len / 4;
+		const dim_t scalar_len = tail_len - ( sse_blocks * 4 );
+
+		__m256 ymm0, ymm10, ymm11, ymm12;
+		dim_t pos = 0;
+
+		// avx2 blocks.
+		while ( pos < avx2_end )
+		{
+			ymm0 = _mm256_loadu_ps( x + pos );
+			GELU_ERF_F32_AVX2_DEF(ymm0, ymm10, ymm11, ymm12);
+			_mm256_storeu_ps( x + pos, ymm0 );
+
+			pos += 8;
+		}
+
+		// sse block.
+		if ( sse_blocks == 1 )
+		{
+			__m128 xmm0, xmm10, xmm11, xmm12;
+
+			xmm0 = _mm_loadu_ps( x + pos );
+			GELU_ERF_F32_SSE_DEF(xmm0, xmm10, xmm11, xmm12);
+			_mm_storeu_ps( x + pos, xmm0 );
+
+			pos += 4;
+		}
+
+		// scalar leftovers.
+		for ( dim_t left = scalar_len; left > 0; --left, ++pos )
+		{
+			float cur = x[ pos ];
+			x[ pos ] = GELU_ERF_NONVEC(cur);
+		}
+	}
+}
+
+// In-place softmax: x[i] = exp( x[i] ) / sum_j exp( x[j] ) over the n elements
+// of x with stride incx. The sum and the division are carried out in double.
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_softmax_avx2)
+{
+	if ( incx == 1 )
+	{
+		double exp_sum[2] = { 0.0 };
+
+		// Break the input into avx2 + sse + non-vectorized blocks.
+		dim_t n_part8 = ( n / 8 ) * 8;
+		dim_t n_part8_rem = n - n_part8;
+		dim_t n_part4 = n_part8_rem / 4;
+		dim_t n_part4_rem = n_part8_rem - ( n_part4 * 4 );
+
+		dim_t idx = 0;
+		__m256 ymm0, ymm10, ymm11, ymm12, ymm13, ymm10out;
+		__m256i ymm10outi;
+		__m128 xmm0, xmm1;
+		__m256d ymm0d, ymm1d;
+		__m128d xmm0d, xmm1d;
+
+		// Pass 1: exponentiate in place and reduce the exponentials - avx2 block.
+		for ( idx = 0; idx < n_part8; idx += 8 )
+		{
+			ymm0 = _mm256_loadu_ps( x + idx );
+
+			EXPF_AVX2(ymm0, ymm10, ymm11, ymm12, ymm13, ymm10outi); // ymm10outi is the output
+			ymm10out = _mm256_castsi256_ps( ymm10outi );
+			_mm256_storeu_ps( x + idx, ymm10out ); // keep exp(x): pass 2 must divide exp(x), not x
+			// Reduction to be done as double data type.
+			xmm0 = _mm256_castps256_ps128( ymm10out );
+			xmm1 = _mm256_extractf128_ps( ymm10out, 0x1 );
+			ymm0d = _mm256_cvtps_pd( xmm0 );
+			ymm1d = _mm256_cvtps_pd( xmm1 );
+			ymm0d = _mm256_add_pd( ymm0d, ymm1d );
+
+			xmm0d = _mm256_castpd256_pd128( ymm0d );
+			xmm1d = _mm256_extractf128_pd( ymm0d, 0x1 );
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+
+			xmm1d = _mm_permute_pd( xmm0d, 0x01);
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+			exp_sum[1] = _mm_cvtsd_f64( xmm0d );
+			exp_sum[0] += exp_sum[1];
+		}
+		// sse remainder block.
+		if ( n_part4 == 1 )
+		{
+			__m128 xmm10, xmm11, xmm12, xmm10out;
+			__m128i xmm10outi;
+
+			xmm0 = _mm_loadu_ps( x + idx );
+
+			EXPF_SSE(xmm0, xmm1, xmm10, xmm11, xmm12, xmm10outi);
+			xmm10out = _mm_castsi128_ps( xmm10outi );
+			_mm_storeu_ps( x + idx, xmm10out ); // keep exp(x) for pass 2
+			xmm0d = _mm_cvtps_pd( xmm10out );
+			xmm1d = _mm_cvtps_pd( _mm_permute_ps( xmm10out, 0x4E ) ); //0 1 2 3 -> 2 3 0 1
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+
+			xmm1d = _mm_permute_pd( xmm0d, 0x01);
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+			exp_sum[1] = _mm_cvtsd_f64( xmm0d );
+			exp_sum[0] += exp_sum[1];
+
+			idx = idx + 4;
+		}
+		// non vector remainder block.
+		if ( n_part4_rem > 0 )
+		{
+			float temp_fl_buf[4] = { 0.0 };
+			memcpy( temp_fl_buf, x + idx, n_part4_rem * sizeof( float ) );
+
+			__m128 xmm10, xmm11, xmm12, xmm10out;
+			__m128i xmm10outi;
+
+			xmm0 = _mm_loadu_ps( temp_fl_buf );
+			EXPF_SSE(xmm0, xmm1, xmm10, xmm11, xmm12, xmm10outi);
+			xmm10out = _mm_castsi128_ps( xmm10outi );
+			_mm_storeu_ps( temp_fl_buf, xmm10out ); // keep exp(x) of the valid elements for pass 2
+			memcpy( x + idx, temp_fl_buf, n_part4_rem * sizeof( float ) );
+			xmm0d = _mm_cvtps_pd( xmm10out );
+			xmm1d = _mm_cvtps_pd( _mm_permute_ps( xmm10out, 0x4E ) ); //0 1 2 3 -> 2 3 0 1
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+
+			xmm1d = _mm_permute_pd( xmm0d, 0x01);
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+			exp_sum[1] = _mm_cvtsd_f64( xmm0d );
+			exp_sum[0] += exp_sum[1];
+			// Only n_part4_rem elems are valid, need to discount the padding.
+			// This is because exp(0)=1;
+			exp_sum[0] -= ( 4 - n_part4_rem );
+		}
+
+		// Broadcast the double exp sum.
+		exp_sum[1] = exp_sum[0];
+		__m128d exp_red_xmm0 = _mm_loadu_pd( exp_sum );
+		__m256d exp_red_ymm0 = _mm256_broadcastsd_pd( exp_red_xmm0 );
+
+		// Pass 2: x now holds exp(x); divide it by the sum - avx2 block.
+		for ( idx = 0; idx < n_part8; idx += 8 )
+		{
+			ymm0 = _mm256_loadu_ps( x + idx );
+
+			// Convert to double
+			xmm0 = _mm256_castps256_ps128( ymm0 );
+			xmm1 = _mm256_extractf128_ps( ymm0, 0x1 );
+			ymm0d = _mm256_cvtps_pd( xmm0 );
+			ymm1d = _mm256_cvtps_pd( xmm1 );
+
+			// Divide at double level
+			ymm0d = _mm256_div_pd( ymm0d, exp_red_ymm0 );
+			ymm1d = _mm256_div_pd( ymm1d, exp_red_ymm0 );
+
+			xmm0 = _mm256_cvtpd_ps( ymm0d );
+			xmm1 = _mm256_cvtpd_ps( ymm1d );
+
+			_mm_storeu_ps( x + idx, xmm0 );
+			_mm_storeu_ps( x + idx + 4, xmm1 );
+		}
+		// sse remainder block.
+		if ( n_part4 == 1 )
+		{
+			xmm0 = _mm_loadu_ps( x + idx );
+
+			// Convert to double
+			xmm0d = _mm_cvtps_pd( xmm0 );
+			xmm1d = _mm_cvtps_pd( _mm_permute_ps( xmm0, 0x4E ) ); //0 1 2 3 -> 2 3 0 1
+
+			// Divide at double level
+			xmm0d = _mm_div_pd( xmm0d, exp_red_xmm0 );
+			xmm1d = _mm_div_pd( xmm1d, exp_red_xmm0 );
+
+			xmm0 = _mm_cvtpd_ps( xmm0d );
+			xmm1 = _mm_cvtpd_ps( xmm1d );
+			xmm1 = _mm_permute_ps( xmm1, 0x4E );
+			xmm0 = _mm_blend_ps( xmm0, xmm1, 0xC); // Combine outputs from 2 registers.
+
+			_mm_storeu_ps( x + idx, xmm0 );
+
+			idx = idx + 4;
+		}
+		// non vector remainder block.
+		if ( n_part4_rem > 0 )
+		{
+			float temp_fl_buf[4] = { 0.0 };
+			memcpy( temp_fl_buf, x + idx, n_part4_rem * sizeof( float ) );
+
+			xmm0 = _mm_loadu_ps( temp_fl_buf );
+
+			// Convert to double
+			xmm0d = _mm_cvtps_pd( xmm0 );
+			xmm1d = _mm_cvtps_pd( _mm_permute_ps( xmm0, 0x4E ) ); //0 1 2 3 -> 2 3 0 1
+
+			// Divide at double level
+			xmm0d = _mm_div_pd( xmm0d, exp_red_xmm0 );
+			xmm1d = _mm_div_pd( xmm1d, exp_red_xmm0 );
+
+			xmm0 = _mm_cvtpd_ps( xmm0d );
+			xmm1 = _mm_cvtpd_ps( xmm1d );
+			xmm1 = _mm_permute_ps( xmm1, 0x4E );
+			xmm0 = _mm_blend_ps( xmm0, xmm1, 0xC);
+
+			_mm_storeu_ps( temp_fl_buf, xmm0 );
+			memcpy( x + idx, temp_fl_buf, n_part4_rem * sizeof( float ) );
+		}
+	}
+	// For non unit increment, use non-vectorized code.
+	else
+	{
+		double exp_sum = 0.0;
+		dim_t n_incx = n * incx;
+
+		// Pass 1: exponentiate in place and reduce the exponentials.
+		for ( dim_t idx = 0; idx < n_incx; idx += incx )
+		{
+			float temp_val = expf( *( x + idx ) );
+			*( x + idx ) = temp_val;
+			exp_sum += (double)( temp_val );
+		}
+		// Pass 2: x now holds exp(x); divide it by the sum.
+		for ( dim_t idx = 0; idx < n_incx; idx += incx )
+		{
+			float temp_val = *( x + idx );
+			*( x + idx ) = ( float )( ( double ) temp_val / exp_sum );
+		}
+	}
+}
+#endif
diff --git a/kernels/zen/lpgemm/math_utils_avx2.h b/kernels/zen/lpgemm/math_utils_avx2.h
new file mode 100644
index 0000000000..e705adb8f7
--- /dev/null
+++ b/kernels/zen/lpgemm/math_utils_avx2.h
@@ -0,0 +1,166 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef AOCL_LPGEMM_MATH_UTILS_AVX2_H
+#define AOCL_LPGEMM_MATH_UTILS_AVX2_H
+
+// Constants for the single-precision exp approximation (POLY_EVAL_6_* and EXPF_* below).
+#define lpgemm_exp_c0 0x1.0000014439a91p0    // polynomial coefficient of r^0 (~1.0)
+#define lpgemm_exp_c1 0x1.62e43170e3344p-1   // coefficient of r^1 (~0.693147)
+#define lpgemm_exp_c2 0x1.ebf906bc4c115p-3   // coefficient of r^2
+#define lpgemm_exp_c3 0x1.c6ae2bb88c0c8p-5   // coefficient of r^3
+#define lpgemm_exp_c4 0x1.3d1079db4ef69p-7   // coefficient of r^4
+#define lpgemm_exp_c5 0x1.5f8905cb0cc4ep-10  // coefficient of r^5
+// NOTE(review): `inf` and `sign` are very generic macro names; kept unchanged because code outside this header may use them.
+#define TBL_LN2 0x1.71547652b82fep+0  // ~1.442695, i.e. log2(e) = 1/ln(2) despite the name
+#define EXPF_HUGE 0x1.8p+23           // 1.5 * 2^23; EXPF_* adds then subtracts it to round to an integer
+#define EXPF_MIN (-88.7228393f)       // not referenced by the macros in this header (they clamp at -88.0)
+#define EXPF_MAX 88.7228393f          // not referenced by the macros in this header (they clamp at 88.0)
+#define inf (1.0/0.0)                 // +infinity; parenthesized so that e.g. 1.0/inf is not parsed as (1.0/1.0)/0.0
+#define sign (-2147483648)            // narrows to the 32-bit pattern 0x80000000, used as the float sign-bit mask
+
+// Coefficients c0..c15 for the erf approximation; POLY_EVAL_HORNER_16_0_* computes r*(c0 + c1*r + ... + c15*r^15).
+#define lpgemm_erf_c0 0x1.20dd7890d27e1cec99fce48c29cp0 // ~1.12838 (close to 2/sqrt(pi))
+#define lpgemm_erf_c1 -0x1.ab4bed70f238422edeeba9c558p-16
+#define lpgemm_erf_c2 -0x1.80a1bd5878e0b0689c5ff4fcdd4p-2
+#define lpgemm_erf_c3 -0x1.07cb4cde6a7d9528c8a732990e4p-8
+#define lpgemm_erf_c4 0x1.092cba598f96f00ddc5854cf7cp-3
+#define lpgemm_erf_c5 -0x1.51f0ce4ac87c55f11f685864714p-5
+#define lpgemm_erf_c6 0x1.4101f320bf8bc4d41c228faaa6cp-5
+#define lpgemm_erf_c7 -0x1.2300882a7d1b712726997de80ep-4
+#define lpgemm_erf_c8 0x1.d45745fff0e4b6d0604a9ab6284p-5
+#define lpgemm_erf_c9 -0x1.9eb1491956e31ded96176d7c8acp-6
+#define lpgemm_erf_c10 0x1.b9183fc75d326b9044bc63c9694p-8
+#define lpgemm_erf_c11 -0x1.10e8f8c89ad8645e7d769cd596cp-10
+#define lpgemm_erf_c12 0x1.224ffc80cc19957a48ecedad6c8p-14
+#define lpgemm_erf_c13 0x1.12a30f42c71308321e7e7cb0174p-18
+#define lpgemm_erf_c14 -0x1.155445e2e006723066d72d22ddcp-20
+#define lpgemm_erf_c15 0x1.c6a4181da4ef76f22bd39bb5dcp-25 // the literals carry more hex digits than a double holds; use sites narrow them to float via _mm*_set1_ps
+
+// Approximations of EXP, TANH and ERF on 256-bit (AVX2) float vectors.
+
+#define POLY_EVAL_6_AVX2(r, r2, z) /* r <- c0 + c1*r + ... + c5*r^5 per float lane (lpgemm_exp_c0..c5); r2 and z are overwritten as scratch */ \
+    r2 = _mm256_mul_ps (r, r); /* r2 = r^2 */ \
+    z = _mm256_fmadd_ps (r2, _mm256_fmadd_ps (r, _mm256_set1_ps(lpgemm_exp_c3), _mm256_set1_ps(lpgemm_exp_c2)), \
+        _mm256_fmadd_ps (r, _mm256_set1_ps(lpgemm_exp_c1), _mm256_set1_ps(lpgemm_exp_c0))); /* z = c0 + c1*r + c2*r^2 + c3*r^3 */ \
+    r2 = _mm256_mul_ps (r2, r2); /* r2 = r^4 */ \
+    r = _mm256_fmadd_ps (r2, _mm256_fmadd_ps (r, _mm256_set1_ps(lpgemm_exp_c5), _mm256_set1_ps(lpgemm_exp_c4)), z); /* last line: no trailing backslash, so the next source line is never spliced into the macro */
+
+#define EXPF_AVX2(x, r, r2, z, dn, q) /* q <- bit pattern of an exp(x) approximation per float lane; x is only read; r, r2, z, dn are overwritten as scratch */ \
+    z = _mm256_mul_ps (x, _mm256_set1_ps(TBL_LN2)); /* z = x * log2(e) */	\
+	  dn = _mm256_add_ps (z , _mm256_set1_ps(EXPF_HUGE)); /* adding 1.5*2^23 rounds z to an integer n, left in the low mantissa bits of dn (valid for |z| < 2^22) */  \
+    r = _mm256_sub_ps (z , _mm256_sub_ps (dn , _mm256_set1_ps(EXPF_HUGE))); /* r = z - n, the remainder after rounding */  \
+/* r <- polynomial in r built from lpgemm_exp_c0..c5 (POLY_EVAL_6_AVX2 overwrites r, r2 and z). */ \
+    POLY_EVAL_6_AVX2 (r, r2, z); \
+/* Add (n << 23) to the float bits of r, i.e. add n to its exponent field; then clamp. Compare predicate 1 is _CMP_LT_OS (a < b). */ \
+    q = _mm256_add_epi32((__m256i) (r), _mm256_sllv_epi32 ((__m256i)dn, _mm256_set1_epi32 (23)) ); \
+    q =  (__m256i)_mm256_blendv_ps ((__m256)q, _mm256_set1_ps(inf), _mm256_cmp_ps (_mm256_set1_ps(88.0), x, 1)); /* lanes with 88.0 < x become +inf */ \
+    q =  (__m256i)_mm256_blendv_ps ((__m256)q, _mm256_set1_ps(0.0), _mm256_cmp_ps (x, _mm256_set1_ps(-88.0), 1)); /* lanes with x < -88.0 become 0.0 */
+
+#define TANHF_AVX2(x_tanh, r, r2, x, z, dn, q) /* x_tanh <- tanh(x_tanh) approximation per float lane, in place; r, r2, x, z, dn, q are overwritten as scratch */ \
+    x = _mm256_mul_ps (_mm256_andnot_ps(_mm256_set1_ps(-0.0f), x_tanh), _mm256_set1_ps(-2) ); /* x = -2 * |x_tanh|; the andnot with -0.0f clears the sign bit */ \
+/* q <- float bits of e = exp(-2 * |x_tanh|). */ \
+    EXPF_AVX2(x, r, r2, z, dn, q); \
+/* tanh(|x_tanh|) = -(e - 1) / (e + 1), evaluated as z = e - 1; z = z / (z + 2); z = -z. */ \
+    z =  _mm256_add_ps ((__m256)q, _mm256_set1_ps(-1)); \
+    z = _mm256_div_ps (z, _mm256_add_ps (z, _mm256_set1_ps(2))); \
+    z = _mm256_mul_ps (z, _mm256_set1_ps(-1)); \
+    x_tanh = (_mm256_xor_ps (_mm256_and_ps (x_tanh, (__m256)(_mm256_set1_epi32(sign))), z)) ; /* XOR the input's sign bit into z so the result follows the sign of the input */
+
+#define POLY_EVAL_HORNER_16_0_AVX2(r,x) /* x <- r * (c0 + c1*r + ... + c15*r^15) per float lane, Horner form with lpgemm_erf_c0..c15; r is only read */ \
+    x = _mm256_mul_ps (_mm256_fmadd_ps ( \
+    _mm256_fmadd_ps(_mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_fmadd_ps ( _mm256_fmadd_ps ( \
+    _mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_fmadd_ps ( \
+    _mm256_fmadd_ps ( _mm256_fmadd_ps (r, _mm256_set1_ps(lpgemm_erf_c15), _mm256_set1_ps(lpgemm_erf_c14)), r, _mm256_set1_ps(lpgemm_erf_c13)), \
+    r, _mm256_set1_ps(lpgemm_erf_c12)), r,  _mm256_set1_ps(lpgemm_erf_c11)), r, _mm256_set1_ps(lpgemm_erf_c10)), r, _mm256_set1_ps(lpgemm_erf_c9)), \
+    r, _mm256_set1_ps(lpgemm_erf_c8)), r, _mm256_set1_ps(lpgemm_erf_c7)), r, _mm256_set1_ps(lpgemm_erf_c6)), r, _mm256_set1_ps(lpgemm_erf_c5)), r, \
+    _mm256_set1_ps(lpgemm_erf_c4)), r, _mm256_set1_ps(lpgemm_erf_c3)), r, _mm256_set1_ps(lpgemm_erf_c2)), r, _mm256_set1_ps(lpgemm_erf_c1)), r, \
+    _mm256_set1_ps(lpgemm_erf_c0)), r); /* last line: no trailing backslash, so the next source line is never spliced into the macro */
+
+#define ERF_AVX2(x_erf, r, x) /* x_erf <- erf(x_erf) approximation per float lane, in place; r and x are overwritten as scratch */ \
+    r = _mm256_and_ps (x_erf, (__m256)_mm256_set1_epi32(0x7FFFFFFF)); /* r = |x_erf| (sign bit masked off) */ \
+/* x <- r * P(r), with P built from lpgemm_erf_c0..c15. */ \
+    POLY_EVAL_HORNER_16_0_AVX2(r,x); \
+/* Lanes with 3.9192... < r are set to 1 (compare predicate 1 is _CMP_LT_OS); the input's sign bit is then OR-ed back in. */ \
+    x = _mm256_blendv_ps (x, _mm256_set1_ps(1), _mm256_cmp_ps (_mm256_set1_ps(3.9192059040069580078125f), r, 1)); \
+    x_erf = _mm256_or_ps(_mm256_and_ps (x_erf, (__m256)_mm256_set1_epi32(~(0x7FFFFFFF))), x);
+
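+// Approximations of EXP, TANH and ERF on 128-bit float vectors. Despite the _SSE suffix they use FMA (_mm_fmadd_ps), AVX (_mm_cmp_ps) and AVX2 (_mm_sllv_epi32) intrinsics.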
+
+#define POLY_EVAL_6_SSE(r, r2, z) /* r <- c0 + c1*r + ... + c5*r^5 per float lane (lpgemm_exp_c0..c5); r2 and z are overwritten as scratch */ \
+    r2 = _mm_mul_ps (r, r); /* r2 = r^2 */ \
+    z = _mm_fmadd_ps (r2, _mm_fmadd_ps (r, _mm_set1_ps(lpgemm_exp_c3), _mm_set1_ps(lpgemm_exp_c2)), \
+        _mm_fmadd_ps (r, _mm_set1_ps(lpgemm_exp_c1), _mm_set1_ps(lpgemm_exp_c0))); /* z = c0 + c1*r + c2*r^2 + c3*r^3 */ \
+    r2 = _mm_mul_ps (r2, r2); /* r2 = r^4 */ \
+    r = _mm_fmadd_ps (r2, _mm_fmadd_ps (r, _mm_set1_ps(lpgemm_exp_c5), _mm_set1_ps(lpgemm_exp_c4)), z); /* last line: no trailing backslash, so the next source line is never spliced into the macro */
+
+#define EXPF_SSE(x, r, r2, z, dn, q) /* q <- bit pattern of an exp(x) approximation per float lane (128-bit); x is only read; r, r2, z, dn are overwritten as scratch */ \
+    z = _mm_mul_ps (x, _mm_set1_ps(TBL_LN2)); /* z = x * log2(e) */	\
+	  dn = _mm_add_ps (z , _mm_set1_ps(EXPF_HUGE)); /* adding 1.5*2^23 rounds z to an integer n, left in the low mantissa bits of dn (valid for |z| < 2^22) */  \
+    r = _mm_sub_ps (z , _mm_sub_ps (dn , _mm_set1_ps(EXPF_HUGE))); /* r = z - n, the remainder after rounding */  \
+/* r <- polynomial in r built from lpgemm_exp_c0..c5 (POLY_EVAL_6_SSE overwrites r, r2 and z). */ \
+    POLY_EVAL_6_SSE (r, r2, z); \
+/* Add (n << 23) to the float bits of r, i.e. add n to its exponent field; then clamp. Compare predicate 1 is _CMP_LT_OS (a < b). */ \
+    q = _mm_add_epi32((__m128i) (r), _mm_sllv_epi32 ((__m128i)dn, _mm_set1_epi32 (23)) ); \
+    q =  (__m128i)_mm_blendv_ps ((__m128)q, _mm_set1_ps(inf), _mm_cmp_ps (_mm_set1_ps(88.0), x, 1)); /* lanes with 88.0 < x become +inf */ \
+    q =  (__m128i)_mm_blendv_ps ((__m128)q, _mm_set1_ps(0.0), _mm_cmp_ps (x, _mm_set1_ps(-88.0), 1)); /* lanes with x < -88.0 become 0.0 */
+
+#define TANHF_SSE(x_tanh, r, r2, x, z, dn, q) /* x_tanh <- tanh(x_tanh) approximation per float lane (128-bit), in place; r, r2, x, z, dn, q are overwritten as scratch */ \
+    x = _mm_mul_ps (_mm_andnot_ps(_mm_set1_ps(-0.0f), x_tanh), _mm_set1_ps(-2) ); /* x = -2 * |x_tanh|; the andnot with -0.0f clears the sign bit */ \
+/* q <- float bits of e = exp(-2 * |x_tanh|). */ \
+    EXPF_SSE(x, r, r2, z, dn, q); \
+/* tanh(|x_tanh|) = -(e - 1) / (e + 1), evaluated as z = e - 1; z = z / (z + 2); z = -z. */ \
+    z =  _mm_add_ps ((__m128)q, _mm_set1_ps(-1)); \
+    z = _mm_div_ps (z, _mm_add_ps (z, _mm_set1_ps(2))); \
+    z = _mm_mul_ps (z, _mm_set1_ps(-1)); \
+    x_tanh = (_mm_xor_ps (_mm_and_ps (x_tanh, (__m128)(_mm_set1_epi32(sign))), z)) ; /* XOR the input's sign bit into z so the result follows the sign of the input */
+
+#define POLY_EVAL_HORNER_16_0_SSE(r,x) /* x <- r * (c0 + c1*r + ... + c15*r^15) per float lane, Horner form with lpgemm_erf_c0..c15; r is only read */ \
+    x = _mm_mul_ps (_mm_fmadd_ps ( \
+    _mm_fmadd_ps(_mm_fmadd_ps (_mm_fmadd_ps (_mm_fmadd_ps (_mm_fmadd_ps ( _mm_fmadd_ps ( \
+    _mm_fmadd_ps (_mm_fmadd_ps (_mm_fmadd_ps (_mm_fmadd_ps (_mm_fmadd_ps (_mm_fmadd_ps ( \
+    _mm_fmadd_ps ( _mm_fmadd_ps (r, _mm_set1_ps(lpgemm_erf_c15), _mm_set1_ps(lpgemm_erf_c14)), r, _mm_set1_ps(lpgemm_erf_c13)), \
+    r, _mm_set1_ps(lpgemm_erf_c12)), r,  _mm_set1_ps(lpgemm_erf_c11)), r, _mm_set1_ps(lpgemm_erf_c10)), r, _mm_set1_ps(lpgemm_erf_c9)), \
+    r, _mm_set1_ps(lpgemm_erf_c8)), r, _mm_set1_ps(lpgemm_erf_c7)), r, _mm_set1_ps(lpgemm_erf_c6)), r, _mm_set1_ps(lpgemm_erf_c5)), r, \
+    _mm_set1_ps(lpgemm_erf_c4)), r, _mm_set1_ps(lpgemm_erf_c3)), r, _mm_set1_ps(lpgemm_erf_c2)), r, _mm_set1_ps(lpgemm_erf_c1)), r, \
+    _mm_set1_ps(lpgemm_erf_c0)), r); /* last line: no trailing backslash, so the next source line is never spliced into the macro */
+
+#define ERF_SSE(x_erf, r, x) /* x_erf <- erf(x_erf) approximation per float lane (128-bit), in place; r and x are overwritten as scratch */ \
+    r = _mm_and_ps (x_erf, (__m128)_mm_set1_epi32(0x7FFFFFFF)); /* r = |x_erf| (sign bit masked off) */ \
+/* x <- r * P(r), with P built from lpgemm_erf_c0..c15. */ \
+    POLY_EVAL_HORNER_16_0_SSE(r,x); \
+/* Lanes with 3.9192... < r are set to 1 (compare predicate 1 is _CMP_LT_OS); the input's sign bit is then OR-ed back in. */ \
+    x = _mm_blendv_ps (x, _mm_set1_ps(1), _mm_cmp_ps (_mm_set1_ps(3.9192059040069580078125f), r, 1)); \
+    x_erf = _mm_or_ps(_mm_and_ps (x_erf, (__m128)_mm_set1_epi32(~(0x7FFFFFFF))), x);
+
+#endif // AOCL_LPGEMM_MATH_UTILS_AVX2_H
diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c
new file mode 100644
index 0000000000..8b41f0e6da
--- /dev/null
+++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_6x32rowmajor_amd256.c
@@ -0,0 +1,934 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s16/lpgemm_s16_kern_macros.h"
+
+// 6x32 int8o16 kernel
+LPGEMM_MAIN_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x32)
+{
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_6x32_DISABLE,
+			&&POST_OPS_BIAS_6x32,
+			&&POST_OPS_RELU_6x32,
+			&&POST_OPS_RELU_SCALE_6x32,
+			&&POST_OPS_GELU_TANH_6x32,
+			&&POST_OPS_GELU_ERF_6x32,
+			&&POST_OPS_CLIP_6x32,
+			&&POST_OPS_DOWNSCALE_6x32
+		};
+
+	dim_t MR = 6;
+	dim_t NR = 32;
+
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// When n fringe cases are encountered
+	if (n0 < NR)
+	{
+		// Split into multiple smaller fringe kernels, so as to maximize
+		// vectorization after packing. Any n0 < NR(32) can be expressed
+		// as n0 = 16 + n`.
+		dim_t n0_rem = n0 % 16;
+		dim_t n0_16 = n0 / 16;
+		dim_t k0_updated = k0;
+
+		// Making multiple of 2 to suit k in vpmaddubsw
+		k0_updated += (k0_updated & 0x1);
+
+		if (n0_16 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_6x16(
+				m0, k0,
+				a, rs_a, cs_a, ps_a,
+				b, ((rs_b / 2) * 1), cs_b,
+				c, rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+
+			b = b + (16 * k0_updated);
+			c = c + 16;
+			post_ops_attr.post_op_c_j += 16;
+			post_ops_attr.b_sum_offset += 16;
+		}
+
+		if (n0_rem > 0)
+		{
+			lpgemm_rowvar_s8s8s16o16_6xlt16(
+				m0, k0,
+				a, rs_a, cs_a, ps_a,
+				b, ((rs_b / 2) * 1), cs_b,
+				c, rs_c,
+				alpha, beta, n0_rem,
+				post_ops_list, post_ops_attr);
+		}
+
+		// If fringe cases are encountered, return early
+		return;
+	}
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	for (dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR)
+	{
+
+		_mm256_zeroupper();
+
+		// Registers to use for accumulating C.
+		__m256i c_int16_0p0 = _mm256_setzero_si256();
+		__m256i c_int16_0p1 = _mm256_setzero_si256();
+
+		__m256i c_int16_1p0 = _mm256_setzero_si256();
+		__m256i c_int16_1p1 = _mm256_setzero_si256();
+
+		__m256i c_int16_2p0 = _mm256_setzero_si256();
+		__m256i c_int16_2p1 = _mm256_setzero_si256();
+
+		__m256i c_int16_3p0 = _mm256_setzero_si256();
+		__m256i c_int16_3p1 = _mm256_setzero_si256();
+
+		__m256i c_int16_4p0 = _mm256_setzero_si256();
+		__m256i c_int16_4p1 = _mm256_setzero_si256();
+
+		__m256i c_int16_5p0 = _mm256_setzero_si256();
+		__m256i c_int16_5p1 = _mm256_setzero_si256();
+
+		for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+		{
+			dim_t offset = kr * 2;
+
+			// Broadcast a[0,kr:kr+2].
+			__m256i a_int32_0 =
+					_mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0)
+											+ (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			__m256i b0 = 
+					_mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
+			__m256i b1 = 
+					_mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
+
+			// Seperate register for intermediate op
+			__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_0p1 = _mm256_add_epi16(inter_vec, c_int16_0p1);
+
+			// Broadcast a[1,kr:kr+2].
+			a_int32_0 =
+				_mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );    
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_1p1 = _mm256_add_epi16(inter_vec, c_int16_1p1);
+
+			// Broadcast a[2,kr:kr+2].
+			a_int32_0 = 
+				_mm256_set1_epi16(*(int16_t *)(a + (rs_a * 2) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );    
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_2p1 = _mm256_add_epi16(inter_vec, c_int16_2p1);
+
+			// Broadcast a[3,kr:kr+2].
+			a_int32_0 = 
+				_mm256_set1_epi16(*(int16_t *)(a + (rs_a * 3) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );    
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_3p1 = _mm256_add_epi16(inter_vec, c_int16_3p1);
+
+			// Broadcast a[4,kr:kr+2].
+			a_int32_0 =
+				_mm256_set1_epi16(*(int16_t *)(a + (rs_a * 4) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );    
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+4,0-31]
+			c_int16_4p0 = _mm256_add_epi16(inter_vec, c_int16_4p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+
+			c_int16_4p1 = _mm256_add_epi16(inter_vec, c_int16_4p1);
+
+			// Broadcast a[5,kr:kr+2].
+			a_int32_0 = 
+				_mm256_set1_epi16(*(int16_t *)(a + (rs_a * 5) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );    
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+4,0-31]
+			c_int16_5p0 = _mm256_add_epi16(inter_vec, c_int16_5p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_5p1 = _mm256_add_epi16(inter_vec, c_int16_5p1);
+		}
+
+		// Handle k remainder.
+		if (k_partial_pieces > 0)
+		{
+
+			__m256i b0 = _mm256_loadu_si256((__m256i const *)
+							(b + (64 * k_full_pieces) + (NR * 0)));
+			__m256i b1 = _mm256_loadu_si256((__m256i const *)
+							(b + (64 * k_full_pieces) + (NR * 1)));
+
+			int8_t a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+			__m256i a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			__m256i inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_0p1 = _mm256_add_epi16(inter_vec, c_int16_0p1);
+
+			a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+4,0-31]
+			c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_1p1 = _mm256_add_epi16(inter_vec, c_int16_1p1);
+
+			a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+
+			c_int16_2p1 = _mm256_add_epi16(inter_vec, c_int16_2p1);
+
+			a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_3p1 = _mm256_add_epi16(inter_vec, c_int16_3p1);
+
+			a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_4p0 = _mm256_add_epi16(inter_vec, c_int16_4p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_4p1 = _mm256_add_epi16(inter_vec, c_int16_4p1);
+
+			a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_5p0 = _mm256_add_epi16(inter_vec, c_int16_5p0);
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b1);
+			c_int16_5p1 = _mm256_add_epi16(inter_vec, c_int16_5p1);
+		}
+        if ( post_ops_attr.is_last_k == 1 )
+		{
+            //Subtract B matrix sum column values to compensate 
+			//for addition of 128 to A matrix elements
+
+            int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+            __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+            c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+			c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+			c_int16_2p0 = _mm256_sub_epi16( c_int16_2p0 , b0 );
+			c_int16_3p0 = _mm256_sub_epi16( c_int16_3p0 , b0 );
+			c_int16_4p0 = _mm256_sub_epi16( c_int16_4p0 , b0 );
+			c_int16_5p0 = _mm256_sub_epi16( c_int16_5p0 , b0 );
+
+            b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr + 16) );
+
+            c_int16_0p1 = _mm256_sub_epi16( c_int16_0p1 , b0 );
+			c_int16_1p1 = _mm256_sub_epi16( c_int16_1p1 , b0 );
+			c_int16_2p1 = _mm256_sub_epi16( c_int16_2p1 , b0 );
+			c_int16_3p1 = _mm256_sub_epi16( c_int16_3p1 , b0 );
+			c_int16_4p1 = _mm256_sub_epi16( c_int16_4p1 , b0 );
+			c_int16_5p1 = _mm256_sub_epi16( c_int16_5p1 , b0 );
+        }
+
+		// Load alpha and beta
+		__m256i alphav = _mm256_set1_epi16(alpha);
+		__m256i betav = _mm256_set1_epi16(beta);
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int16_0p0 = _mm256_mullo_epi16(alphav, c_int16_0p0);
+			c_int16_0p1 = _mm256_mullo_epi16(alphav, c_int16_0p1);
+
+			c_int16_1p0 = _mm256_mullo_epi16(alphav, c_int16_1p0);
+			c_int16_1p1 = _mm256_mullo_epi16(alphav, c_int16_1p1);
+
+			c_int16_2p0 = _mm256_mullo_epi16(alphav, c_int16_2p0);
+			c_int16_2p1 = _mm256_mullo_epi16(alphav, c_int16_2p1);
+
+			c_int16_3p0 = _mm256_mullo_epi16(alphav, c_int16_3p0);
+			c_int16_3p1 = _mm256_mullo_epi16(alphav, c_int16_3p1);
+
+			c_int16_4p0 = _mm256_mullo_epi16(alphav, c_int16_4p0);
+			c_int16_4p1 = _mm256_mullo_epi16(alphav, c_int16_4p1);
+
+			c_int16_5p0 = _mm256_mullo_epi16(alphav, c_int16_5p0);
+			c_int16_5p1 = _mm256_mullo_epi16(alphav, c_int16_5p1);
+		}
+
+		// Scale C by beta.
+		if (beta != 0)
+		{
+			// For the downscaled api (C-s8), the output C matrix values
+			// needs to be upscaled to s16 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				S8_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav)
+
+				// c[0, 16-31]
+				S8_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav)
+
+				// c[1,0-15]
+				S8_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav)
+
+				// c[1,16-31]
+				S8_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav)
+
+				// c[2,0-15]
+				S8_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav)
+
+				// c[2,16-31]
+				S8_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav)
+
+				// c[3,0-15]
+				S8_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav)
+
+				// c[3,16-31]
+				S8_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav)
+
+				// c[4,0-15]
+				S8_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav)
+
+				// c[4,16-31]
+				S8_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav)
+
+				// c[5,0-15]
+				S8_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav)
+
+				// c[5,16-31]
+				S8_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav)
+			}
+			else
+			{
+				// c[0,0-15]
+				S16_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav)
+
+				// c[0, 16-31]
+				S16_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav)
+
+				// c[1,0-15]
+				S16_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav)
+
+				// c[1,16-31]
+				S16_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav)
+
+				// c[2,0-15]
+				S16_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav)
+
+				// c[2,16-31]
+				S16_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav)
+
+				// c[3,0-15]
+				S16_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav)
+
+				// c[3,16-31]
+				S16_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav)
+
+				// c[4,0-15]
+				S16_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav)
+
+				// c[4,16-31]
+				S16_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav)
+
+				// c[5,0-15]
+				S16_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav)
+
+				// c[5,16-31]
+				S16_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav)
+			}
+		}
+
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x32:
+		{
+			__m256i selector1 =
+				_mm256_loadu_si256( (__m256i const *)(
+					(int16_t *)post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+			__m256i selector2 =
+				_mm256_loadu_si256( (__m256i const *)(
+					(int16_t *)post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 )) );
+
+			// c[0,0-15]
+			c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+
+			// c[0, 16-31]
+			c_int16_0p1 = _mm256_add_epi16( selector2, c_int16_0p1 );
+
+			// c[1,0-15]
+			c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+			// c[1, 16-31]
+			c_int16_1p1 = _mm256_add_epi16( selector2, c_int16_1p1 );
+
+			// c[2,0-15]
+			c_int16_2p0 = _mm256_add_epi16( selector1, c_int16_2p0 );
+
+			// c[2, 16-31]
+			c_int16_2p1 = _mm256_add_epi16( selector2, c_int16_2p1 );
+
+			// c[3,0-15]
+			c_int16_3p0 = _mm256_add_epi16( selector1, c_int16_3p0 );
+
+			// c[3, 16-31]
+			c_int16_3p1 = _mm256_add_epi16( selector2, c_int16_3p1 );
+
+			// c[4,0-15]
+			c_int16_4p0 = _mm256_add_epi16( selector1, c_int16_4p0 );
+
+			// c[4, 16-31]
+			c_int16_4p1 = _mm256_add_epi16( selector2, c_int16_4p1 );
+
+			// c[5,0-15]
+			c_int16_5p0 = _mm256_add_epi16( selector1, c_int16_5p0 );
+
+			// c[5, 16-31]
+			c_int16_5p1 = _mm256_add_epi16( selector2, c_int16_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x32:
+		{
+			__m256i selector1 = _mm256_setzero_si256 ();
+
+			// c[0,0-15]
+			c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+			// c[0, 16-31]
+			c_int16_0p1 = _mm256_max_epi16( selector1, c_int16_0p1 );
+
+			// c[1,0-15]
+			c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+			// c[1,16-31]
+			c_int16_1p1 = _mm256_max_epi16( selector1, c_int16_1p1 );
+
+			// c[2,0-15]
+			c_int16_2p0 = _mm256_max_epi16( selector1, c_int16_2p0 );
+
+			// c[2,16-31]
+			c_int16_2p1 = _mm256_max_epi16( selector1, c_int16_2p1 );
+
+			// c[3,0-15]
+			c_int16_3p0 = _mm256_max_epi16( selector1, c_int16_3p0 );
+
+			// c[3,16-31]
+			c_int16_3p1 = _mm256_max_epi16( selector1, c_int16_3p1 );
+
+			// c[4,0-15]
+			c_int16_4p0 = _mm256_max_epi16( selector1, c_int16_4p0 );
+
+			// c[4,16-31]
+			c_int16_4p1 = _mm256_max_epi16( selector1, c_int16_4p1 );
+
+			// c[5,0-15]
+			c_int16_5p0 = _mm256_max_epi16( selector1, c_int16_5p0 );
+
+			// c[5,16-31]
+			c_int16_5p1 = _mm256_max_epi16( selector1, c_int16_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x32:
+		{
+			__m256i selector2 =
+				_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+			__m256i selector1, b0;
+
+			// c[0,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+			// c[0,16-31]
+			RELU_SCALE_OP_S16_AVX2(c_int16_0p1)
+
+			// c[1,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+			// c[1,16-31]
+			RELU_SCALE_OP_S16_AVX2(c_int16_1p1)
+
+			// c[2,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_2p0)
+
+			// c[2,16-31]
+			RELU_SCALE_OP_S16_AVX2(c_int16_2p1)
+
+			// c[3,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_3p0)
+
+			// c[3,16-31]
+			RELU_SCALE_OP_S16_AVX2(c_int16_3p1)
+
+			// c[4,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_4p0)
+
+			// c[4,16-31]
+			RELU_SCALE_OP_S16_AVX2(c_int16_4p1)
+
+			// c[5,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_5p0)
+
+			// c[5,16-31]
+			RELU_SCALE_OP_S16_AVX2(c_int16_5p1)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x32:
+		{
+			__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+			__m256i q;
+
+			// c[0,0-15]
+			GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0,16-31]
+			GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,0-15]
+			GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,16-31]
+			GELU_TANH_S16_AVX2(c_int16_1p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,0-15]
+			GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,16-31]
+			GELU_TANH_S16_AVX2(c_int16_2p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,0-15]
+			GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,16-31]
+			GELU_TANH_S16_AVX2(c_int16_3p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,0-15]
+			GELU_TANH_S16_AVX2(c_int16_4p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,16-31]
+			GELU_TANH_S16_AVX2(c_int16_4p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,0-15]
+			GELU_TANH_S16_AVX2(c_int16_5p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,16-31]
+			GELU_TANH_S16_AVX2(c_int16_5p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x32:
+		{
+			__m256 x, r, y1, y2, x_erf;
+
+			// c[0,0-15]
+			GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+			// c[0,16-31]
+			GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+			// c[1,0-15]
+			GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+			// c[1,16-31]
+			GELU_ERF_S16_AVX2(c_int16_1p1, y1, y2, r, x, x_erf)
+
+			// c[2,0-15]
+			GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+			// c[2,16-31]
+			GELU_ERF_S16_AVX2(c_int16_2p1, y1, y2, r, x, x_erf)
+
+			// c[3,0-15]
+			GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+			// c[3,16-31]
+			GELU_ERF_S16_AVX2(c_int16_3p1, y1, y2, r, x, x_erf)
+
+			// c[4,0-15]
+			GELU_ERF_S16_AVX2(c_int16_4p0, y1, y2, r, x, x_erf)
+
+			// c[4,16-31]
+			GELU_ERF_S16_AVX2(c_int16_4p1, y1, y2, r, x, x_erf)
+
+			// c[5,0-15]
+			GELU_ERF_S16_AVX2(c_int16_5p0, y1, y2, r, x, x_erf)
+
+			// c[5,16-31]
+			GELU_ERF_S16_AVX2(c_int16_5p1, y1, y2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x32:
+		{
+			__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+			__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+			// c[0,0-15]
+			CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+			// c[0,16-31]
+			CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+			// c[1,0-15]
+			CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+			// c[1,16-31]
+			CLIP_S16_AVX2(c_int16_1p1, min, max)
+
+			// c[2,0-15]
+			CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+			// c[2,16-31]
+			CLIP_S16_AVX2(c_int16_2p1, min, max)
+
+			// c[3,0-15]
+			CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+			// c[3,16-31]
+			CLIP_S16_AVX2(c_int16_3p1, min, max)
+
+			// c[4,0-15]
+			CLIP_S16_AVX2(c_int16_4p0, min, max)
+
+			// c[4,16-31]
+			CLIP_S16_AVX2(c_int16_4p1, min, max)
+
+			// c[5,0-15]
+			CLIP_S16_AVX2(c_int16_5p0, min, max)
+
+			// c[5,16-31]
+			CLIP_S16_AVX2(c_int16_5p1, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x32:
+		{
+			__m128i temp[2];
+			__m256i temp_32[2];
+			__m256 temp_float[2];
+			__m256 scale_1, scale_2;
+			__m256 res_1, res_2;
+
+			/* Load the scale vector values into the register*/
+			scale_1 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (0 * 8));
+			scale_2 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (1 * 8));
+
+			// Scale first 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2)
+
+			scale_1 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (2 * 8));
+			scale_2 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (3 * 8));
+
+			// Scale next 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p1, scale_1, scale_2)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_6x32_DISABLE:
+		;
+
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Store the results in downscaled type (int8 instead of int16).
+			// c[0,0-31]
+			CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
+
+			// c[1,0-31]
+			CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0);
+
+			// c[2,0-31]
+			CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0);
+
+			// c[3,0-31]
+			CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0);
+
+			// c[4,0-31]
+			CVT_STORE_S16_S8(c_int16_4p0, c_int16_4p1, 4, 0);
+
+			// c[5,0-31]
+			CVT_STORE_S16_S8(c_int16_5p0, c_int16_5p1, 5, 0);
+		}
+		// Case where the output C matrix is s16 or is the temp buffer used to
+		// store intermediate s16 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 0*16 )), c_int16_0p0 );
+
+			// c[0, 16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 1*16 )), c_int16_0p1 );
+
+			// c[1,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 0*16 )), c_int16_1p0 );
+
+			// c[1,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 1*16 )), c_int16_1p1 );
+
+			// c[2,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 0*16 )), c_int16_2p0 );
+
+			// c[2,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 1*16 )), c_int16_2p1 );
+
+			// c[3,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 0*16 )), c_int16_3p0 );
+
+			// c[3,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 1*16 )), c_int16_3p1 );
+
+			// c[4,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 0*16 )), c_int16_4p0 );
+
+			// c[4,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 1*16 )), c_int16_4p1 );
+
+			// c[5,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 0*16 )), c_int16_5p0 );
+
+			// c[5,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 1*16 )), c_int16_5p1 );
+		}
+		
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if (m_partial_pieces > 0)
+	{
+		// Split into multiple smaller fringe kernels, so as to maximize
+		// vectorization after packing. Any m0 < MR(6) can be expressed
+		// as a combination of numbers from the set {4, 2, 1}.
+		dim_t m_partial4 = m_partial_pieces / 4;
+		m_partial_pieces = m_partial_pieces % 4;
+
+		dim_t m_partial2 = m_partial_pieces / 2;
+		dim_t m_partial = m_partial_pieces % 2;
+
+		if (m_partial4 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_4x32(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+
+			// a pointer increment
+			a = a + (4 * ps_a);
+			m_full_pieces_loop_limit += 4;
+			post_ops_attr.post_op_c_i += 4;
+		}
+
+		if (m_partial2 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_2x32(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+
+			// a pointer increment
+			a = a + (2 * ps_a);
+			m_full_pieces_loop_limit += 2;
+			post_ops_attr.post_op_c_i += 2;
+		}
+
+		if (m_partial == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_1x32(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+			post_ops_attr.post_op_c_i += 1;
+		}
+	}
+}
+#endif
diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c
new file mode 100644
index 0000000000..8d0bea859b
--- /dev/null
+++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_m_fringe_amd256.c
@@ -0,0 +1,1261 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s16/lpgemm_s16_kern_macros.h"
+
+// 4x32 s8s8s16o16 m-fringe kernel: accumulates a 4-row x 32-column tile of C in int16 lanes.
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x32)
+{
+	dim_t NR = 32;
+	// Post-op handler labels; presumably consumed by the POST_OP_LABEL_* jump macros (defined elsewhere).
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_4x32_DISABLE,
+			&&POST_OPS_BIAS_4x32,
+			&&POST_OPS_RELU_4x32,
+			&&POST_OPS_RELU_SCALE_4x32,
+			&&POST_OPS_GELU_TANH_4x32,
+			&&POST_OPS_GELU_ERF_4x32,
+			&&POST_OPS_CLIP_4x32,
+			&&POST_OPS_DOWNSCALE_4x32
+		};
+
+	// k is consumed in pairs: vpmaddubsw multiplies adjacent byte pairs and sums each pair into one int16 lane.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// B matrix storage (two 32-byte vectors per k pair).
+	__m256i b0;
+	__m256i b1;
+
+	// A matrix storage (broadcast values) and scratch for the maddubs products.
+	__m256i a_int32_0;
+	__m256i a_int32_1;
+	__m256i inter_vec[4];
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C: rows 0-3, each split into columns 0-15 (p0) and 16-31 (p1).
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+	__m256i c_int16_0p1 = _mm256_setzero_si256();
+
+	__m256i c_int16_1p0 = _mm256_setzero_si256();
+	__m256i c_int16_1p1 = _mm256_setzero_si256();
+
+	__m256i c_int16_2p0 = _mm256_setzero_si256();
+	__m256i c_int16_2p1 = _mm256_setzero_si256();
+
+	__m256i c_int16_3p0 = _mm256_setzero_si256();
+	__m256i c_int16_3p1 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		// Broadcast a[0,kr:kr+2] (two adjacent int8 values read as one 16-bit element).
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
+		b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
+
+		// Broadcast a[1,kr:kr+2].
+		a_int32_1 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] += a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
+		c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
+
+		// Broadcast a[2,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 2) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
+		inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] += a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec[2], c_int16_1p0);
+		c_int16_1p1 = _mm256_add_epi16(inter_vec[3], c_int16_1p1);
+
+		// Broadcast a[3,kr:kr+2].
+		a_int32_1 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 3) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-31] += a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0);
+		c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1);
+
+		// Separate register for intermediate op
+		inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
+		inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-31] += a[3,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_3p0 = _mm256_add_epi16(inter_vec[2], c_int16_3p0);
+		c_int16_3p1 = _mm256_add_epi16(inter_vec[3], c_int16_3p1);
+	}
+
+	// Handle k remainder (odd k0): the single trailing A element of each row is broadcast to every byte.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+		// NOTE(review): assumes packed B zero-pads the missing odd-k byte of each pair - confirm in the pack routine.
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * k_full_pieces) + (NR * 0)));
+		b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * k_full_pieces) + (NR * 1)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul for the k remainder.
+		// c[0,0-31] += a[0,k_last]*b[k_last,0-31], k_last = 2*k_full_pieces (see padding note above)
+		c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
+		c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
+
+		a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+		a_int32_1 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
+		inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
+
+		// Perform column direction mat-mul for the k remainder.
+		// c[1,0-31] += a[1,k_last]*b[k_last,0-31], k_last = 2*k_full_pieces (see padding note above)
+		c_int16_1p0 = _mm256_add_epi16(inter_vec[2], c_int16_1p0);
+		c_int16_1p1 = _mm256_add_epi16(inter_vec[3], c_int16_1p1);
+
+		a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul for the k remainder.
+		// c[2,0-31] += a[2,k_last]*b[k_last,0-31], k_last = 2*k_full_pieces (see padding note above)
+		c_int16_2p0 = _mm256_add_epi16(inter_vec[0], c_int16_2p0);
+		c_int16_2p1 = _mm256_add_epi16(inter_vec[1], c_int16_2p1);
+
+		a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
+		a_int32_1 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 (add 128 per byte) for the u8s8s16 maddubs ops
+        a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
+		inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
+
+		// Perform column direction mat-mul for the k remainder.
+		// c[3,0-31] += a[3,k_last]*b[k_last,0-31], k_last = 2*k_full_pieces (see padding note above)
+		c_int16_3p0 = _mm256_add_epi16(inter_vec[2], c_int16_3p0);
+		c_int16_3p1 = _mm256_add_epi16(inter_vec[3], c_int16_3p1);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        // Subtract the B matrix column sum values to compensate
+        // for the addition of 128 to the A matrix elements.
+        // NOTE(review): b_col_sum_vec_s16 presumably already holds 128 * (column sum of B) - confirm at the producer.
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+        c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+        c_int16_2p0 = _mm256_sub_epi16( c_int16_2p0 , b0 );
+        c_int16_3p0 = _mm256_sub_epi16( c_int16_3p0 , b0 );
+
+        b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr + 16) );
+
+        c_int16_0p1 = _mm256_sub_epi16( c_int16_0p1 , b0 );
+        c_int16_1p1 = _mm256_sub_epi16( c_int16_1p1 , b0 );
+        c_int16_2p1 = _mm256_sub_epi16( c_int16_2p1 , b0 );
+        c_int16_3p1 = _mm256_sub_epi16( c_int16_3p1 , b0 );
+    }
+
+	// Broadcast alpha and beta to all 16-bit lanes.
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p1 = _mm256_mullo_epi16(selector1, c_int16_1p1);
+
+		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+		c_int16_2p1 = _mm256_mullo_epi16(selector1, c_int16_2p1);
+
+		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+		c_int16_3p1 = _mm256_mullo_epi16(selector1, c_int16_3p1);
+	}
+
+	// Scale C by beta.
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+
+			// c[2,0-15]
+			S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			S8_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2)
+
+			// c[3,0-15]
+			S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			S8_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S16_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			S16_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+
+			// c[2,0-15]
+			S16_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			S16_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2)
+
+			// c[3,0-15]
+			S16_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			S16_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2)
+		}
+	}
+	
+	// Post ops: start at the head of post_ops_list; the POST_OP_LABEL_* macros (defined elsewhere) presumably jump between the labels below.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x32:
+	{
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+		selector2 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 )) );
+		
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[0, 16-31]
+		c_int16_0p1 = _mm256_add_epi16( selector2, c_int16_0p1 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+		// c[1, 16-31]
+		c_int16_1p1 = _mm256_add_epi16( selector2, c_int16_1p1 );
+
+		// c[2,0-15]
+		c_int16_2p0 = _mm256_add_epi16( selector1, c_int16_2p0 );
+
+		// c[2, 16-31]
+		c_int16_2p1 = _mm256_add_epi16( selector2, c_int16_2p1 );
+
+		// c[3,0-15]
+		c_int16_3p0 = _mm256_add_epi16( selector1, c_int16_3p0 );
+
+		// c[3, 16-31]
+		c_int16_3p1 = _mm256_add_epi16( selector2, c_int16_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x32:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[0, 16-31]
+		c_int16_0p1 = _mm256_max_epi16( selector1, c_int16_0p1 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+		// c[1,16-31]
+		c_int16_1p1 = _mm256_max_epi16( selector1, c_int16_1p1 );
+
+		// c[2,0-15]
+		c_int16_2p0 = _mm256_max_epi16( selector1, c_int16_2p0 );
+
+		// c[2,16-31]
+		c_int16_2p1 = _mm256_max_epi16( selector1, c_int16_2p1 );
+
+		// c[3,0-15]
+		c_int16_3p0 = _mm256_max_epi16( selector1, c_int16_3p0 );
+
+		// c[3,16-31]
+		c_int16_3p1 = _mm256_max_epi16( selector1, c_int16_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x32:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[0,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p1)
+
+		// c[1,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+		// c[1,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p1)
+
+		// c[2,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_2p0)
+
+		// c[2,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_2p1)
+
+		// c[3,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_3p0)
+
+		// c[3,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_3p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x32:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0,16-31]
+		GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,16-31]
+		GELU_TANH_S16_AVX2(c_int16_1p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,0-15]
+		GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,16-31]
+		GELU_TANH_S16_AVX2(c_int16_2p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,0-15]
+		GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,16-31]
+		GELU_TANH_S16_AVX2(c_int16_3p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x32:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[0,16-31]
+		GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[1,16-31]
+		GELU_ERF_S16_AVX2(c_int16_1p1, y1, y2, r, x, x_erf)
+
+		// c[2,0-15]
+		GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+		// c[2,16-31]
+		GELU_ERF_S16_AVX2(c_int16_2p1, y1, y2, r, x, x_erf)
+
+		// c[3,0-15]
+		GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+		// c[3,16-31]
+		GELU_ERF_S16_AVX2(c_int16_3p1, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x32:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[0,16-31]
+		CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[1,16-31]
+		CLIP_S16_AVX2(c_int16_1p1, min, max)
+
+		// c[2,0-15]
+		CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+		// c[2,16-31]
+		CLIP_S16_AVX2(c_int16_2p1, min, max)
+
+		// c[3,0-15]
+		CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+		// c[3,16-31]
+		CLIP_S16_AVX2(c_int16_3p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x32:
+	{
+		__m128i temp[2];
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		/* Load the scale factors for columns 0-15 (two vectors of 8 floats). */
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale first 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (2 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (3 * 8));
+
+		// Scale next 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x32_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		// c[0,0-31]
+		CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
+
+		// c[1,0-31]
+		CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0);
+
+		// c[2,0-31]
+		CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0);
+
+		// c[3,0-31]
+		CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0*16 )), c_int16_0p0 );
+
+		// c[0, 16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0*16 )), c_int16_1p0 );
+
+		// c[1,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 1*16 )), c_int16_1p1 );
+
+		// c[2,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 0*16 )), c_int16_2p0 );
+
+		// c[2,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 1*16 )), c_int16_2p1 );
+
+		// c[3,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 0*16 )), c_int16_3p0 );
+
+		// c[3,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 1*16 )), c_int16_3p1 );
+	}
+}
+
+
+// 2x32 int8o16 kernel
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x32)
+{
+	dim_t NR = 32;
+
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_2x32_DISABLE,
+			&&POST_OPS_BIAS_2x32,
+			&&POST_OPS_RELU_2x32,
+			&&POST_OPS_RELU_SCALE_2x32,
+			&&POST_OPS_GELU_TANH_2x32,
+			&&POST_OPS_GELU_ERF_2x32,
+			&&POST_OPS_CLIP_2x32,
+			&&POST_OPS_DOWNSCALE_2x32
+		};
+
+	// The division is done by considering the vpmaddubsw instruction
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// B matrix storage.
+	__m256i b0;
+	__m256i b1;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i a_int32_1;
+	__m256i inter_vec[4];
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	//  Registers to use for accumulating C.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+	__m256i c_int16_0p1 = _mm256_setzero_si256();
+
+	__m256i c_int16_1p0 = _mm256_setzero_si256();
+	__m256i c_int16_1p1 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		// Broadcast a[0,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
+		b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
+
+		// Broadcast a[1,kr:kr+2].
+		a_int32_1 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
+		c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
+
+		// Seperate register for intermediate op
+		inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
+		inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec[2], c_int16_1p0);
+		c_int16_1p1 = _mm256_add_epi16(inter_vec[3], c_int16_1p1);
+	}
+	// Handle k remainder.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * k_full_pieces) + (NR * 0)));
+		b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * k_full_pieces) + (NR * 1)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
+		c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
+
+		a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+		a_int32_1 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_1 = _mm256_add_epi8( a_int32_1, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec[2] = _mm256_maddubs_epi16(a_int32_1, b0);
+		inter_vec[3] = _mm256_maddubs_epi16(a_int32_1, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec[2], c_int16_1p0);
+		c_int16_1p1 = _mm256_add_epi16(inter_vec[3], c_int16_1p1);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        //Subtract B matrix sum column values to compensate 
+        //for addition of 128 to A matrix elements
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+        c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+
+        b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr + 16) );
+
+        c_int16_0p1 = _mm256_sub_epi16( c_int16_0p1 , b0 );
+        c_int16_1p1 = _mm256_sub_epi16( c_int16_1p1 , b0 );
+    }
+
+	// Load alpha and beta
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p1 = _mm256_mullo_epi16(selector1, c_int16_1p1);
+	}
+
+	// Scale C by beta.
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// needs to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S16_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			S16_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+		}
+	}
+
+		// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x32:
+	{
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+		selector2 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 )) );
+		
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[0, 16-31]
+		c_int16_0p1 = _mm256_add_epi16( selector2, c_int16_0p1 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+		// c[1, 16-31]
+		c_int16_1p1 = _mm256_add_epi16( selector2, c_int16_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x32:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[0, 16-31]
+		c_int16_0p1 = _mm256_max_epi16( selector1, c_int16_0p1 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+		// c[1,16-31]
+		c_int16_1p1 = _mm256_max_epi16( selector1, c_int16_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x32:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[0,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p1)
+
+		// c[1,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+		// c[1,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x32:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0,16-31]
+		GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,16-31]
+		GELU_TANH_S16_AVX2(c_int16_1p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x32:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[0,16-31]
+		GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[1,16-31]
+		GELU_ERF_S16_AVX2(c_int16_1p1, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x32:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[0,16-31]
+		CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[1,16-31]
+		CLIP_S16_AVX2(c_int16_1p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x32:
+	{
+		__m128i temp[2];
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		/* Load the scale vector values into the register*/
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale first 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (2 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (3 * 8));
+
+		// Scale next 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x32_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		// c[0,0-31]
+		CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
+
+		// c[1,0-31]
+		CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0*16 )), c_int16_0p0 );
+
+		// c[0, 16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0*16 )), c_int16_1p0 );
+
+		// c[1,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 1*16 )), c_int16_1p1 );
+	}
+}
+
+// 1x32 s8s8s16o16 kernel: accumulates c[0,0-31] in int16 for a single row of A.
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x32)
+{
+	dim_t NR = 32;
+
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_1x32_DISABLE,
+			&&POST_OPS_BIAS_1x32,
+			&&POST_OPS_RELU_1x32,
+			&&POST_OPS_RELU_SCALE_1x32,
+			&&POST_OPS_GELU_TANH_1x32,
+			&&POST_OPS_GELU_ERF_1x32,
+			&&POST_OPS_CLIP_1x32,
+			&&POST_OPS_DOWNSCALE_1x32
+		};
+
+	// k is consumed in pairs because vpmaddubsw multiplies and adds adjacent byte pairs.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// B matrix storage.
+	__m256i b0;
+	__m256i b1;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i inter_vec[2];
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	//  Registers to use for accumulating C.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+	__m256i c_int16_0p1 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		// Broadcast a[0,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        // Add 128 to every A byte so the signed int8 values can be used as the unsigned operand of maddubs (u8 x s8).
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 0)));
+		b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * kr) + (NR * 1)));
+
+		// Separate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
+		c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
+	}
+	// Handle k remainder.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (64 * k_full_pieces) + (NR * 0)));
+		b1 = _mm256_loadu_si256((__m256i const *)(b + (64 * k_full_pieces) + (NR * 1)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec[0] = _mm256_maddubs_epi16(a_int32_0, b0);
+		inter_vec[1] = _mm256_maddubs_epi16(a_int32_0, b1);
+
+		// Accumulate the contribution of the last, unpaired k element.
+		// c[0,0-31] += a[0,k0-1]*b[k0-1,0-31]; NOTE(review): a is broadcast to both bytes of each pair, so this presumably relies on B being zero-padded to even k - confirm in the pack routine.
+		c_int16_0p0 = _mm256_add_epi16(inter_vec[0], c_int16_0p0);
+		c_int16_0p1 = _mm256_add_epi16(inter_vec[1], c_int16_0p1);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        //Subtract B matrix sum column values to compensate
+        //for addition of 128 to A matrix elements
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+
+        b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr + 16) );
+
+        c_int16_0p1 = _mm256_sub_epi16( c_int16_0p1 , b0 );
+    }
+
+	// Load alpha and beta
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+	}
+
+	// Scale C by beta.
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S16_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+		}
+	}
+
+		// Post Ops (the jump macros are defined outside this file; presumably they dispatch through post_ops_labels - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x32:
+	{
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+		selector2 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 )) );
+		
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[0, 16-31]
+		c_int16_0p1 = _mm256_add_epi16( selector2, c_int16_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x32:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[0, 16-31]
+		c_int16_0p1 = _mm256_max_epi16( selector1, c_int16_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x32:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[0,16-31]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x32:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0,16-31]
+		GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x32:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[0,16-31]
+		GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x32:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[0,16-31]
+		CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x32:
+	{
+		__m128i temp[2];
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		/* Load the scale vector values into the register*/
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale first 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (2 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (3 * 8));
+
+		// Scale next 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x32_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		// c[0,0-31]
+		CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0*16 )), c_int16_0p0 );
+
+		// c[0, 16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+	}
+}
+#endif
diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c
new file mode 100644
index 0000000000..79fa0bcd3f
--- /dev/null
+++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_mn_fringe_amd256.c
@@ -0,0 +1,1943 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s16/lpgemm_s16_kern_macros.h"
+
+// 4x16 s8s8s16o16 kernel: accumulates c[0-3,0-15] in int16 for four rows of A.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4x16)
+{
+	dim_t NR = 16;
+
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_4x16_DISABLE,
+			&&POST_OPS_BIAS_4x16,
+			&&POST_OPS_RELU_4x16,
+			&&POST_OPS_RELU_SCALE_4x16,
+			&&POST_OPS_GELU_TANH_4x16,
+			&&POST_OPS_GELU_ERF_4x16,
+			&&POST_OPS_CLIP_4x16,
+			&&POST_OPS_DOWNSCALE_4x16
+		};
+
+	// k is consumed in pairs because vpmaddubsw multiplies and adds adjacent byte pairs.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// B matrix storage.
+	__m256i b0;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	//  Registers to use for accumulating C.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+	__m256i c_int16_1p0 = _mm256_setzero_si256();
+	__m256i c_int16_2p0 = _mm256_setzero_si256();
+	__m256i c_int16_3p0 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+		// Broadcast a[0,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        // Add 128 to every A byte so the signed int8 values can be used as the unsigned operand of maddubs (u8 x s8).
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		// Broadcast a[1,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+		// Broadcast a[2,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 2) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+		// Broadcast a[3,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 3) + (cs_a * offset)));
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+	}
+
+	// Handle k remainder.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Accumulate the last, unpaired k element. NOTE(review): a is broadcast to both bytes of each pair, so this presumably relies on B being zero-padded to even k - confirm in the pack routine.
+		// c[0,0-15] += a[0,k0-1]*b[k0-1,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Accumulate the last, unpaired k element.
+		// c[1,0-15] += a[1,k0-1]*b[k0-1,0-15]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+		a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Accumulate the last, unpaired k element.
+		// c[2,0-15] += a[2,k0-1]*b[k0-1,0-15]
+		c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+		a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Accumulate the last, unpaired k element.
+		// c[3,0-15] += a[3,k0-1]*b[k0-1,0-15]
+		c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        //Subtract B matrix sum column values to compensate
+        //for addition of 128 to A matrix elements
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+        c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+        c_int16_2p0 = _mm256_sub_epi16( c_int16_2p0 , b0 );
+        c_int16_3p0 = _mm256_sub_epi16( c_int16_3p0 , b0 );
+    }
+
+	// Load alpha and beta
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+
+		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+
+		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+	}
+
+	// Scale C by beta.
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[2,0-15]
+			S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
+
+			// c[3,0-15]
+			S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[2,0-15]
+			S16_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
+
+			// c[3,0-15]
+			S16_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+		}
+	}
+
+	// Post Ops (the jump macros are defined outside this file; presumably they dispatch through post_ops_labels - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x16:
+	{
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+		// c[2,0-15]
+		c_int16_2p0 = _mm256_add_epi16( selector1, c_int16_2p0 );
+
+		// c[3,0-15]
+		c_int16_3p0 = _mm256_add_epi16( selector1, c_int16_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x16:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+		// c[2,0-15]
+		c_int16_2p0 = _mm256_max_epi16( selector1, c_int16_2p0 );
+
+		// c[3,0-15]
+		c_int16_3p0 = _mm256_max_epi16( selector1, c_int16_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x16:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[1,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+		// c[2,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_2p0)
+
+		// c[3,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_3p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,0-15]
+		GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,0-15]
+		GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[2,0-15]
+		GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+		// c[3,0-15]
+		GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[2,0-15]
+		CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+		// c[3,0-15]
+		CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x16:
+	{
+		__m128i temp[2];
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		/* Load the scale vector values into the register*/
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale the 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x16_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0);
+
+		// c[2-3,0-15]
+		CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0 * 16 ) ), c_int16_0p0 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0 * 16 ) ), c_int16_1p0 );
+
+		// c[2,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 0 * 16 ) ), c_int16_2p0 );
+
+		// c[3,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 0 * 16 ) ), c_int16_3p0 );
+	}
+}
+
+// 4xlt16 int8o16 kernel (n0_rem < 16 columns)
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_4xlt16)
+{
+	dim_t NR = 16;
+
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_4xlt16_DISABLE,
+			&&POST_OPS_BIAS_4xlt16,
+			&&POST_OPS_RELU_4xlt16,
+			&&POST_OPS_RELU_SCALE_4xlt16,
+			&&POST_OPS_GELU_TANH_4xlt16,
+			&&POST_OPS_GELU_ERF_4xlt16,
+			&&POST_OPS_CLIP_4xlt16,
+			&&POST_OPS_DOWNSCALE_4xlt16
+		};
+
+	// The division is done by considering the vpmaddubsw instruction
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// B matrix storage.
+	__m256i b0;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	int16_t buf0[16];
+	int16_t buf1[16];
+	int16_t buf2[16];
+	int16_t buf3[16];
+
+	//  Registers to use for accumulating C.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+
+	__m256i c_int16_1p0 = _mm256_setzero_si256();
+
+	__m256i c_int16_2p0 = _mm256_setzero_si256();
+
+	__m256i c_int16_3p0 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+		// Broadcast a[0,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		// Broadcast a[1,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+		// Broadcast a[2,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 2) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+		// Broadcast a[3,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 3) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+	}
+
+	// Handle k remainder.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+		a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+		a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        //Subtract B matrix sum column values to compensate 
+        //for addition of 128 to A matrix elements
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+        c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+        c_int16_2p0 = _mm256_sub_epi16( c_int16_2p0 , b0 );
+        c_int16_3p0 = _mm256_sub_epi16( c_int16_3p0 , b0 );
+    }
+
+	// Load alpha and beta
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+
+		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+
+		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+	}
+
+	// Scale C by beta.
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// needs to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+
+			// c[0,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+			// c[2,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+			// c[3,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+		}
+		else
+		{
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( buf0, ( c + ( rs_c * 0 ) ), n0_rem_bytes );
+			memcpy( buf1, ( c + ( rs_c * 1 ) ), n0_rem_bytes );
+			memcpy( buf2, ( c + ( rs_c * 2 ) ), n0_rem_bytes );
+			memcpy( buf3, ( c + ( rs_c * 3 ) ), n0_rem_bytes );
+
+			// c[0,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+			// c[2,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+			// c[3,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4xlt16:
+	{
+		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1
+			+ post_ops_attr.post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
+
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *) buf0 );
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+		// c[2,0-15]
+		c_int16_2p0 = _mm256_add_epi16( selector1, c_int16_2p0 );
+
+		// c[3,0-15]
+		c_int16_3p0 = _mm256_add_epi16( selector1, c_int16_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4xlt16:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+		// c[2,0-15]
+		c_int16_2p0 = _mm256_max_epi16( selector1, c_int16_2p0 );
+
+		// c[3,0-15]
+		c_int16_3p0 = _mm256_max_epi16( selector1, c_int16_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4xlt16:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[1,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+		// c[2,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_2p0)
+
+		// c[3,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_3p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4xlt16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,0-15]
+		GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,0-15]
+		GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4xlt16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[2,0-15]
+		GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+		// c[3,0-15]
+		GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4xlt16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[2,0-15]
+		CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+		// c[3,0-15]
+		CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4xlt16:
+	{
+		__m128i temp[2];
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		float float_buf[16];
+
+		memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+
+		// Load the scale vector values into the register
+		scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
+		scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
+
+		// Scale first 16 columns of the 6 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4xlt16_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int32).
+		__m128i temp[2];
+
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1);
+
+		// c[2-3,0-15]
+		CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3);
+
+		dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf1, c_int16_1p0 );
+
+		// c[2,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf2, c_int16_2p0 );
+
+		// c[3,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf3, c_int16_3p0 );
+
+		dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+		memcpy( c + ( rs_c * 0 ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+
+		// c[1,0-15]
+		memcpy( c + ( rs_c * 1 ) + ( 0 * 16 ), buf1, n0_rem_bytes );
+
+		// c[2,0-15]
+		memcpy( c + ( rs_c * 2 ) + ( 0 * 16 ), buf2, n0_rem_bytes );
+
+		// c[3,0-15]
+		memcpy( c + ( rs_c * 3 ) + ( 0 * 16 ), buf3, n0_rem_bytes );
+	}
+}
+
+// 2x16 int8o16 kernel: fringe micro-kernel for a 2-row x 16-column tile of C (macro type args: int8_t, int8_t, int16_t).
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2x16)
+{
+	dim_t NR = 16;
+
+	static void *post_ops_labels[] = // GNU labels-as-values table; presumably indexed by the POST_OP_LABEL_* macros.
+		{
+			&&POST_OPS_2x16_DISABLE,
+			&&POST_OPS_BIAS_2x16,
+			&&POST_OPS_RELU_2x16,
+			&&POST_OPS_RELU_SCALE_2x16,
+			&&POST_OPS_GELU_TANH_2x16,
+			&&POST_OPS_GELU_ERF_2x16,
+			&&POST_OPS_CLIP_2x16,	
+			&&POST_OPS_DOWNSCALE_2x16
+		};
+
+	// k is walked in pairs because vpmaddubsw (_mm256_maddubs_epi16) consumes two adjacent bytes per int16 lane.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// Holds the 32 bytes of B loaded for the current k pair.
+	__m256i b0;
+
+	// Broadcast A operand (one row at a time) and the per-step maddubs result.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	// Accumulators for the two rows of C, 16 int16 lanes each.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+
+	__m256i c_int16_1p0 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+		// Broadcast the byte pair a[0,offset:offset+2], read as one 16-bit value (assumes both bytes are adjacent in memory - TODO confirm cs_a).
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] += a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		// Broadcast the byte pair a[1,offset:offset+2] in the same way for the second row.
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] += a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+	}
+	// Handle the k remainder (k0 odd): one trailing k element is left.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe); // Same A byte lands in both slots of each pair; presumably the second B slot is zero padding - verify against the B packing.
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul for the single remaining k element.
+		// c[0,0-15] += a[0,k_last]*b[k_last,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul for the single remaining k element.
+		// c[1,0-15] += a[1,k_last]*b[k_last,0-15]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        // Subtract the per-column B sums to compensate for the 128 added
+        // to every A element (vector presumably already scaled by 128 - confirm at its producer).
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+        c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+    }
+
+	// Broadcast alpha and beta to all 16 int16 lanes.
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 16 bits of each product are kept).
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+	}
+
+	// Scale C by beta (skipped entirely when beta is zero).
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+		}
+	}
+
+	// Post Ops: the jump macros are defined elsewhere; presumably they dispatch through post_ops_labels and walk the post-op list.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x16:
+	{
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+
+		// c[0,0-15]: the same 16 bias values are added to each row.
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x16:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]: max(0, c) per lane.
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x16:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15] (macro defined elsewhere; presumably reads selector2 by name - confirm)
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[1,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15] (the float/int temporaries above are scratch handed to the macro)
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15] (the float temporaries above are scratch handed to the macro)
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]: bounds come from op_args2 (min) and op_args3 (max).
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x16:
+	{
+		__m128i temp[2]; // Scratch; presumably referenced by name inside CVT_MULRND_CVT16 - confirm.
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		/* Load 16 per-column float scale factors, starting at column post_op_c_j, as two 8-lane vectors. */
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale the 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x16_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the full 16-lane rows directly into C.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0 * 16 ) ), c_int16_0p0 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0 * 16 ) ), c_int16_1p0 );
+	}
+}
+
+// 2xlt16 int8o16 kernel: fringe micro-kernel for a 2-row tile of C with n0_rem (< 16) valid columns; partial rows are staged through zero-initialised stack buffers.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_2xlt16)
+{
+	dim_t NR = 16;
+
+	static void *post_ops_labels[] = // GNU labels-as-values table; presumably indexed by the POST_OP_LABEL_* macros.
+		{
+			&&POST_OPS_2xlt16_DISABLE,
+			&&POST_OPS_BIAS_2xlt16,
+			&&POST_OPS_RELU_2xlt16,
+			&&POST_OPS_RELU_SCALE_2xlt16,
+			&&POST_OPS_GELU_TANH_2xlt16,
+			&&POST_OPS_GELU_ERF_2xlt16,
+			&&POST_OPS_CLIP_2xlt16,
+			&&POST_OPS_DOWNSCALE_2xlt16
+		};
+
+	// k is walked in pairs because vpmaddubsw (_mm256_maddubs_epi16) consumes two adjacent bytes per int16 lane.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// Holds the 32 bytes of B loaded for the current k pair.
+	__m256i b0;
+
+	// Broadcast A operand (one row at a time) and the per-step maddubs result.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	int16_t buf0[16] = { 0 }; // Row staging buffers: zeroed so lanes past n0_rem never hold stack garbage.
+	int16_t buf1[16] = { 0 };
+
+	// Accumulators for the two rows of C, 16 int16 lanes each (only n0_rem lanes are written back).
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+
+	__m256i c_int16_1p0 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+		// Broadcast the byte pair a[0,offset:offset+2], read as one 16-bit value (assumes both bytes are adjacent in memory - TODO confirm cs_a).
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] += a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		// Broadcast the byte pair a[1,offset:offset+2] in the same way for the second row.
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] += a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+	}
+	// Handle the k remainder (k0 odd): one trailing k element is left.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe); // Same A byte lands in both slots of each pair; presumably the second B slot is zero padding - verify against the B packing.
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul for the single remaining k element.
+		// c[0,0-15] += a[0,k_last]*b[k_last,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+		a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul for the single remaining k element.
+		// c[1,0-15] += a[1,k_last]*b[k_last,0-15]
+		c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        // Subtract the per-column B sums to compensate for the 128 added to every A element.
+        // NOTE(review): a full 16-lane load is done even though only n0_rem columns are valid - confirm the sum vector is padded.
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+        c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+    }
+
+	// Broadcast alpha and beta to all 16 int16 lanes.
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 16 bits of each product are kept).
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+	}
+
+	// Scale C by beta (skipped entirely when beta is zero).
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+
+			// c[0,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+		}
+		else
+		{
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( buf0, ( c + ( rs_c * 0 ) ), n0_rem_bytes );
+			memcpy( buf1, ( c + ( rs_c * 1 ) ), n0_rem_bytes );
+
+			// c[0,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+		}
+	}
+
+	// Post Ops: the jump macros are defined elsewhere; presumably they dispatch through post_ops_labels and walk the post-op list.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2xlt16:
+	{
+		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1 +
+			post_ops_attr.post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
+
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *) buf0);
+
+		// c[0,0-15]: only the first n0_rem bias lanes were copied from op_args1.
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2xlt16:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]: max(0, c) per lane.
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		// c[1,0-15]
+		c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2xlt16:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15] (macro defined elsewhere; presumably reads selector2 by name - confirm)
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		// c[1,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2xlt16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15] (the float/int temporaries above are scratch handed to the macro)
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2xlt16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15] (the float temporaries above are scratch handed to the macro)
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2xlt16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]: bounds come from op_args2 (min) and op_args3 (max).
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2xlt16:
+	{
+		__m128i temp[2]; // Scratch; presumably referenced by name inside CVT_MULRND_CVT16 - confirm.
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		float float_buf[16] = { 0 }; // Zeroed so lanes past n0_rem multiply by 0.0f instead of uninitialised stack contents.
+
+		memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+
+		// Load the staged scale factors as two 8-lane float vectors.
+		scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
+		scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
+
+		// Scale the 16 lanes of the 2 rows (only the first n0_rem lanes are stored later).
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2xlt16_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1);
+
+		dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Spill full vectors to the staging buffers, then copy only n0_rem columns into C.
+		// c[0,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf1, c_int16_1p0 );
+
+		dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+		memcpy( c + ( rs_c * 0 ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+
+		// c[1,0-15]
+		memcpy( c + ( rs_c * 1 ) + ( 0 * 16 ), buf1, n0_rem_bytes );
+	}
+}
+
+// 1x16 int8o16 kernel: fringe micro-kernel for a single row x 16 columns of C (macro type args: int8_t, int8_t, int16_t).
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1x16)
+{
+	dim_t NR = 16;
+
+	static void *post_ops_labels[] = // GNU labels-as-values table; presumably indexed by the POST_OP_LABEL_* macros.
+		{
+			&&POST_OPS_1x16_DISABLE,
+			&&POST_OPS_BIAS_1x16,
+			&&POST_OPS_RELU_1x16,
+			&&POST_OPS_RELU_SCALE_1x16,
+			&&POST_OPS_GELU_TANH_1x16,
+			&&POST_OPS_GELU_ERF_1x16,
+			&&POST_OPS_CLIP_1x16,
+			&&POST_OPS_DOWNSCALE_1x16
+		};
+
+	// k is walked in pairs because vpmaddubsw (_mm256_maddubs_epi16) consumes two adjacent bytes per int16 lane; dim_t avoids narrowing k0.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// Holds the 32 bytes of B loaded for the current k pair.
+	__m256i b0;
+
+	// Broadcast A operand and the per-step maddubs result.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	// Accumulator for the single row of C, 16 int16 lanes.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		dim_t offset = kr * 2;
+
+		// Broadcast the byte pair a[0,offset:offset+2], read as one 16-bit value (assumes both bytes are adjacent in memory - TODO confirm cs_a).
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] += a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+	}
+	// Handle the k remainder (k0 odd): one trailing k element is left.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe); // Same A byte lands in both slots of each pair; presumably the second B slot is zero padding - verify against the B packing.
+
+        // Add 128 to every A byte (wrapping) so it can be used as the unsigned operand of maddubs.
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Separate register for the pairwise u8*s8 products summed (with saturation) into int16.
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul for the single remaining k element.
+		// c[0,0-15] += a[0,k_last]*b[k_last,0-15]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        // Subtract the per-column B sums to compensate for the 128 added
+        // to every A element (vector presumably already scaled by 128 - confirm at its producer).
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+    }
+
+	// Broadcast alpha and beta to all 16 int16 lanes.
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 16 bits of each product are kept).
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	}
+
+	// Scale C by beta (skipped entirely when beta is zero).
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+		}
+	}
+
+	// Post Ops: the jump macros are defined elsewhere; presumably they dispatch through post_ops_labels and walk the post-op list.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x16:
+	{
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+
+		// c[0,0-15]: add the 16 bias values starting at column post_op_c_j.
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x16:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]: max(0, c) per lane.
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x16:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15] (macro defined elsewhere; presumably reads selector2 by name - confirm)
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15] (the float/int temporaries above are scratch handed to the macro)
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15] (the float temporaries above are scratch handed to the macro)
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]: bounds come from op_args2 (min) and op_args3 (max).
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x16:
+	{
+		__m128i temp[2]; // Scratch; presumably referenced by name inside CVT_MULRND_CVT16 - confirm.
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		/* Load 16 per-column float scale factors, starting at column post_op_c_j, as two 8-lane vectors. */
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale the 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x16_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+		__m256i zero_reg = _mm256_setzero_si256();
+
+		// c[0,0-15] (zero_reg is passed as the second operand of the store macro)
+		CVT_STORE_S16_S8_1ROW(c_int16_0p0, zero_reg, 0, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the full 16-lane row directly into C.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0 * 16 ) ), c_int16_0p0 );
+	}
+}
+
+// 1xlt16 int8o16 kernel
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_1xlt16)
+{
+	int NR = 16;
+
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_1xlt16_DISABLE,
+			&&POST_OPS_BIAS_1xlt16,
+			&&POST_OPS_RELU_1xlt16,
+			&&POST_OPS_RELU_SCALE_1xlt16,
+			&&POST_OPS_GELU_TANH_1xlt16,
+			&&POST_OPS_GELU_ERF_1xlt16,
+			&&POST_OPS_CLIP_1xlt16,
+			&&POST_OPS_DOWNSCALE_1xlt16
+		};
+
+	// The division is done by considering the vpmaddubsw instruction
+	int k_full_pieces = k0 / 2;
+	int k_partial_pieces = k0 % 2;
+
+	// B matrix storage.
+	__m256i b0;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	int16_t buf0[16];
+
+	//  Registers to use for accumulating C.
+	__m256i c_int16_0p0 = _mm256_setzero_si256();
+
+	for (int kr = 0; kr < k_full_pieces; kr += 1)
+	{
+		int offset = kr * 2;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+		// Broadcast a[0,kr:kr+2].
+		a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+	}
+	// Handle k remainder.
+	if (k_partial_pieces > 0)
+	{
+		int8_t a_kfringe;
+
+		b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+		a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+		a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+        //convert signed int8 to uint8 for u8s8s16 FMA ops
+        a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+		// Seperate register for intermediate op
+		inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+	}
+    if ( post_ops_attr.is_last_k == 1 )
+    {
+        //Subtract B matrix sum column values to compensate 
+        //for addition of 128 to A matrix elements
+
+        int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+        __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+        c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+    }
+
+	// Load alpha and beta
+	__m256i selector1 = _mm256_set1_epi16(alpha);
+	__m256i selector2 = _mm256_set1_epi16(beta);
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	}
+
+	// Scale C by beta.
+	if (beta != 0)
+	{
+		// For the downscaled api (C-s8), the output C matrix values
+		// needs to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+
+			// c[0,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+		}
+		else
+		{
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( buf0, ( c + ( rs_c * 0 ) ), n0_rem_bytes );
+
+			// c[0,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1xlt16:
+	{
+		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1
+			+ post_ops_attr.post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
+
+		selector1 =
+			_mm256_loadu_si256( (__m256i const *)buf0 );
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1xlt16:
+	{
+		selector1 = _mm256_setzero_si256 ();
+
+		// c[0,0-15]
+		c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1xlt16:
+	{
+		selector2 =
+			_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+		// c[0,0-15]
+		RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1xlt16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1xlt16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1xlt16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1xlt16:
+	{
+		__m128i temp[2];
+		__m256i temp_32[2];
+		__m256 temp_float[2];
+		__m256 scale_1, scale_2;
+		__m256 res_1, res_2;
+
+		float float_buf[16];
+
+		memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+
+		// Load the scale vector values into the register
+		scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
+		scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
+
+		// Scale first 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1xlt16_DISABLE:
+	;
+
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int32).
+		__m128i temp[2];
+		__m256i zero_reg = _mm256_setzero_si256();
+
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_1ROW_NLT16(c_int16_0p0, zero_reg, buf0);
+
+		dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
+
+		dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+		memcpy( c + ( rs_c * 0 ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+	}
+}
+#endif
diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c
new file mode 100644
index 0000000000..69b7a9baa9
--- /dev/null
+++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_n_fringe_amd256.c
@@ -0,0 +1,1273 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s16/lpgemm_s16_kern_macros.h"
+
+// 6x16 int8o16 kernel.
+// Accumulates A(s8) * B(s8) into s16 for an m0 x 16 tile of C, six rows per
+// iteration. A is biased by +128 so the u8 x s8 vpmaddubsw instruction can be
+// used; the bias is removed on the last k block by subtracting the B column
+// sums. Alpha/beta scaling, the post-op chain and the store follow. Rows left
+// over after the 6-row panels are forwarded to the 4x16, 2x16 and 1x16 kernels.
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6x16)
+{
+	dim_t MR = 6;
+	dim_t NR = 16;
+
+	// Jump table (GNU labels-as-values); not referenced directly below, so it is
+	// presumably consumed by the POST_OP_LABEL_* macros - confirm in the header.
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_6x16_DISABLE,
+			&&POST_OPS_BIAS_6x16,
+			&&POST_OPS_RELU_6x16,
+			&&POST_OPS_RELU_SCALE_6x16,
+			&&POST_OPS_GELU_TANH_6x16,
+			&&POST_OPS_GELU_ERF_6x16,
+			&&POST_OPS_CLIP_6x16,
+			&&POST_OPS_DOWNSCALE_6x16
+		};
+
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	// vpmaddubsw consumes two k values per 16-bit lane, so k is walked in pairs.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// B matrix storage.
+	__m256i b0;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+	// 0x80 in every byte: adding it (mod 256) maps s8 [-128,127] onto u8 [0,255].
+	uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	for (dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR)
+	{
+		// Registers to use for accumulating C, one per row of the 6x16 tile.
+		__m256i c_int16_0p0 = _mm256_setzero_si256();
+		__m256i c_int16_1p0 = _mm256_setzero_si256();
+		__m256i c_int16_2p0 = _mm256_setzero_si256();
+		__m256i c_int16_3p0 = _mm256_setzero_si256();
+		__m256i c_int16_4p0 = _mm256_setzero_si256();
+		__m256i c_int16_5p0 = _mm256_setzero_si256();
+
+		for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+		{
+			dim_t offset = kr * 2;
+
+			b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (NR * 0)));
+
+			// Broadcast a[0,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each k pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] += a[0,kr:kr+2]*b[kr:kr+2,0-15]
+			c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+			// Broadcast a[1,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each k pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] += a[1,kr:kr+2]*b[kr:kr+2,0-15]
+			c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+			// Broadcast a[2,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 2) + (cs_a * offset)));
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each k pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] += a[2,kr:kr+2]*b[kr:kr+2,0-15]
+			c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+			// Broadcast a[3,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 3) + (cs_a * offset)));
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each k pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] += a[3,kr:kr+2]*b[kr:kr+2,0-15]
+			c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+
+			// Broadcast a[4,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 4) + (cs_a * offset)));
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each k pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] += a[4,kr:kr+2]*b[kr:kr+2,0-15]
+			c_int16_4p0 = _mm256_add_epi16(inter_vec, c_int16_4p0);
+
+			// Broadcast a[5,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 5) + (cs_a * offset)));
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each k pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] += a[5,kr:kr+2]*b[kr:kr+2,0-15]
+			c_int16_5p0 = _mm256_add_epi16(inter_vec, c_int16_5p0);
+		}
+
+		// Handle k remainder (odd k0): the last A value is replicated into every
+		// byte. NOTE(review): assumes packed B holds 0 in the unused k slot - confirm.
+		if (k_partial_pieces > 0)
+		{
+			int8_t a_kfringe;
+
+			b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (NR * 0)));
+
+			a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each byte pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Accumulate the contribution of the final k index.
+			// c[0,0-15] += a[0,k0-1]*b[k0-1,0-15]
+			c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+			a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each byte pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Accumulate the contribution of the final k index.
+			// c[1,0-15] += a[1,k0-1]*b[k0-1,0-15]
+			c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+			a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each byte pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Accumulate the contribution of the final k index.
+			// c[2,0-15] += a[2,k0-1]*b[k0-1,0-15]
+			c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+			a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each byte pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Accumulate the contribution of the final k index.
+			// c[3,0-15] += a[3,k0-1]*b[k0-1,0-15]
+			c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+
+			a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each byte pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Accumulate the contribution of the final k index.
+			// c[4,0-15] += a[4,k0-1]*b[k0-1,0-15]
+			c_int16_4p0 = _mm256_add_epi16(inter_vec, c_int16_4p0);
+
+			a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+			// Convert signed int8 to uint8 for u8s8s16 FMA ops.
+			a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// u8*s8 products of each byte pair, summed with signed saturation.
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Accumulate the contribution of the final k index.
+			// c[5,0-15] += a[5,k0-1]*b[k0-1,0-15]
+			c_int16_5p0 = _mm256_add_epi16(inter_vec, c_int16_5p0);
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			// Subtract B matrix sum column values to compensate
+			// for addition of 128 to A matrix elements.
+			// NOTE(review): the sums are presumably pre-scaled by 128 - confirm.
+
+			int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+			__m256i b_col_sum = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+			c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0, b_col_sum );
+			c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0, b_col_sum );
+			c_int16_2p0 = _mm256_sub_epi16( c_int16_2p0, b_col_sum );
+			c_int16_3p0 = _mm256_sub_epi16( c_int16_3p0, b_col_sum );
+			c_int16_4p0 = _mm256_sub_epi16( c_int16_4p0, b_col_sum );
+			c_int16_5p0 = _mm256_sub_epi16( c_int16_5p0, b_col_sum );
+		}
+
+		// Load alpha and beta
+		__m256i selector1 = _mm256_set1_epi16(alpha);
+		__m256i selector2 = _mm256_set1_epi16(beta);
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha (low 16 bits of each product are kept).
+			c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+			c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+			c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+			c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+			c_int16_4p0 = _mm256_mullo_epi16(selector1, c_int16_4p0);
+			c_int16_5p0 = _mm256_mullo_epi16(selector1, c_int16_5p0);
+		}
+
+		// Scale C by beta.
+		if (beta != 0)
+		{
+			// For the downscaled api (C-s8), the output C matrix values
+			// need to be upscaled to s16 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				S8_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2)
+
+				// c[1,0-15]
+				S8_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2)
+
+				// c[2,0-15]
+				S8_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2)
+
+				// c[3,0-15]
+				S8_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2)
+
+				// c[4,0-15]
+				S8_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2)
+
+				// c[5,0-15]
+				S8_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				S16_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2)
+
+				// c[1,0-15]
+				S16_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2)
+
+				// c[2,0-15]
+				S16_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2)
+
+				// c[3,0-15]
+				S16_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2)
+
+				// c[4,0-15]
+				S16_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2)
+
+				// c[5,0-15]
+				S16_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2)
+			}
+		}
+
+		// Post Ops: walk the post-op list; each section ends by jumping onwards.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x16:
+		{
+			selector1 =
+				_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+
+			// c[0,0-15]
+			c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+			// c[1,0-15]
+			c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+			// c[2,0-15]
+			c_int16_2p0 = _mm256_add_epi16( selector1, c_int16_2p0 );
+
+			// c[3,0-15]
+			c_int16_3p0 = _mm256_add_epi16( selector1, c_int16_3p0 );
+
+			// c[4,0-15]
+			c_int16_4p0 = _mm256_add_epi16( selector1, c_int16_4p0 );
+
+			// c[5,0-15]
+			c_int16_5p0 = _mm256_add_epi16( selector1, c_int16_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x16:
+		{
+			selector1 = _mm256_setzero_si256 ();
+
+			// c[0,0-15]
+			c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+			// c[1,0-15]
+			c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+			// c[2,0-15]
+			c_int16_2p0 = _mm256_max_epi16( selector1, c_int16_2p0 );
+
+			// c[3,0-15]
+			c_int16_3p0 = _mm256_max_epi16( selector1, c_int16_3p0 );
+
+			// c[4,0-15]
+			c_int16_4p0 = _mm256_max_epi16( selector1, c_int16_4p0 );
+
+			// c[5,0-15]
+			c_int16_5p0 = _mm256_max_epi16( selector1, c_int16_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x16:
+		{
+			selector2 =
+				_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+			// c[0,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+			// c[1,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+			// c[2,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_2p0)
+
+			// c[3,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_3p0)
+
+			// c[4,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_4p0)
+
+			// c[5,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x16:
+		{
+			__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+			__m256i q;
+
+			// c[0,0-15]
+			GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,0-15]
+			GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,0-15]
+			GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,0-15]
+			GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,0-15]
+			GELU_TANH_S16_AVX2(c_int16_4p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,0-15]
+			GELU_TANH_S16_AVX2(c_int16_5p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x16:
+		{
+			__m256 x, r, y1, y2, x_erf;
+
+			// c[0,0-15]
+			GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+			// c[1,0-15]
+			GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+			// c[2,0-15]
+			GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+			// c[3,0-15]
+			GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+			// c[4,0-15]
+			GELU_ERF_S16_AVX2(c_int16_4p0, y1, y2, r, x, x_erf)
+
+			// c[5,0-15]
+			GELU_ERF_S16_AVX2(c_int16_5p0, y1, y2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x16:
+		{
+			__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+			__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+			// c[0,0-15]
+			CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+			// c[1,0-15]
+			CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+			// c[2,0-15]
+			CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+			// c[3,0-15]
+			CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+			// c[4,0-15]
+			CLIP_S16_AVX2(c_int16_4p0, min, max)
+
+			// c[5,0-15]
+			CLIP_S16_AVX2(c_int16_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x16:
+		{
+			__m128i temp[2];
+			__m256i temp_32[2];
+			__m256 temp_float[2];
+			__m256 scale_1, scale_2;
+			__m256 res_1, res_2;
+
+			// Load the 16 per-column scale factors as two 8-float vectors.
+			scale_1 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (0 * 8));
+			scale_2 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (1 * 8));
+
+			// Scale the 16 columns of each of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_6x16_DISABLE:
+		;
+
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Store the results in downscaled type (int8 instead of int16).
+			__m128i temp[2];
+
+			// c[0-1,0-15]
+			CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0);
+
+			// c[2-3,0-15]
+			CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0);
+
+			// c[4-5,0-15]
+			CVT_STORE_S16_S8_2ROW(c_int16_4p0, c_int16_5p0, 4, 5, 0);
+		}
+		// Case where the output C matrix is s16 or is the temp buffer used to
+		// store intermediate s16 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 0 * 16 ) ), c_int16_0p0 );
+
+			// c[1,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 0 * 16 ) ), c_int16_1p0 );
+
+			// c[2,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 0 * 16 ) ), c_int16_2p0 );
+
+			// c[3,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 0 * 16 ) ), c_int16_3p0 );
+
+			// c[4,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 0 * 16 ) ), c_int16_4p0 );
+
+			// c[5,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 0 * 16 ) ), c_int16_5p0 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if (m_partial_pieces > 0)
+	{
+		dim_t m_partial4 = m_partial_pieces / 4;
+		m_partial_pieces = m_partial_pieces % 4;
+
+		dim_t m_partial2 = m_partial_pieces / 2;
+		dim_t m_partial = m_partial_pieces % 2;
+
+		if (m_partial4 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_4x16(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+
+			// Advance A, the C row cursor and the post-op row index by 4 rows.
+			a = a + (4 * ps_a);
+			m_full_pieces_loop_limit += 4;
+			post_ops_attr.post_op_c_i += 4;
+		}
+
+		if (m_partial2 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_2x16(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+
+			// Advance A, the C row cursor and the post-op row index by 2 rows.
+			a = a + (2 * ps_a);
+			m_full_pieces_loop_limit += 2;
+			post_ops_attr.post_op_c_i += 2;
+		}
+
+		if (m_partial == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_1x16(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+			post_ops_attr.post_op_c_i += 1;
+		}
+	}
+}
+
+// 6xlt16 int8o16 kernel
+LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int16_t,s8s8s16o16_6xlt16)
+{
+	dim_t MR = 6;
+
+	static void *post_ops_labels[] =
+		{
+			&&POST_OPS_6xlt16_DISABLE,
+			&&POST_OPS_BIAS_6xlt16,
+			&&POST_OPS_RELU_6xlt16,
+			&&POST_OPS_RELU_SCALE_6xlt16,
+			&&POST_OPS_GELU_TANH_6xlt16,
+			&&POST_OPS_GELU_ERF_6xlt16,
+			&&POST_OPS_CLIP_6xlt16,
+			&&POST_OPS_DOWNSCALE_6xlt16
+		};
+
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	// The division is done by considering the vpmaddubsw instruction
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t buf0[16];
+	int16_t buf1[16];
+	int16_t buf2[16];
+	int16_t buf3[16];
+	int16_t buf4[16];
+	int16_t buf5[16];
+
+	// B matrix storage.
+	__m256i b0;
+
+	// A matrix storage.
+	__m256i a_int32_0;
+	__m256i inter_vec;
+
+    uint8_t cvt_uint8 = 128;
+	__m256i vec_uint8 = _mm256_set1_epi8 (cvt_uint8);
+
+	for (dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR)
+	{
+		//  Registers to use for accumulating C.
+		__m256i c_int16_0p0 = _mm256_setzero_si256();
+
+		__m256i c_int16_1p0 = _mm256_setzero_si256();
+
+		__m256i c_int16_2p0 = _mm256_setzero_si256();
+
+		__m256i c_int16_3p0 = _mm256_setzero_si256();
+
+		__m256i c_int16_4p0 = _mm256_setzero_si256();
+
+		__m256i c_int16_5p0 = _mm256_setzero_si256();
+
+		for (dim_t kr = 0; kr < k_full_pieces; kr += 1)
+		{
+			dim_t offset = kr * 2;
+
+			b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * kr) + (cs_b * 0)));
+
+			// Broadcast a[0,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 0) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+			// Broadcast a[1,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 1) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+			// Broadcast a[2,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 2) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+			// Broadcast a[3,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 3) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+
+			// Broadcast a[4,kr:kr+2].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 4) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+4,0-31]
+			c_int16_4p0 = _mm256_add_epi16(inter_vec, c_int16_4p0);
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_0 = _mm256_set1_epi16(*(int16_t *)(a + (rs_a * 5) + (cs_a * offset)));
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_5p0 = _mm256_add_epi16(inter_vec, c_int16_5p0);
+		}
+
+		// Handle k remainder.
+		if (k_partial_pieces > 0)
+		{
+			int8_t a_kfringe;
+
+			b0 = _mm256_loadu_si256((__m256i const *)(b + (32 * k_full_pieces) + (cs_b * 0)));
+
+			a_kfringe = *(a + (rs_a * 0) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_0p0 = _mm256_add_epi16(inter_vec, c_int16_0p0);
+
+			a_kfringe = *(a + (rs_a * 1) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_1p0 = _mm256_add_epi16(inter_vec, c_int16_1p0);
+
+			a_kfringe = *(a + (rs_a * 2) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_2p0 = _mm256_add_epi16(inter_vec, c_int16_2p0);
+
+			a_kfringe = *(a + (rs_a * 3) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_3p0 = _mm256_add_epi16(inter_vec, c_int16_3p0);
+
+			a_kfringe = *(a + (rs_a * 4) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_4p0 = _mm256_add_epi16(inter_vec, c_int16_4p0);
+
+			a_kfringe = *(a + (rs_a * 5) + (cs_a * (k_full_pieces * 2)));
+			a_int32_0 = _mm256_set1_epi8(a_kfringe);
+
+            //convert signed int8 to uint8 for u8s8s16 FMA ops
+            a_int32_0 = _mm256_add_epi8( a_int32_0, vec_uint8 );
+
+			// Seperate register for intermediate op
+			inter_vec = _mm256_maddubs_epi16(a_int32_0, b0);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
+			c_int16_5p0 = _mm256_add_epi16(inter_vec, c_int16_5p0);
+		}
+
+        if ( post_ops_attr.is_last_k == 1 )
+		{
+            //Subtract B matrix sum column values to compensate 
+			//for addition of 128 to A matrix elements
+
+            int16_t* bsumptr = post_ops_attr.b_col_sum_vec_s16 + post_ops_attr.b_sum_offset;
+
+            __m256i b0 = _mm256_loadu_si256( (__m256i const *)(bsumptr) );
+
+            c_int16_0p0 = _mm256_sub_epi16( c_int16_0p0 , b0 );
+			c_int16_1p0 = _mm256_sub_epi16( c_int16_1p0 , b0 );
+			c_int16_2p0 = _mm256_sub_epi16( c_int16_2p0 , b0 );
+			c_int16_3p0 = _mm256_sub_epi16( c_int16_3p0 , b0 );
+			c_int16_4p0 = _mm256_sub_epi16( c_int16_4p0 , b0 );
+			c_int16_5p0 = _mm256_sub_epi16( c_int16_5p0 , b0 );
+        }
+
+		// Load alpha and beta
+		__m256i selector1 = _mm256_set1_epi16(alpha);
+		__m256i selector2 = _mm256_set1_epi16(beta);
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+
+			c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+
+			c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+
+			c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+
+			c_int16_4p0 = _mm256_mullo_epi16(selector1, c_int16_4p0);
+
+			c_int16_5p0 = _mm256_mullo_epi16(selector1, c_int16_5p0);
+		}
+
+		// Scale C by beta.
+		if (beta != 0)
+		{
+			// For the downscaled api (C-s8), the output C matrix values
+			// needs to be upscaled to s16 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes);
+
+				// c[0,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+				// c[1,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+				// c[2,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+				// c[3,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+
+				// c[4,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2)
+
+				// c[5,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2)
+			}
+			else
+			{
+				dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+				memcpy( buf0, ( c + ( rs_c * ( ir + 0 ) ) ), n0_rem_bytes );
+				memcpy( buf1, ( c + ( rs_c * ( ir + 1 ) ) ), n0_rem_bytes );
+				memcpy( buf2, ( c + ( rs_c * ( ir + 2 ) ) ), n0_rem_bytes );
+				memcpy( buf3, ( c + ( rs_c * ( ir + 3 ) ) ), n0_rem_bytes );
+				memcpy( buf4, ( c + ( rs_c * ( ir + 4 ) ) ), n0_rem_bytes );
+				memcpy( buf5, ( c + ( rs_c * ( ir + 5 ) ) ), n0_rem_bytes );
+
+				// c[0,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+				// c[1,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+				// c[2,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+				// c[3,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+
+				// c[4,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2)
+
+				// c[5,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2)
+			}
+		}
+
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6xlt16:
+		{
+			memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) ),
+							( n0_rem * sizeof( int16_t ) ) );
+
+			selector1 =
+				_mm256_loadu_si256( ( __m256i const* )buf0 );
+			
+			// c[0,0-15]
+			c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
+
+			// c[1,0-15]
+			c_int16_1p0 = _mm256_add_epi16( selector1, c_int16_1p0 );
+
+			// c[2,0-15]
+			c_int16_2p0 = _mm256_add_epi16( selector1, c_int16_2p0 );
+
+			// c[3,0-15]
+			c_int16_3p0 = _mm256_add_epi16( selector1, c_int16_3p0 );
+
+			// c[4,0-15]
+			c_int16_4p0 = _mm256_add_epi16( selector1, c_int16_4p0 );
+
+			// c[5,0-15]
+			c_int16_5p0 = _mm256_add_epi16( selector1, c_int16_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6xlt16:
+		{
+			selector1 = _mm256_setzero_si256 ();
+
+			// c[0,0-15]
+			c_int16_0p0 = _mm256_max_epi16( selector1, c_int16_0p0 );
+
+			// c[1,0-15]
+			c_int16_1p0 = _mm256_max_epi16( selector1, c_int16_1p0 );
+
+			// c[2,0-15]
+			c_int16_2p0 = _mm256_max_epi16( selector1, c_int16_2p0 );
+
+			// c[3,0-15]
+			c_int16_3p0 = _mm256_max_epi16( selector1, c_int16_3p0 );
+
+			// c[4,0-15]
+			c_int16_4p0 = _mm256_max_epi16( selector1, c_int16_4p0 );
+
+			// c[5,0-15]
+			c_int16_5p0 = _mm256_max_epi16( selector1, c_int16_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6xlt16:
+		{
+			selector2 =
+				_mm256_set1_epi16( *( ( int16_t* )post_ops_list_temp->op_args2 ) );
+
+			// c[0,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_0p0)
+
+			// c[1,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_1p0)
+
+			// c[2,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_2p0)
+
+			// c[3,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_3p0)
+
+			// c[4,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_4p0)
+
+			// c[5,0-15]
+			RELU_SCALE_OP_S16_AVX2(c_int16_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6xlt16:
+		{
+			__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+			__m256i q;
+
+			// c[0,0-15]
+			GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,0-15]
+			GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,0-15]
+			GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,0-15]
+			GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,0-15]
+			GELU_TANH_S16_AVX2(c_int16_4p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,0-15]
+			GELU_TANH_S16_AVX2(c_int16_5p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6xlt16:
+		{
+			__m256 x, r, y1, y2, x_erf;
+
+			// c[0,0-15]
+			GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+			// c[1,0-15]
+			GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+			// c[2,0-15]
+			GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+			// c[3,0-15]
+			GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+			// c[4,0-15]
+			GELU_ERF_S16_AVX2(c_int16_4p0, y1, y2, r, x, x_erf)
+
+			// c[5,0-15]
+			GELU_ERF_S16_AVX2(c_int16_5p0, y1, y2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6xlt16:
+		{
+			__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+			__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+			// c[0,0-15]
+			CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+			// c[1,0-15]
+			CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+			// c[2,0-15]
+			CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+			// c[3,0-15]
+			CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+			// c[4,0-15]
+			CLIP_S16_AVX2(c_int16_4p0, min, max)
+
+			// c[5,0-15]
+			CLIP_S16_AVX2(c_int16_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6xlt16:
+		{
+			__m128i temp[2];
+			__m256i temp_32[2];
+			__m256 temp_float[2];
+			__m256 scale_1, scale_2;
+			__m256 res_1, res_2;
+
+			float float_buf[16];
+
+			memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
+					post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+
+			// Load the scale vector values into the register
+			scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
+			scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
+
+			// Scale first 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_6xlt16_DISABLE:
+		;
+
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Store the results in downscaled type (int8 instead of int32).
+			__m128i temp[2];
+
+			// c[0-1,0-15]
+			CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1);
+
+			// c[2-3,0-15]
+			CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3);
+
+			// c[4-5,0-15]
+			CVT_STORE_S16_S8_2ROW_NLT16(c_int16_4p0, c_int16_5p0, buf4, buf5);
+
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes);
+		}
+		// Case where the output C matrix is s16 or is the temp buffer used to
+		// store intermediate s16 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
+
+			// c[1,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf1, c_int16_1p0 );
+
+			// c[2,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf2, c_int16_2p0 );
+
+			// c[3,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf3, c_int16_3p0 );
+
+			// c[4,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf4, c_int16_4p0 );
+
+			// c[5,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf5, c_int16_5p0 );
+
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( c + ( rs_c * ( ir + 0 ) ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+
+			// c[1,0-15]
+			memcpy( c + ( rs_c * ( ir + 1 ) ) + ( 0 * 16 ), buf1, n0_rem_bytes );
+
+			// c[2,0-15]
+			memcpy( c + ( rs_c * ( ir + 2 ) ) + ( 0 * 16 ), buf2, n0_rem_bytes );
+
+			// c[3,0-15]
+			memcpy( c + ( rs_c * ( ir + 3 ) ) + ( 0 * 16 ), buf3, n0_rem_bytes );
+
+			// c[4,0-15]
+			memcpy( c + ( rs_c * ( ir + 4 ) ) + ( 0 * 16 ), buf4, n0_rem_bytes );
+
+			// c[5,0-15]
+			memcpy( c + ( rs_c * ( ir + 5 ) ) + ( 0 * 16 ), buf5, n0_rem_bytes );
+		}
+
+		a = a + (MR * ps_a);
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if (m_partial_pieces > 0)
+	{
+		dim_t m_partial4 = m_partial_pieces / 4;
+		m_partial_pieces = m_partial_pieces % 4;
+
+		dim_t m_partial2 = m_partial_pieces / 2;
+		dim_t m_partial = m_partial_pieces % 2;
+
+		if (m_partial4 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_4xlt16(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta, n0_rem,
+				post_ops_list, post_ops_attr);
+
+			// a pointer increment
+			a = a + (4 * ps_a);
+			m_full_pieces_loop_limit += 4;
+			post_ops_attr.post_op_c_i += 4;
+		}
+
+		if (m_partial2 == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_2xlt16(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta, n0_rem,
+				post_ops_list, post_ops_attr);
+
+			// a pointer increment
+			a = a + (2 * ps_a);
+			m_full_pieces_loop_limit += 2;
+			post_ops_attr.post_op_c_i += 2;
+		}
+
+		if (m_partial == 1)
+		{
+			lpgemm_rowvar_s8s8s16o16_1xlt16(
+				k0,
+				a, rs_a, cs_a,
+				b, rs_b, cs_b,
+				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
+				alpha, beta, n0_rem,
+				post_ops_list, post_ops_attr);
+			post_ops_attr.post_op_c_i += 1;
+		}
+	}
+}
+#endif
diff --git a/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_packb_amd256.c b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_packb_amd256.c
new file mode 100644
index 0000000000..5fa9879a51
--- /dev/null
+++ b/kernels/zen/lpgemm/s8s8s16/lpgemm_s8_packb_amd256.c
@@ -0,0 +1,412 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+/*
+ * Packs a B panel that has fewer than 16 valid columns into the row-pair
+ * interleaved layout consumed by vpmaddubsw, and accumulates
+ * 128 * (column sum of B) into pack_b_column_sum.
+ *
+ *   pack_b_buffer_s8s8s16o16  destination; 32 bytes are stored per row pair.
+ *   pack_b_column_sum         in/out; 16 s16 values are loaded and stored.
+ *   b, ldb                    source panel and its row stride in elements.
+ *   rows                      number of k rows; an odd last row is zero padded.
+ *   n0_partial_rem            number of valid columns copied from each row.
+ *
+ * NOTE(review): the 16-wide accesses to pack_b_column_sum and to the pack
+ * buffer assume both are padded to 16 columns - confirm in the caller.
+ */
+void packb_nrlt16_s8s8s16o16
+	(
+		int8_t *pack_b_buffer_s8s8s16o16,
+		int16_t *pack_b_column_sum,
+		const int8_t *b,
+		const dim_t ldb,
+		const dim_t rows,
+		dim_t n0_partial_rem
+	)
+{
+	dim_t k_full_pieces_blks = rows / 2;
+	dim_t k_full_pieces = k_full_pieces_blks * 2;
+	dim_t k_partial_pieces = rows % 2;
+	dim_t NR = 16;
+	dim_t kr_new = 0;
+
+	// Staging rows. Zero-initialised so that the columns past n0_partial_rem
+	// are well-defined zero padding rather than indeterminate stack bytes.
+	int8_t buf0[16] = { 0 }, buf1[16] = { 0 };
+
+	__m128i b_vec[2], inter_vec[2];
+
+	__m256i sum1;
+	__m256i temp1;
+
+	//load the temp buffer to compute column sum of B matrix
+	sum1 = _mm256_loadu_si256( (__m256i const *)(pack_b_column_sum) );
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 2)
+	{
+		memcpy(buf0, (b + (ldb * (kr + 0))), (n0_partial_rem * sizeof(int8_t)));
+		memcpy(buf1, (b + (ldb * (kr + 1))), (n0_partial_rem * sizeof(int8_t)));
+
+		// Read b[0,0], b[0,1], b[0,2]......., b[0,15]
+		b_vec[0] = _mm_loadu_si128((__m128i *)buf0);
+		// Read b[1,0], b[1,1], b[1,2]......., b[1,15]
+		b_vec[1] = _mm_loadu_si128((__m128i *)buf1);
+
+		// Row-pair sum widened to s16; every element lies in [-256, 254]
+		temp1 =
+			_mm256_add_epi16( _mm256_cvtepi8_epi16( b_vec[0] ), _mm256_cvtepi8_epi16( b_vec[1] ));
+
+		// Scale by 128, the offset added to A by the compute kernel. The
+		// product lies in [-32768, 32512], so the 16-bit shift is exact.
+		sum1 = _mm256_add_epi16( sum1, _mm256_slli_epi16( temp1, 7 ) );
+
+		// Reorder B matrix inputs to suit vpmaddubsw instructions
+		inter_vec[0] = _mm_unpacklo_epi8(b_vec[0], b_vec[1]);
+		inter_vec[1] = _mm_unpackhi_epi8(b_vec[0], b_vec[1]);
+
+		// Store b[0,0], b[1,0], b[0,1]......., b[0,7], b[1,7]
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + (kr_new * NR)), inter_vec[0]);
+		// Store b[0,8], b[1,8], b[0,9]......., b[0,15], b[1,15]
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 1) * NR)), inter_vec[1]);
+
+		// Increment to ignore the padded bits
+		kr_new += 2;
+	}
+
+	// Handle k partial cases
+	if (k_partial_pieces > 0)
+	{
+		memcpy(buf0, (b + (ldb * (k_full_pieces + 0))), (n0_partial_rem * sizeof(int8_t)));
+
+		// Read b[0,0], b[0,1], b[0,2]......., b[0,15]
+		b_vec[0] = _mm_loadu_si128((__m128i *)buf0);
+		b_vec[1] = _mm_setzero_si128(); // Initialize with zero for padding
+
+		// Single tail row widened to s16
+		temp1 = _mm256_cvtepi8_epi16( b_vec[0] );
+
+		// Scale by 128 (exact, see above) and accumulate
+		sum1 = _mm256_add_epi16( sum1, _mm256_slli_epi16( temp1, 7 ) );
+
+		// Reorder B matrix inputs to suit vpmaddubsw instructions
+		inter_vec[0] = _mm_unpacklo_epi8(b_vec[0], b_vec[1]);
+		inter_vec[1] = _mm_unpackhi_epi8(b_vec[0], b_vec[1]);
+
+		// Store b[0,0], 0, b[0,1]......., b[0,7], 0
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 0) * NR)), inter_vec[0]);
+
+		// Store b[0,8], 0, b[0,9]......., b[0,15], 0
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 1) * NR)), inter_vec[1]);
+	}
+	//store the sum column
+	_mm256_storeu_si256( (__m256i *)(pack_b_column_sum), sum1 );
+}
+
+/*
+ * Packs a 16-column wide B panel into the row-pair interleaved layout
+ * consumed by vpmaddubsw, and accumulates 128 * (column sum of B) into
+ * pack_b_column_sum.
+ *
+ *   pack_b_buffer_s8s8s16o16  destination; 32 bytes are stored per row pair.
+ *   pack_b_column_sum         in/out; 16 s16 values are loaded and stored.
+ *   b, ldb                    source panel and its row stride in elements.
+ *   rows                      number of k rows; an odd last row is zero padded.
+ *
+ * Every source row is read with a full 16-byte unaligned load. The factor
+ * 128 is applied with a 16-bit left shift by 7, which is exact: a sum of
+ * two s8 values lies in [-256, 254], so the product lies in
+ * [-32768, 32512] and fits in s16.
+ */
+void packb_nr16_s8s8s16o16(
+	int8_t *pack_b_buffer_s8s8s16o16,
+	int16_t *pack_b_column_sum,
+	const int8_t *b,
+	const dim_t ldb,
+	const dim_t rows)
+{
+	dim_t k_full_pieces_blks = rows / 2;
+	dim_t k_full_pieces = k_full_pieces_blks * 2;
+	dim_t k_partial_pieces = rows % 2;
+	dim_t NR = 16;
+	dim_t kr_new = 0;
+
+	__m128i b_vec[2], inter_vec[2];
+
+	__m256i sum1;
+	__m256i temp1;
+
+	//load the temp buffer to compute column sum of B matrix
+	sum1 = _mm256_loadu_si256( (__m256i const *)(pack_b_column_sum) );
+
+	for (dim_t kr = 0; kr < k_full_pieces; kr += 2)
+	{
+		// Read b[0,0], b[0,1], b[0,2]......., b[0,15]
+		b_vec[0] = _mm_loadu_si128((__m128i const *)(b + (ldb * (kr + 0))));
+
+		// Read b[1,0], b[1,1], b[1,2]......., b[1,15]
+		b_vec[1] = _mm_loadu_si128((__m128i const *)(b + (ldb * (kr + 1))));
+
+		// Row-pair sum widened to s16; every element lies in [-256, 254]
+		temp1 =
+			_mm256_add_epi16( _mm256_cvtepi8_epi16( b_vec[0] ), _mm256_cvtepi8_epi16( b_vec[1] ));
+
+		// Scale by 128 with an exact 16-bit shift and accumulate into the
+		// running column sum.
+		sum1 = _mm256_add_epi16( sum1, _mm256_slli_epi16( temp1, 7 ) );
+
+		// Reorder B matrix inputs to suit vpmaddubsw instructions
+		inter_vec[0] = _mm_unpacklo_epi8(b_vec[0], b_vec[1]);
+		inter_vec[1] = _mm_unpackhi_epi8(b_vec[0], b_vec[1]);
+
+		// Store b[0,0], b[1,0], b[0,1]......., b[0,7], b[1,7]
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 0) * NR)), inter_vec[0]);
+
+		// Store b[0,8], b[1,8], b[0,9]......., b[0,15], b[1,15]
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 1) * NR)), inter_vec[1]);
+
+		// Increment to ignore the padded bits
+		kr_new += 2;
+	}
+
+	// Handle k partial cases
+	if (k_partial_pieces > 0)
+	{
+		// Read b[0,0], b[0,1], b[0,2]......., b[0,15]
+		b_vec[0] = _mm_loadu_si128((__m128i const *)(b + (ldb * (k_full_pieces + 0))));
+		b_vec[1] = _mm_setzero_si128(); // Initialize with zero for padding
+
+		// Single tail row widened to s16
+		temp1 = _mm256_cvtepi8_epi16( b_vec[0] );
+
+		// Scale by 128 (exact 16-bit shift) and accumulate
+		sum1 = _mm256_add_epi16( sum1, _mm256_slli_epi16( temp1, 7 ) );
+
+		// Reorder B matrix inputs to suit vpmaddubsw instructions
+		inter_vec[0] = _mm_unpacklo_epi8(b_vec[0], b_vec[1]);
+		inter_vec[1] = _mm_unpackhi_epi8(b_vec[0], b_vec[1]);
+
+		// Store b[0,0], 0, b[0,1]......., b[0,7], 0
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 0) * NR)), inter_vec[0]);
+		// Store b[0,8], 0, b[0,9]......., b[0,15], 0
+		_mm_storeu_si128((__m128i *)(pack_b_buffer_s8s8s16o16 + ((kr_new + 1) * NR)), inter_vec[1]);
+	}
+	//store the sum column
+	_mm256_storeu_si256( (__m256i *)(pack_b_column_sum), sum1 );
+}
+
+/*
+ * Packs a rows x cols s8 B block into NR(32)-column panels for the
+ * s8s8s16o16 kernels and accumulates 128 * (column sums of B) into
+ * pack_b_column_sum.
+ *
+ * Inside a panel, rows are taken two at a time and interleaved per column
+ * (b[k,j], b[k+1,j], b[k,j+1], ...), which is the operand order vpmaddubsw
+ * expects. An odd last row is paired with a row of zeros, so consecutive
+ * panels start NR * KC_updated bytes apart, where KC_updated is rows
+ * rounded up to a multiple of 2.
+ *
+ *   pack_b_buffer_s8s8s16o16  destination for the packed panels.
+ *   pack_b_column_sum         in/out s16 accumulators, one per column; the
+ *                             existing values are loaded, updated and
+ *                             stored back.
+ *   b, ldb                    source block and its row stride in elements.
+ *   cols, rows                number of columns (n) and rows (k) to pack.
+ *   rs_b, cs_b                out; set to NR * 2 and NR respectively.
+ *
+ * Columns left over after the full 32-wide panels are delegated to
+ * packb_nr16_s8s8s16o16 (one 16-wide panel, if present) followed by
+ * packb_nrlt16_s8s8s16o16 (the remaining < 16 columns).
+ *
+ * The factor 128 is applied with a 16-bit left shift by 7. This is exact:
+ * a sum of two s8 values lies in [-256, 254], so the product lies in
+ * [-32768, 32512] and fits in s16.
+ */
+void packb_nr32_s8s8s16o16(
+	int8_t  *pack_b_buffer_s8s8s16o16,
+	int16_t *pack_b_column_sum,
+	const int8_t *b,
+	const dim_t ldb,
+	const dim_t cols,
+	const dim_t rows,
+	dim_t *rs_b,
+	dim_t *cs_b)
+{
+	dim_t NR = 32;
+
+	dim_t n_full_pieces = cols / NR;
+	dim_t n_full_pieces_loop_limit = n_full_pieces * NR;
+	dim_t n_partial_pieces = cols % NR;
+	dim_t k_full_pieces_blks = rows / 2;
+	dim_t k_full_pieces = k_full_pieces_blks * 2;
+	dim_t k_partial_pieces = rows % 2;
+
+	dim_t KC_updated = rows;
+
+	// Making multiple of 2 to suit k in vpmaddubsw
+	KC_updated += (KC_updated & 0x1);
+
+	// Running column sums for columns 0-15 (sum1) and 16-31 (sum2) of the
+	// current panel, plus a scratch register for the per-iteration row sum.
+	__m256i sum1, sum2;
+	__m256i temp1;
+
+	__m256i b_vec[2], inter_vec[2];
+
+	for (dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR)
+	{
+		// Load the column sums accumulated so far for this panel
+		sum1 = _mm256_loadu_si256( (__m256i const *)(pack_b_column_sum + jc) );
+		sum2 = _mm256_loadu_si256( (__m256i const *)(pack_b_column_sum + 16 + jc) );
+
+		for (dim_t kr = 0; kr < k_full_pieces; kr += 2)
+		{
+			// Read b[0,0], b[0,1], b[0,2]......., b[0,31]
+			b_vec[0] = _mm256_loadu_si256((__m256i const *)(b + (ldb * (kr + 0)) + jc));
+
+			//  Read b[1,0], b[1,1], b[1,2]......., b[1,31]
+			b_vec[1] = _mm256_loadu_si256((__m256i const *)(b + (ldb * (kr + 1)) + jc));
+
+			// Columns 0-15: widen both rows to s16 and add them
+			temp1 =
+				_mm256_add_epi16( _mm256_cvtepi8_epi16( _mm256_extractf128_si256( b_vec[0], 0 )),
+				_mm256_cvtepi8_epi16( _mm256_extractf128_si256( b_vec[1], 0 )));
+
+			// sum1 += 128 * row-pair sum (exact 16-bit shift)
+			sum1 = _mm256_add_epi16( sum1, _mm256_slli_epi16( temp1, 7 ) );
+
+			// Columns 16-31: widen both rows to s16 and add them
+			temp1 =
+				_mm256_add_epi16( _mm256_cvtepi8_epi16( _mm256_extractf128_si256( b_vec[0], 1 )),
+				_mm256_cvtepi8_epi16( _mm256_extractf128_si256( b_vec[1], 1 )));
+
+			// sum2 += 128 * row-pair sum (exact 16-bit shift)
+			sum2 = _mm256_add_epi16( sum2, _mm256_slli_epi16( temp1, 7 ) );
+
+			//  Reorder B matrix inputs to suit vpmaddubsw instructions
+			inter_vec[0] = _mm256_unpacklo_epi8(b_vec[0], b_vec[1]);
+			inter_vec[1] = _mm256_unpackhi_epi8(b_vec[0], b_vec[1]);
+
+			// The unpacks work per 128-bit lane; regroup the lanes so that
+			// b_vec[0] holds columns 0-15 and b_vec[1] holds columns 16-31.
+			b_vec[0] = _mm256_permute2f128_si256(inter_vec[0], inter_vec[1], 0x20);
+			b_vec[1] = _mm256_permute2f128_si256(inter_vec[0], inter_vec[1], 0x31);
+
+			// Store B[0,0], B[1,0], B[0,1], B[1,1], ......, B[0,15], B[1,15]
+			_mm256_storeu_si256((__m256i *)(pack_b_buffer_s8s8s16o16 + ((jc * KC_updated) + (kr * NR))), b_vec[0]);
+			// Store B[0,16], B[1,16], B[0,17], B[1,17], ......, B[0,31], B[1,31]
+			_mm256_storeu_si256((__m256i *)(pack_b_buffer_s8s8s16o16 + ((jc * KC_updated) + ((kr + 1) * NR))), b_vec[1]);
+		}
+
+		// Odd k: the last row is paired with a row of zeros
+		if (k_partial_pieces > 0)
+		{
+			// Read b[0,0], b[0,1], b[0,2]......., b[0,31]
+			b_vec[0] = _mm256_loadu_si256((__m256i const *)(b + (ldb * (k_full_pieces + 0)) + jc));
+			b_vec[1] = _mm256_setzero_si256(); // Initialize with zero for padding
+
+			// Columns 0-15: only the single tail row contributes
+			temp1 = _mm256_cvtepi8_epi16( _mm256_extractf128_si256( b_vec[0], 0 ));
+
+			// sum1 += 128 * tail row (exact 16-bit shift)
+			sum1 = _mm256_add_epi16( sum1, _mm256_slli_epi16( temp1, 7 ) );
+
+			// Columns 16-31: only the single tail row contributes
+			temp1 = _mm256_cvtepi8_epi16( _mm256_extractf128_si256( b_vec[0], 1 ));
+
+			// sum2 += 128 * tail row (exact 16-bit shift)
+			sum2 = _mm256_add_epi16( sum2, _mm256_slli_epi16( temp1, 7 ) );
+
+			// Reorder B matrix inputs to suit vpmaddubsw instructions
+			inter_vec[0] = _mm256_unpacklo_epi8(b_vec[0], b_vec[1]);
+			inter_vec[1] = _mm256_unpackhi_epi8(b_vec[0], b_vec[1]);
+
+			// Regroup the 128-bit lanes: columns 0-15, then columns 16-31
+			b_vec[0] = _mm256_permute2f128_si256(inter_vec[0], inter_vec[1], 0x20);
+			b_vec[1] = _mm256_permute2f128_si256(inter_vec[0], inter_vec[1], 0x31);
+
+			// Store B[0,0], 0, B[0,1], 0, ......, B[0,15], 0
+			_mm256_storeu_si256((__m256i *)(pack_b_buffer_s8s8s16o16 + ((jc * KC_updated) + (k_full_pieces * NR))), b_vec[0]);
+			// Store B[0,16], 0, B[0,17], 0, ......, B[0,31], 0
+			_mm256_storeu_si256((__m256i *)(pack_b_buffer_s8s8s16o16 + ((jc * KC_updated) + ((k_full_pieces + 1) * NR))), b_vec[1]);
+		}
+
+		// Write the updated column sums of this panel back
+		_mm256_storeu_si256( (__m256i *)(pack_b_column_sum + jc), sum1 );
+		_mm256_storeu_si256( (__m256i *)(pack_b_column_sum + 16 + jc), sum2 );
+	}
+
+	// B matrix packing when n < NR
+	if (n_partial_pieces > 0)
+	{
+		// Split into multiple smaller fringe kernels, so as to maximize
+		// vectorization after packing. Any n0 < NR(32) can be expressed
+		// as n0 = 16 + n`.
+		dim_t n0_16 = n_partial_pieces / 16;
+		dim_t n0_partial_rem = n_partial_pieces % 16;
+
+		dim_t n0_partial_pack = 0;
+
+		if (n0_16 == 1)
+		{
+			packb_nr16_s8s8s16o16(
+				(pack_b_buffer_s8s8s16o16 +
+				 (n_full_pieces_loop_limit * KC_updated)),
+				( pack_b_column_sum + ( n_full_pieces_loop_limit ) ),
+				(b + n_full_pieces_loop_limit), ldb, rows);
+
+			n0_partial_pack = 16;
+		}
+
+		if (n0_partial_rem > 0)
+		{
+			packb_nrlt16_s8s8s16o16(
+				(pack_b_buffer_s8s8s16o16 + (n_full_pieces_loop_limit * KC_updated) +
+				 (n0_partial_pack * KC_updated)),
+				( pack_b_column_sum + n_full_pieces_loop_limit + n0_partial_pack ),
+				(b + n_full_pieces_loop_limit + n0_partial_pack),
+				ldb, rows, n0_partial_rem);
+		}
+	}
+
+	// Report the strides of the packed layout to the caller
+	*rs_b = NR * 2;
+	*cs_b = NR;
+}
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_6x32rowmajor_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c
similarity index 61%
rename from addon/aocl_gemm/kernels/u8s8s16/lpgemm_6x32rowmajor_amd256.c
rename to kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c
index f7ad5f2d23..859a377ce0 100644
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_6x32rowmajor_amd256.c
+++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_6x32rowmajor_amd256.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,7 +33,9 @@
 */
 #include <immintrin.h>
 #include "blis.h"
-#include "lpgemm_kernels.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
 #include "lpgemm_s16_kern_macros.h"
 
 // 6x32 int8o16 kernel
@@ -45,6 +47,9 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 			&&POST_OPS_BIAS_6x32,
 			&&POST_OPS_RELU_6x32,
 			&&POST_OPS_RELU_SCALE_6x32,
+			&&POST_OPS_GELU_TANH_6x32,
+			&&POST_OPS_GELU_ERF_6x32,
+			&&POST_OPS_CLIP_6x32,
 			&&POST_OPS_DOWNSCALE_6x32
 		};
 
@@ -79,13 +84,11 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 				b, ((rs_b / 2) * 1), cs_b,
 				c, rs_c,
 				alpha, beta,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			b = b + (16 * k0_updated);
 			c = c + 16;
-			post_op_c_j += 16;
+			post_ops_attr.post_op_c_j += 16;
 		}
 
 		if (n0_rem > 0)
@@ -96,9 +99,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 				b, ((rs_b / 2) * 1), cs_b,
 				c, rs_c,
 				alpha, beta, n0_rem,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 		}
 
 		// If fringe cases are encountered, return early
@@ -317,111 +318,110 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 		__m256i alphav = _mm256_set1_epi16(alpha);
 		__m256i betav = _mm256_set1_epi16(beta);
 
-		// Scale by alpha
-		c_int16_0p0 = _mm256_mullo_epi16(alphav, c_int16_0p0);
-		c_int16_0p1 = _mm256_mullo_epi16(alphav, c_int16_0p1);
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int16_0p0 = _mm256_mullo_epi16(alphav, c_int16_0p0);
+			c_int16_0p1 = _mm256_mullo_epi16(alphav, c_int16_0p1);
 
-		c_int16_1p0 = _mm256_mullo_epi16(alphav, c_int16_1p0);
-		c_int16_1p1 = _mm256_mullo_epi16(alphav, c_int16_1p1);
+			c_int16_1p0 = _mm256_mullo_epi16(alphav, c_int16_1p0);
+			c_int16_1p1 = _mm256_mullo_epi16(alphav, c_int16_1p1);
 
-		c_int16_2p0 = _mm256_mullo_epi16(alphav, c_int16_2p0);
-		c_int16_2p1 = _mm256_mullo_epi16(alphav, c_int16_2p1);
+			c_int16_2p0 = _mm256_mullo_epi16(alphav, c_int16_2p0);
+			c_int16_2p1 = _mm256_mullo_epi16(alphav, c_int16_2p1);
 
-		c_int16_3p0 = _mm256_mullo_epi16(alphav, c_int16_3p0);
-		c_int16_3p1 = _mm256_mullo_epi16(alphav, c_int16_3p1);
+			c_int16_3p0 = _mm256_mullo_epi16(alphav, c_int16_3p0);
+			c_int16_3p1 = _mm256_mullo_epi16(alphav, c_int16_3p1);
 
-		c_int16_4p0 = _mm256_mullo_epi16(alphav, c_int16_4p0);
-		c_int16_4p1 = _mm256_mullo_epi16(alphav, c_int16_4p1);
+			c_int16_4p0 = _mm256_mullo_epi16(alphav, c_int16_4p0);
+			c_int16_4p1 = _mm256_mullo_epi16(alphav, c_int16_4p1);
 
-		c_int16_5p0 = _mm256_mullo_epi16(alphav, c_int16_5p0);
-		c_int16_5p1 = _mm256_mullo_epi16(alphav, c_int16_5p1);
+			c_int16_5p0 = _mm256_mullo_epi16(alphav, c_int16_5p0);
+			c_int16_5p1 = _mm256_mullo_epi16(alphav, c_int16_5p1);
+		}
 
 		// Scale C by beta.
 		if (beta != 0)
 		{
-			// c[0,0-15]
-			__m256i selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 0)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+			// For the downscaled api (C-s8), the output C matrix values
+			// need to be upscaled to s16 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				S8_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav)
 
-			// c[0, 16-31]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 0)) + (1 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_0p1 = _mm256_add_epi16(selector1, c_int16_0p1);
+				// c[0, 16-31]
+				S8_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav)
 
-			// c[1,0-15]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 1)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+				// c[1,0-15]
+				S8_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav)
 
-			// c[1,16-31]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 1)) + (1 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_1p1 = _mm256_add_epi16(selector1, c_int16_1p1);
+				// c[1,16-31]
+				S8_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav)
 
-			// c[2,0-15]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 2)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_2p0 = _mm256_add_epi16(selector1, c_int16_2p0);
+				// c[2,0-15]
+				S8_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav)
 
-			// c[2,16-31]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 2)) + (1 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_2p1 = _mm256_add_epi16(selector1, c_int16_2p1);
+				// c[2,16-31]
+				S8_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav)
 
-			// c[3,0-15]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 3)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_3p0 = _mm256_add_epi16(selector1, c_int16_3p0);
+				// c[3,0-15]
+				S8_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav)
 
-			// c[3,16-31]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 3)) + (1 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_3p1 = _mm256_add_epi16(selector1, c_int16_3p1);
+				// c[3,16-31]
+				S8_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav)
 
-			// c[4,0-15]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 4)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_4p0 = _mm256_add_epi16(selector1, c_int16_4p0);
+				// c[4,0-15]
+				S8_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav)
 
-			// c[4,16-31]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 4)) + (1 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_4p1 = _mm256_add_epi16(selector1, c_int16_4p1);
+				// c[4,16-31]
+				S8_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav)
 
-			// c[5,0-15]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 5)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_5p0 = _mm256_add_epi16(selector1, c_int16_5p0);
+				// c[5,0-15]
+				S8_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav)
 
-			// c[5,16-31]
-			selector1 =
-				_mm256_loadu_si256((__m256i const *)
-					(c + (rs_c * (ir + 5)) + (1 * 16)));
-			selector1 = _mm256_mullo_epi16(betav, selector1);
-			c_int16_5p1 = _mm256_add_epi16(selector1, c_int16_5p1);
+				// c[5,16-31]
+				S8_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav)
+			}
+			else
+			{
+				// c[0,0-15]
+				S16_S16_BETA_OP(c_int16_0p0,ir,0,0,alphav,betav)
+
+				// c[0, 16-31]
+				S16_S16_BETA_OP(c_int16_0p1,ir,0,1,alphav,betav)
+
+				// c[1,0-15]
+				S16_S16_BETA_OP(c_int16_1p0,ir,1,0,alphav,betav)
+
+				// c[1,16-31]
+				S16_S16_BETA_OP(c_int16_1p1,ir,1,1,alphav,betav)
+
+				// c[2,0-15]
+				S16_S16_BETA_OP(c_int16_2p0,ir,2,0,alphav,betav)
+
+				// c[2,16-31]
+				S16_S16_BETA_OP(c_int16_2p1,ir,2,1,alphav,betav)
+
+				// c[3,0-15]
+				S16_S16_BETA_OP(c_int16_3p0,ir,3,0,alphav,betav)
+
+				// c[3,16-31]
+				S16_S16_BETA_OP(c_int16_3p1,ir,3,1,alphav,betav)
+
+				// c[4,0-15]
+				S16_S16_BETA_OP(c_int16_4p0,ir,4,0,alphav,betav)
+
+				// c[4,16-31]
+				S16_S16_BETA_OP(c_int16_4p1,ir,4,1,alphav,betav)
+
+				// c[5,0-15]
+				S16_S16_BETA_OP(c_int16_5p0,ir,5,0,alphav,betav)
+
+				// c[5,16-31]
+				S16_S16_BETA_OP(c_int16_5p1,ir,5,1,alphav,betav)
+			}
 		}
 
 		// Post Ops
@@ -432,11 +432,11 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 			__m256i selector1 =
 				_mm256_loadu_si256( (__m256i const *)(
 					(int16_t *)post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 )) );
+								post_ops_attr.post_op_c_j + ( 0 * 16 )) );
 			__m256i selector2 =
 				_mm256_loadu_si256( (__m256i const *)(
 					(int16_t *)post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 )) );
+								post_ops_attr.post_op_c_j + ( 1 * 16 )) );
 
 			// c[0,0-15]
 			c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
@@ -563,6 +563,135 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
+POST_OPS_GELU_TANH_6x32:
+		{
+			__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+			__m256i q;
+
+			// c[0,0-15]
+			GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0,16-31]
+			GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,0-15]
+			GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,16-31]
+			GELU_TANH_S16_AVX2(c_int16_1p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,0-15]
+			GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,16-31]
+			GELU_TANH_S16_AVX2(c_int16_2p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,0-15]
+			GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,16-31]
+			GELU_TANH_S16_AVX2(c_int16_3p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,0-15]
+			GELU_TANH_S16_AVX2(c_int16_4p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,16-31]
+			GELU_TANH_S16_AVX2(c_int16_4p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,0-15]
+			GELU_TANH_S16_AVX2(c_int16_5p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,16-31]
+			GELU_TANH_S16_AVX2(c_int16_5p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x32:
+		{
+			__m256 x, r, y1, y2, x_erf;
+
+			// c[0,0-15]
+			GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+			// c[0,16-31]
+			GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+			// c[1,0-15]
+			GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+			// c[1,16-31]
+			GELU_ERF_S16_AVX2(c_int16_1p1, y1, y2, r, x, x_erf)
+
+			// c[2,0-15]
+			GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+			// c[2,16-31]
+			GELU_ERF_S16_AVX2(c_int16_2p1, y1, y2, r, x, x_erf)
+
+			// c[3,0-15]
+			GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+			// c[3,16-31]
+			GELU_ERF_S16_AVX2(c_int16_3p1, y1, y2, r, x, x_erf)
+
+			// c[4,0-15]
+			GELU_ERF_S16_AVX2(c_int16_4p0, y1, y2, r, x, x_erf)
+
+			// c[4,16-31]
+			GELU_ERF_S16_AVX2(c_int16_4p1, y1, y2, r, x, x_erf)
+
+			// c[5,0-15]
+			GELU_ERF_S16_AVX2(c_int16_5p0, y1, y2, r, x, x_erf)
+
+			// c[5,16-31]
+			GELU_ERF_S16_AVX2(c_int16_5p1, y1, y2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x32:
+		{
+			__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+			__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+			// c[0,0-15]
+			CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+			// c[0,16-31]
+			CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+			// c[1,0-15]
+			CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+			// c[1,16-31]
+			CLIP_S16_AVX2(c_int16_1p1, min, max)
+
+			// c[2,0-15]
+			CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+			// c[2,16-31]
+			CLIP_S16_AVX2(c_int16_2p1, min, max)
+
+			// c[3,0-15]
+			CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+			// c[3,16-31]
+			CLIP_S16_AVX2(c_int16_3p1, min, max)
+
+			// c[4,0-15]
+			CLIP_S16_AVX2(c_int16_4p0, min, max)
+
+			// c[4,16-31]
+			CLIP_S16_AVX2(c_int16_4p1, min, max)
+
+			// c[5,0-15]
+			CLIP_S16_AVX2(c_int16_5p0, min, max)
+
+			// c[5,16-31]
+			CLIP_S16_AVX2(c_int16_5p1, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
 POST_OPS_DOWNSCALE_6x32:
 		{
 			__m128i temp[2];
@@ -570,74 +699,115 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 			__m256 temp_float[2];
 			__m256 scale_1, scale_2;
 			__m256 res_1, res_2;
-			__m256i store_reg;
 
 			/* Load the scale vector values into the register*/
 			scale_1 =
 				_mm256_loadu_ps(
 				(float *)post_ops_list_temp->scale_factor +
-				post_op_c_j + (0 * 8));
+				post_ops_attr.post_op_c_j + (0 * 8));
 			scale_2 =
 				_mm256_loadu_ps(
 				(float *)post_ops_list_temp->scale_factor +
-				post_op_c_j + (1 * 8));
+				post_ops_attr.post_op_c_j + (1 * 8));
 
-			BLI_MM256_S16_DOWNSCALE(c_int16_0p0, c_int16_0p1, 0);
+			// Scale first 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2)
 
-			BLI_MM256_S16_DOWNSCALE(c_int16_1p0, c_int16_1p1, 1);
-
-			BLI_MM256_S16_DOWNSCALE(c_int16_2p0, c_int16_2p1, 2);
-
-			BLI_MM256_S16_DOWNSCALE(c_int16_3p0, c_int16_3p1, 3);
-
-			BLI_MM256_S16_DOWNSCALE(c_int16_4p0, c_int16_4p1, 4);
+			scale_1 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (2 * 8));
+			scale_2 =
+				_mm256_loadu_ps(
+				(float *)post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j + (3 * 8));
 
-			BLI_MM256_S16_DOWNSCALE(c_int16_5p0, c_int16_5p1, 5);
+			// Scale next 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p1, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p1, scale_1, scale_2)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
 POST_OPS_6x32_DISABLE:
 		;
 
-		// Store the results.
-		// c[0,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 0*16 )), c_int16_0p0 );
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Store the results in downscaled type (int8 instead of int16).
+			// c[0,0-31]
+			CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
+
+			// c[1,0-31]
+			CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0);
+
+			// c[2,0-31]
+			CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0);
+
+			// c[3,0-31]
+			CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0);
 
-		// c[0, 16-31]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 1*16 )), c_int16_0p1 );
+			// c[4,0-31]
+			CVT_STORE_S16_S8(c_int16_4p0, c_int16_4p1, 4, 0);
 
-		// c[1,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 0*16 )), c_int16_1p0 );
+			// c[5,0-31]
+			CVT_STORE_S16_S8(c_int16_5p0, c_int16_5p1, 5, 0);
+		}
+		// Case where the output C matrix is s16 or is the temp buffer used to
+		// store intermediate s16 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 0*16 )), c_int16_0p0 );
 
-		// c[1,16-31]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 1*16 )), c_int16_1p1 );
+			// c[0, 16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 0 ) ) + ( 1*16 )), c_int16_0p1 );
 
-		// c[2,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 0*16 )), c_int16_2p0 );
+			// c[1,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 0*16 )), c_int16_1p0 );
+
+			// c[1,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 1*16 )), c_int16_1p1 );
+
+			// c[2,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 0*16 )), c_int16_2p0 );
 
-		// c[2,16-31]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 1*16 )), c_int16_2p1 );
+			// c[2,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 1*16 )), c_int16_2p1 );
 
-		// c[3,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 0*16 )), c_int16_3p0 );
+			// c[3,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 0*16 )), c_int16_3p0 );
 
-		// c[3,16-31]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 1*16 )), c_int16_3p1 );
+			// c[3,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 1*16 )), c_int16_3p1 );
 
-		// c[4,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 0*16 )), c_int16_4p0 );
+			// c[4,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 0*16 )), c_int16_4p0 );
 
-		// c[4,16-31]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 1*16 )), c_int16_4p1 );
+			// c[4,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 1*16 )), c_int16_4p1 );
 
-		// c[5,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 0*16 )), c_int16_5p0 );
+			// c[5,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 0*16 )), c_int16_5p0 );
 
-		// c[5,16-31]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 1*16 )), c_int16_5p1 );
+			// c[5,16-31]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 1*16 )), c_int16_5p1 );
+		}
 		
 		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
+		post_ops_attr.post_op_c_i += MR;
 	}
 
 	if (m_partial_pieces > 0)
@@ -659,14 +829,12 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			// a pointer increment
 			a = a + (4 * ps_a);
 			m_full_pieces_loop_limit += 4;
-			post_op_c_i += 4;
+			post_ops_attr.post_op_c_i += 4;
 		}
 
 		if (m_partial2 == 1)
@@ -677,14 +845,12 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			// a pointer increment
 			a = a + (2 * ps_a);
 			m_full_pieces_loop_limit += 2;
-			post_op_c_i += 2;
+			post_ops_attr.post_op_c_i += 2;
 		}
 
 		if (m_partial == 1)
@@ -694,10 +860,10 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x32)
 				a, rs_a, cs_a,
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
-				alpha, beta,is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
-			post_op_c_i += 1;
+				alpha, beta,
+				post_ops_list, post_ops_attr);
+			post_ops_attr.post_op_c_i += 1;
 		}
 	}
 }
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_m_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c
similarity index 58%
rename from addon/aocl_gemm/kernels/u8s8s16/lpgemm_m_fringe_amd256.c
rename to kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c
index 4934b8b11c..863c57a5b6 100644
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_m_fringe_amd256.c
+++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_m_fringe_amd256.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,9 +33,10 @@
 */
 
 #include <immintrin.h>
-
 #include "blis.h"
-#include "lpgemm_kernels.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
 #include "lpgemm_s16_kern_macros.h"
 
 // 4x32 int8o16 kernel
@@ -49,6 +50,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
 			&&POST_OPS_BIAS_4x32,
 			&&POST_OPS_RELU_4x32,
 			&&POST_OPS_RELU_SCALE_4x32,
+			&&POST_OPS_GELU_TANH_4x32,
+			&&POST_OPS_GELU_ERF_4x32,
+			&&POST_OPS_CLIP_4x32,
 			&&POST_OPS_DOWNSCALE_4x32
 		};
 
@@ -195,61 +199,80 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
-	c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
 
-	c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
-	c_int16_1p1 = _mm256_mullo_epi16(selector1, c_int16_1p1);
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p1 = _mm256_mullo_epi16(selector1, c_int16_1p1);
 
-	c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
-	c_int16_2p1 = _mm256_mullo_epi16(selector1, c_int16_2p1);
+		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+		c_int16_2p1 = _mm256_mullo_epi16(selector1, c_int16_2p1);
 
-	c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
-	c_int16_3p1 = _mm256_mullo_epi16(selector1, c_int16_3p1);
+		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+		c_int16_3p1 = _mm256_mullo_epi16(selector1, c_int16_3p1);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
 
-		// c[0, 16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p1 = _mm256_add_epi16(selector1, c_int16_0p1);
+			// c[0, 16-31]
+			S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
 
-		// c[1,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 1) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
 
-		// c[1,16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 1) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p1 = _mm256_add_epi16(selector1, c_int16_1p1);
+			// c[1,16-31]
+			S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
 
-		// c[2,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 2) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_2p0 = _mm256_add_epi16(selector1, c_int16_2p0);
+			// c[2,0-15]
+			S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
 
-		// c[2,16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 2) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_2p1 = _mm256_add_epi16(selector1, c_int16_2p1);
+			// c[2,16-31]
+			S8_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2)
 
-		// c[3,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 3) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_3p0 = _mm256_add_epi16(selector1, c_int16_3p0);
+			// c[3,0-15]
+			S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
 
-		// c[3,16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 3) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_3p1 = _mm256_add_epi16(selector1, c_int16_3p1);
+			// c[3,16-31]
+			S8_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S16_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			S16_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+
+			// c[2,0-15]
+			S16_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			S16_S16_BETA_OP(c_int16_2p1,0,2,1,selector1,selector2)
+
+			// c[3,0-15]
+			S16_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			S16_S16_BETA_OP(c_int16_3p1,0,3,1,selector1,selector2)
+		}
 	}
 	
 	// Post Ops
@@ -259,10 +282,10 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
 	{
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
 		selector2 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 1 * 16 )) );
 		
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
@@ -351,6 +374,99 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_4x32:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0,16-31]
+		GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,16-31]
+		GELU_TANH_S16_AVX2(c_int16_1p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,0-15]
+		GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,16-31]
+		GELU_TANH_S16_AVX2(c_int16_2p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,0-15]
+		GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,16-31]
+		GELU_TANH_S16_AVX2(c_int16_3p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x32:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[0,16-31]
+		GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[1,16-31]
+		GELU_ERF_S16_AVX2(c_int16_1p1, y1, y2, r, x, x_erf)
+
+		// c[2,0-15]
+		GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+		// c[2,16-31]
+		GELU_ERF_S16_AVX2(c_int16_2p1, y1, y2, r, x, x_erf)
+
+		// c[3,0-15]
+		GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+		// c[3,16-31]
+		GELU_ERF_S16_AVX2(c_int16_3p1, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x32:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[0,16-31]
+		CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[1,16-31]
+		CLIP_S16_AVX2(c_int16_1p1, min, max)
+
+		// c[2,0-15]
+		CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+		// c[2,16-31]
+		CLIP_S16_AVX2(c_int16_2p1, min, max)
+
+		// c[3,0-15]
+		CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+		// c[3,16-31]
+		CLIP_S16_AVX2(c_int16_3p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_4x32:
 	{
 		__m128i temp[2];
@@ -358,55 +474,90 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x32)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		/* Load the scale vector values into the register*/
 		scale_1 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (0 * 8));
+			post_ops_attr.post_op_c_j + (0 * 8));
 		scale_2 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (1 * 8));
+			post_ops_attr.post_op_c_j + (1 * 8));
 
-		BLI_MM256_S16_DOWNSCALE(c_int16_0p0, c_int16_0p1, 0);
+		// Scale first 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
 
-		BLI_MM256_S16_DOWNSCALE(c_int16_1p0, c_int16_1p1, 1);
-
-		BLI_MM256_S16_DOWNSCALE(c_int16_2p0, c_int16_2p1, 2);
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (2 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (3 * 8));
 
-		BLI_MM256_S16_DOWNSCALE(c_int16_3p0, c_int16_3p1, 3);
+		// Scale next 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p1, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_4x32_DISABLE:
 	;
 
-	// Store the results.
-	// c[0,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c *  0 ) + ( 0*16 )), c_int16_0p0 );
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		// c[0,0-31]
+		CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
 
-	// c[0, 16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+		// c[1,0-31]
+		CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0);
 
-	// c[1,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0*16 )), c_int16_1p0 );
+		// c[2,0-31]
+		CVT_STORE_S16_S8(c_int16_2p0, c_int16_2p1, 2, 0);
 
-	// c[1,16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 1*16 )), c_int16_1p1 );
+		// c[3,0-31]
+		CVT_STORE_S16_S8(c_int16_3p0, c_int16_3p1, 3, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0*16 )), c_int16_0p0 );
 
-	// c[2,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2  ) + ( 0*16 )), c_int16_2p0 );
+		// c[0, 16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
 
-	// c[2,16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 1*16 )), c_int16_2p1 );
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0*16 )), c_int16_1p0 );
+
+		// c[1,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 1*16 )), c_int16_1p1 );
 
-	// c[3,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 0*16 )), c_int16_3p0 );
+		// c[2,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 0*16 )), c_int16_2p0 );
 
-	// c[3,16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 1*16 )), c_int16_3p1 );
+		// c[2,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 1*16 )), c_int16_2p1 );
+
+		// c[3,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 0*16 )), c_int16_3p0 );
+
+		// c[3,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 1*16 )), c_int16_3p1 );
+	}
 }
 
 
@@ -421,6 +572,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
 			&&POST_OPS_BIAS_2x32,
 			&&POST_OPS_RELU_2x32,
 			&&POST_OPS_RELU_SCALE_2x32,
+			&&POST_OPS_GELU_TANH_2x32,
+			&&POST_OPS_GELU_ERF_2x32,
+			&&POST_OPS_CLIP_2x32,
 			&&POST_OPS_DOWNSCALE_2x32
 		};
 
@@ -512,35 +666,50 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
-	c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
 
-	c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
-	c_int16_1p1 = _mm256_mullo_epi16(selector1, c_int16_1p1);
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p1 = _mm256_mullo_epi16(selector1, c_int16_1p1);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
 
-		// c[0, 16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p1 = _mm256_add_epi16(selector1, c_int16_0p1);
+			// c[0, 16-31]
+			S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
 
-		// c[1,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 1) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
 
-		// c[1,16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 1) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p1 = _mm256_add_epi16(selector1, c_int16_1p1);
+			// c[1,16-31]
+			S8_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S16_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			S16_S16_BETA_OP(c_int16_1p1,0,1,1,selector1,selector2)
+		}
 	}
 
 		// Post Ops
@@ -550,10 +719,10 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
 	{
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
 		selector2 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 1 * 16 )) );
 		
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
@@ -606,6 +775,63 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_2x32:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0,16-31]
+		GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,16-31]
+		GELU_TANH_S16_AVX2(c_int16_1p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x32:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[0,16-31]
+		GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[1,16-31]
+		GELU_ERF_S16_AVX2(c_int16_1p1, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x32:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[0,16-31]
+		CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[1,16-31]
+		CLIP_S16_AVX2(c_int16_1p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_2x32:
 	{
 		__m128i temp[2];
@@ -613,39 +839,68 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x32)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		/* Load the scale vector values into the register*/
 		scale_1 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (0 * 8));
+			post_ops_attr.post_op_c_j + (0 * 8));
 		scale_2 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (1 * 8));
+			post_ops_attr.post_op_c_j + (1 * 8));
 
-		BLI_MM256_S16_DOWNSCALE(c_int16_0p0, c_int16_0p1, 0);
+		// Scale first 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
 
-		BLI_MM256_S16_DOWNSCALE(c_int16_1p0, c_int16_1p1, 1);
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (2 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (3 * 8));
+
+		// Scale next 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p1, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_2x32_DISABLE:
 	;
 
-	// Store the results.
-	// c[0,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c *  0 ) + ( 0*16 )), c_int16_0p0 );
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		// c[0,0-31]
+		CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
 
-	// c[0, 16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+		// c[1,0-31]
+		CVT_STORE_S16_S8(c_int16_1p0, c_int16_1p1, 1, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0*16 )), c_int16_0p0 );
 
-	// c[1,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0*16 )), c_int16_1p0 );
+		// c[0, 16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
 
-	// c[1,16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 1*16 )), c_int16_1p1 );
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0*16 )), c_int16_1p0 );
+
+		// c[1,16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 1*16 )), c_int16_1p1 );
+	}
 }
 
 // 1x32 int8o16 kernel
@@ -659,6 +914,9 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
 			&&POST_OPS_BIAS_1x32,
 			&&POST_OPS_RELU_1x32,
 			&&POST_OPS_RELU_SCALE_1x32,
+			&&POST_OPS_GELU_TANH_1x32,
+			&&POST_OPS_GELU_ERF_1x32,
+			&&POST_OPS_CLIP_1x32,
 			&&POST_OPS_DOWNSCALE_1x32
 		};
 
@@ -722,22 +980,35 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
-	c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		c_int16_0p1 = _mm256_mullo_epi16(selector1, c_int16_0p1);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+		// For the downscaled API (C-s8), the output C matrix values
+		// need to be upscaled to s16 before they are used for beta scaling.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
 
-		// c[0, 16-31]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (1 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p1 = _mm256_add_epi16(selector1, c_int16_0p1);
+			// c[0, 16-31]
+			S8_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			S16_S16_BETA_OP(c_int16_0p1,0,0,1,selector1,selector2)
+		}
 	}
 
 		// Post Ops
@@ -747,10 +1018,10 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
 	{
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
 		selector2 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 1 * 16 )) );
 		
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
@@ -785,6 +1056,45 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_1x32:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0,16-31]
+		GELU_TANH_S16_AVX2(c_int16_0p1, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x32:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[0,16-31]
+		GELU_ERF_S16_AVX2(c_int16_0p1, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x32:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[0,16-31]
+		CLIP_S16_AVX2(c_int16_0p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_1x32:
 	{
 		__m128i temp[2];
@@ -792,29 +1102,56 @@ LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x32)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		/* Load the scale vector values into the register*/
 		scale_1 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (0 * 8));
+			post_ops_attr.post_op_c_j + (0 * 8));
+		scale_2 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (1 * 8));
+
+		// Scale first 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+
+		scale_1 =
+			_mm256_loadu_ps(
+			(float *)post_ops_list_temp->scale_factor +
+			post_ops_attr.post_op_c_j + (2 * 8));
 		scale_2 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (1 * 8));
+			post_ops_attr.post_op_c_j + (3 * 8));
 
-		BLI_MM256_S16_DOWNSCALE(c_int16_0p0, c_int16_0p1, 0);
+		// Scale next 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p1, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_1x32_DISABLE:
 	;
 
-	// Store the results.
-	// c[0,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c *  0 ) + ( 0*16 )), c_int16_0p0 );
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		// c[0,0-31]
+		CVT_STORE_S16_S8(c_int16_0p0, c_int16_0p1, 0, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0*16 )), c_int16_0p0 );
 
-	// c[0, 16-31]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+		// c[0, 16-31]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 1*16 )), c_int16_0p1 );
+	}
 }
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_mn_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c
similarity index 58%
rename from addon/aocl_gemm/kernels/u8s8s16/lpgemm_mn_fringe_amd256.c
rename to kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c
index f24455036e..e4b04e80e1 100644
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_mn_fringe_amd256.c
+++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_mn_fringe_amd256.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,9 +33,10 @@
 */
 
 #include <immintrin.h>
-
 #include "blis.h"
-#include "lpgemm_kernels.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
 #include "lpgemm_s16_kern_macros.h"
 
 // 4x32 int8o16 kernel
@@ -49,6 +50,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
 			&&POST_OPS_BIAS_4x16,
 			&&POST_OPS_RELU_4x16,
 			&&POST_OPS_RELU_SCALE_4x16,
+			&&POST_OPS_GELU_TANH_4x16,
+			&&POST_OPS_GELU_ERF_4x16,
+			&&POST_OPS_CLIP_4x16,
 			&&POST_OPS_DOWNSCALE_4x16
 		};
 
@@ -168,37 +172,52 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
 
-	c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
 
-	c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
 
-	c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+		// For the downscaled API (C-s8), the output C matrix values
+		// need to be upscaled to s16 before they are used for beta scaling.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
 
-		// c[1,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 1) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
 
-		// c[2,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 2) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_2p0 = _mm256_add_epi16(selector1, c_int16_2p0);
+			// c[2,0-15]
+			S8_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
 
-		// c[3,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 3) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_3p0 = _mm256_add_epi16(selector1, c_int16_3p0);
+			// c[3,0-15]
+			S8_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+
+			// c[2,0-15]
+			S16_S16_BETA_OP(c_int16_2p0,0,2,0,selector1,selector2)
+
+			// c[3,0-15]
+			S16_S16_BETA_OP(c_int16_3p0,0,3,0,selector1,selector2)
+		}
 	}
 
 	// Post Ops
@@ -208,8 +227,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
 	{
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 )) );
-							
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
 
@@ -261,6 +280,63 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_4x16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,0-15]
+		GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,0-15]
+		GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[2,0-15]
+		GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+		// c[3,0-15]
+		GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[2,0-15]
+		CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+		// c[3,0-15]
+		CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_4x16:
 	{
 		__m128i temp[2];
@@ -268,39 +344,59 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4x16)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		/* Load the scale vector values into the register*/
 		scale_1 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (0 * 8));
+			post_ops_attr.post_op_c_j + (0 * 8));
 		scale_2 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (1 * 8));
+			post_ops_attr.post_op_c_j + (1 * 8));
 
-		BLI_MM256_S16_DOWNSCALE2(c_int16_0p0, c_int16_1p0, 0, 1);
-
-		BLI_MM256_S16_DOWNSCALE2(c_int16_2p0, c_int16_3p0, 2, 3);
+		// Scale first 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_4x16_DISABLE:
 	;
 
-	// Store the results.
-	// c[0,0-15]
-	_mm256_storeu_si256((__m256i *)(c + (rs_c * 0) + (0 * 16)), c_int16_0p0);
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
 
-	// c[1,0-15]
-	_mm256_storeu_si256((__m256i *)(c + (rs_c * 1) + (0 * 16)), c_int16_1p0);
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0);
 
-	// c[2,0-15]
-	_mm256_storeu_si256((__m256i *)(c + (rs_c * 2) + (0 * 16)), c_int16_2p0);
+		// c[2-3,0-15]
+		CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0 * 16 ) ), c_int16_0p0 );
 
-	// c[3,0-15]
-	_mm256_storeu_si256((__m256i *)(c + (rs_c * 3) + (0 * 16)), c_int16_3p0);
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0 * 16 ) ), c_int16_1p0 );
+
+		// c[2,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 2 ) + ( 0 * 16 ) ), c_int16_2p0 );
+
+		// c[3,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 3 ) + ( 0 * 16 ) ), c_int16_3p0 );
+	}
 }
 
 // 4x16 int8o16 kernel
@@ -314,6 +410,9 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
 			&&POST_OPS_BIAS_4xlt16,
 			&&POST_OPS_RELU_4xlt16,
 			&&POST_OPS_RELU_SCALE_4xlt16,
+			&&POST_OPS_GELU_TANH_4xlt16,
+			&&POST_OPS_GELU_ERF_4xlt16,
+			&&POST_OPS_CLIP_4xlt16,
 			&&POST_OPS_DOWNSCALE_4xlt16
 		};
 
@@ -441,42 +540,66 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
 
-	c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
 
-	c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
 
-	c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		memcpy(buf0, (c + (rs_c * 0)), (n0_rem * sizeof(int16_t)));
-		memcpy(buf1, (c + (rs_c * 1)), (n0_rem * sizeof(int16_t)));
-		memcpy(buf2, (c + (rs_c * 2)), (n0_rem * sizeof(int16_t)));
-		memcpy(buf3, (c + (rs_c * 3)), (n0_rem * sizeof(int16_t)));
+		// For the downscaled API (C-s8), the output C matrix values
+		// need to be upscaled to s16 before they are used for beta scaling.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf0);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
 
-		// c[1,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf1);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+			// c[0,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
 
-		// c[2,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf2);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_2p0 = _mm256_add_epi16(selector1, c_int16_2p0);
+			// c[1,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
 
-		// c[3,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf3);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_3p0 = _mm256_add_epi16(selector1, c_int16_3p0);
+			// c[2,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+			// c[3,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+		}
+		else
+		{
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( buf0, ( c + ( rs_c * 0 ) ), n0_rem_bytes );
+			memcpy( buf1, ( c + ( rs_c * 1 ) ), n0_rem_bytes );
+			memcpy( buf2, ( c + ( rs_c * 2 ) ), n0_rem_bytes );
+			memcpy( buf3, ( c + ( rs_c * 3 ) ), n0_rem_bytes );
+
+			// c[0,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+			// c[2,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+			// c[3,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+		}
 	}
 
 	// Post Ops
@@ -484,9 +607,9 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
 	POST_OP_LABEL_LASTK_SAFE_JUMP
 POST_OPS_BIAS_4xlt16:
 	{
-		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1 
-			+ post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
-		
+		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1
+			+ post_ops_attr.post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
+
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *) buf0 );
 
@@ -541,6 +664,63 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_4xlt16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2,0-15]
+		GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3,0-15]
+		GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4xlt16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		// c[2,0-15]
+		GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+		// c[3,0-15]
+		GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4xlt16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		// c[2,0-15]
+		CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+		// c[3,0-15]
+		CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_4xlt16:
 	{
 		__m128i temp[2];
@@ -548,49 +728,78 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_4xlt16)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		float float_buf[16];
-		int8_t store_buf[16];
 
 		memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
-				post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
 
 		// Load the scale vector values into the register
 		scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
 		scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
 
-		BLI_MM256_S16_DOWNSCALE2_LT16(c_int16_0p0, c_int16_1p0, 0, 1)
-
-		BLI_MM256_S16_DOWNSCALE2_LT16(c_int16_2p0, c_int16_3p0, 2, 3)
+		// Scale first 16 columns of the 4 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_4xlt16_DISABLE:
 	;
 
-	// c[0,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf0, c_int16_0p0);
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
 
-	// c[1,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf1, c_int16_1p0);
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1);
 
-	// c[2,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf2, c_int16_2p0);
+		// c[2-3,0-15]
+		CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3);
 
-	// c[3,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf3, c_int16_3p0);
+		dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-	memcpy(c + (rs_c * 0) + (0 * 16), buf0, (n0_rem * sizeof(int16_t)));
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
 
-	// c[1,0-15]
-	memcpy(c + (rs_c * +1) + (0 * 16), buf1, (n0_rem * sizeof(int16_t)));
+		// c[1,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf1, c_int16_1p0 );
 
-	// c[2,0-15]
-	memcpy(c + (rs_c * +2) + (0 * 16), buf2, (n0_rem * sizeof(int16_t)));
+		// c[2,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf2, c_int16_2p0 );
+
+		// c[3,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf3, c_int16_3p0 );
+
+		dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+		memcpy( c + ( rs_c * 0 ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+
+		// c[1,0-15]
+		memcpy( c + ( rs_c * 1 ) + ( 0 * 16 ), buf1, n0_rem_bytes );
 
-	// c[3,0-15]
-	memcpy(c + (rs_c * +3) + (0 * 16), buf3, (n0_rem * sizeof(int16_t)));
+		// c[2,0-15]
+		memcpy( c + ( rs_c * 2 ) + ( 0 * 16 ), buf2, n0_rem_bytes );
+
+		// c[3,0-15]
+		memcpy( c + ( rs_c * 3 ) + ( 0 * 16 ), buf3, n0_rem_bytes );
+	}
 }
 
 // 2x16 int8o16 kernel
@@ -604,6 +813,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
 			&&POST_OPS_BIAS_2x16,
 			&&POST_OPS_RELU_2x16,
 			&&POST_OPS_RELU_SCALE_2x16,
+			&&POST_OPS_GELU_TANH_2x16,
+			&&POST_OPS_GELU_ERF_2x16,
+			&&POST_OPS_CLIP_2x16,	
 			&&POST_OPS_DOWNSCALE_2x16
 		};
 
@@ -681,23 +893,36 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
 
-	c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+		// For the downscaled API (C-s8), the output C matrix values
+		// need to be upscaled to s16 before they are used for beta scaling.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
 
-		// c[1,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 1) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+			// c[1,0-15]
+			S8_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP(c_int16_1p0,0,1,0,selector1,selector2)
+		}
 	}
 
 	// Post Ops
@@ -707,8 +932,8 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
 	{
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 )) );
-		
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
+
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
 
@@ -742,6 +967,45 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_2x16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_2x16:
 	{
 		__m128i temp[2];
@@ -749,31 +1013,48 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2x16)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		/* Load the scale vector values into the register*/
 		scale_1 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (0 * 8));
+			post_ops_attr.post_op_c_j + (0 * 8));
 		scale_2 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (1 * 8));
+			post_ops_attr.post_op_c_j + (1 * 8));
 
-		BLI_MM256_S16_DOWNSCALE2(c_int16_0p0, c_int16_1p0, 0, 1);
+		// Scale first 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_2x16_DISABLE:
 	;
 
-	// Store the results.
-	// c[0,0-15]
-	_mm256_storeu_si256((__m256i *)(c + (rs_c * 0) + (0 * 16)), c_int16_0p0);
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
 
-	// c[1,0-15]
-	_mm256_storeu_si256((__m256i *)(c + (rs_c * 1) + (0 * 16)), c_int16_1p0);
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0 * 16 ) ), c_int16_0p0 );
+
+		// c[1,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 1 ) + ( 0 * 16 ) ), c_int16_1p0 );
+	}
 }
 
 // 2xlt16 int8o16 kernel
@@ -787,6 +1068,9 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
 			&&POST_OPS_BIAS_2xlt16,
 			&&POST_OPS_RELU_2xlt16,
 			&&POST_OPS_RELU_SCALE_2xlt16,
+			&&POST_OPS_GELU_TANH_2xlt16,
+			&&POST_OPS_GELU_ERF_2xlt16,
+			&&POST_OPS_CLIP_2xlt16,
 			&&POST_OPS_DOWNSCALE_2xlt16
 		};
 
@@ -867,26 +1151,46 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
 
-	c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		memcpy(buf0, (c + (rs_c * 0)), (n0_rem * sizeof(int16_t)));
-		memcpy(buf1, (c + (rs_c * 1)), (n0_rem * sizeof(int16_t)));
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf0);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
 
-		// c[1,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf1);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+			// c[0,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+		}
+		else
+		{
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( buf0, ( c + ( rs_c * 0 ) ), n0_rem_bytes );
+			memcpy( buf1, ( c + ( rs_c * 1 ) ), n0_rem_bytes );
+
+			// c[0,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+			// c[1,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+		}
 	}
 
 	// Post Ops
@@ -895,11 +1199,11 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
 POST_OPS_BIAS_2xlt16:
 	{
 		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1 +
-			post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
+			post_ops_attr.post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
 
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *) buf0);
-		
+
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
 
@@ -933,6 +1237,45 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_2xlt16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1,0-15]
+		GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2xlt16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		// c[1,0-15]
+		GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2xlt16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		// c[1,0-15]
+		CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_2xlt16:
 	{
 		__m128i temp[2];
@@ -940,36 +1283,59 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_2xlt16)
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		float float_buf[16];
-		int8_t store_buf[16];
 
 		memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
-				post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
 
 		// Load the scale vector values into the register
 		scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
 		scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
 
-		BLI_MM256_S16_DOWNSCALE2_LT16(c_int16_0p0, c_int16_1p0, 0, 1)
+		// Scale first 16 columns of the 2 rows.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+		CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_2xlt16_DISABLE:
 	;
 
-	// c[0,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf0, c_int16_0p0);
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+
+		// c[0-1,0-15]
+		CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1);
 
-	// c[1,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf1, c_int16_1p0);
+		dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-	// c[0,0-15]
-	memcpy(c + (rs_c * 0) + (0 * 16), buf0, (n0_rem * sizeof(int16_t)));
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
 
-	// c[1,0-15]
-	memcpy(c + (rs_c * 1) + (0 * 16), buf1, (n0_rem * sizeof(int16_t)));
+		// c[1,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf1, c_int16_1p0 );
+
+		dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+		memcpy( c + ( rs_c * 0 ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+
+		// c[1,0-15]
+		memcpy( c + ( rs_c * 1 ) + ( 0 * 16 ), buf1, n0_rem_bytes );
+	}
 }
 
 // 1x16 int8o16 kernel
@@ -983,6 +1349,9 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
 			&&POST_OPS_BIAS_1x16,
 			&&POST_OPS_RELU_1x16,
 			&&POST_OPS_RELU_SCALE_1x16,
+			&&POST_OPS_GELU_TANH_1x16,
+			&&POST_OPS_GELU_ERF_1x16,
+			&&POST_OPS_CLIP_1x16,
 			&&POST_OPS_DOWNSCALE_1x16
 		};
 
@@ -1038,16 +1407,28 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * 0) + (0 * 16)));
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			S8_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			S16_S16_BETA_OP(c_int16_0p0,0,0,0,selector1,selector2)
+		}
 	}
 
 	// Post Ops
@@ -1057,7 +1438,7 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
 	{
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 )) );
+							post_ops_attr.post_op_c_j + ( 0 * 16 )) );
 
 		// c[0,0-15]
 		c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
@@ -1083,37 +1464,82 @@ LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1x16)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_1x16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_1x16:
 	{
 		__m128i temp[2];
-		__m256i temp_32[2], zero_reg;
+		__m256i temp_32[2];
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		/* Load the scale vector values into the register*/
 		scale_1 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (0 * 8));
+			post_ops_attr.post_op_c_j + (0 * 8));
 		scale_2 =
 			_mm256_loadu_ps(
 			(float *)post_ops_list_temp->scale_factor +
-			post_op_c_j + (1 * 8));
+			post_ops_attr.post_op_c_j + (1 * 8));
 
-		zero_reg = _mm256_setzero_si256();
-
-		BLI_MM256_S16_DOWNSCALE2_EDGE(c_int16_0p0, 0)
+		// Scale first 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_1x16_DISABLE:
 	;
 
-	// Store the results.
-	// c[0,0-15]
-	_mm256_storeu_si256( (__m256i *)(c + ( rs_c *  0 ) + ( 0*16 )), c_int16_0p0 );
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+		__m256i zero_reg = _mm256_setzero_si256();
+
+		// c[0,0-15]
+		CVT_STORE_S16_S8_1ROW(c_int16_0p0, zero_reg, 0, 0);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * 0 ) + ( 0 * 16 ) ), c_int16_0p0 );
+	}
 }
 
 // 1xlt16 int8o16 kernel
@@ -1127,6 +1553,9 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
 			&&POST_OPS_BIAS_1xlt16,
 			&&POST_OPS_RELU_1xlt16,
 			&&POST_OPS_RELU_SCALE_1xlt16,
+			&&POST_OPS_GELU_TANH_1xlt16,
+			&&POST_OPS_GELU_ERF_1xlt16,
+			&&POST_OPS_CLIP_1xlt16,
 			&&POST_OPS_DOWNSCALE_1xlt16
 		};
 
@@ -1184,18 +1613,36 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
 	__m256i selector1 = _mm256_set1_epi16(alpha);
 	__m256i selector2 = _mm256_set1_epi16(beta);
 
-	// Scale by alpha
-	c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+	}
 
 	// Scale C by beta.
 	if (beta != 0)
 	{
-		memcpy(buf0, (c + (rs_c * 0)), (n0_rem * sizeof(int16_t)));
+		// For the downscaled api (C-s8), the output C matrix values
+		// need to be upscaled to s16 to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-		// c[0,0-15]
-		selector1 = _mm256_loadu_si256((__m256i const *)buf0);
-		selector1 = _mm256_mullo_epi16(selector2, selector1);
-		c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+			S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+
+			// c[0,0-15]
+			S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+		}
+		else
+		{
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+			memcpy( buf0, ( c + ( rs_c * 0 ) ), n0_rem_bytes );
+
+			// c[0,0-15]
+			S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+		}
 	}
 
 	// Post Ops
@@ -1203,8 +1650,8 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
 	POST_OP_LABEL_LASTK_SAFE_JUMP
 POST_OPS_BIAS_1xlt16:
 	{
-		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1 
-			+ post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
+		memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1
+			+ post_ops_attr.post_op_c_j + ( 0 * 16 ) ), ( n0_rem * sizeof( int16_t ) ) );
 
 		selector1 =
 			_mm256_loadu_si256( (__m256i const *)buf0 );
@@ -1233,34 +1680,88 @@ LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_1xlt16)
 
 		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
+POST_OPS_GELU_TANH_1xlt16:
+	{
+		__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+		__m256i q;
+
+		// c[0,0-15]
+		GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1xlt16:
+	{
+		__m256 x, r, y1, y2, x_erf;
+
+		// c[0,0-15]
+		GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1xlt16:
+	{
+		__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+		__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+		// c[0,0-15]
+		CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
 POST_OPS_DOWNSCALE_1xlt16:
 	{
 		__m128i temp[2];
-		__m256i temp_32[2], zero_reg;
+		__m256i temp_32[2];
 		__m256 temp_float[2];
 		__m256 scale_1, scale_2;
 		__m256 res_1, res_2;
-		__m256i store_reg;
 
 		float float_buf[16];
-		int8_t store_buf[16];
 
 		memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
-				post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
 
 		// Load the scale vector values into the register
 		scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
 		scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
 
-		zero_reg = _mm256_setzero_si256();
+		// Scale first 16 columns of the single row.
+		CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
 
-		BLI_MM256_S16_DOWNSCALE2_EDGE_LT16(c_int16_0p0, 0)
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 	}
 POST_OPS_1xlt16_DISABLE:
 	;
 
-	// c[0,0-15]
-	_mm256_storeu_si256((__m256i_u *)buf0, c_int16_0p0);
+	// Case where the output C matrix is s8 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+		 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Store the results in downscaled type (int8 instead of int16).
+		__m128i temp[2];
+		__m256i zero_reg = _mm256_setzero_si256();
+
+		// c[0,0-15]
+		CVT_STORE_S16_S8_1ROW_NLT16(c_int16_0p0, zero_reg, buf0);
+
+		dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-	memcpy(c + (rs_c * 0) + (0 * 16), buf0, (n0_rem * sizeof(int16_t)));
+		CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+	}
+	// Case where the output C matrix is s16 or is the temp buffer used to
+	// store intermediate s16 accumulated values for downscaled (C-s8) api.
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
+
+		dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+
+		memcpy( c + ( rs_c * 0 ) + ( 0 * 16 ), buf0, n0_rem_bytes );
+	}
 }
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_n_fringe_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c
similarity index 61%
rename from addon/aocl_gemm/kernels/u8s8s16/lpgemm_n_fringe_amd256.c
rename to kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c
index b24d49dac7..a3270f3091 100644
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_n_fringe_amd256.c
+++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_n_fringe_amd256.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,9 +33,10 @@
 */
 
 #include <immintrin.h>
-
 #include "blis.h"
-#include "lpgemm_kernels.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
 #include "lpgemm_s16_kern_macros.h"
 
 // 6x16 int8o16 kernel
@@ -50,6 +51,9 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 			&&POST_OPS_BIAS_6x16,
 			&&POST_OPS_RELU_6x16,
 			&&POST_OPS_RELU_SCALE_6x16,
+			&&POST_OPS_GELU_TANH_6x16,
+			&&POST_OPS_GELU_ERF_6x16,
+			&&POST_OPS_CLIP_6x16,
 			&&POST_OPS_DOWNSCALE_6x16
 		};
 
@@ -222,51 +226,68 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 		__m256i selector1 = _mm256_set1_epi16(alpha);
 		__m256i selector2 = _mm256_set1_epi16(beta);
 
-		// Scale by alpha
-		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
 
-		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+			c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
 
-		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+			c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
 
-		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+			c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
 
-		c_int16_4p0 = _mm256_mullo_epi16(selector1, c_int16_4p0);
+			c_int16_4p0 = _mm256_mullo_epi16(selector1, c_int16_4p0);
 
-		c_int16_5p0 = _mm256_mullo_epi16(selector1, c_int16_5p0);
+			c_int16_5p0 = _mm256_mullo_epi16(selector1, c_int16_5p0);
+		}
 
 		// Scale C by beta.
 		if (beta != 0)
 		{
-			// c[0,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * (ir + 0)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
+			// For the downscaled api (C-s8), the output C matrix values
+			// need to be upscaled to s16 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				S8_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2)
 
-			// c[1,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * (ir + 1)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
+				// c[1,0-15]
+				S8_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2)
 
-			// c[2,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * (ir + 2)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_2p0 = _mm256_add_epi16(selector1, c_int16_2p0);
+				// c[2,0-15]
+				S8_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2)
 
-			// c[3,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * (ir + 3)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_3p0 = _mm256_add_epi16(selector1, c_int16_3p0);
+				// c[3,0-15]
+				S8_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2)
 
-			// c[4,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * (ir + 4)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_4p0 = _mm256_add_epi16(selector1, c_int16_4p0);
+				// c[4,0-15]
+				S8_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2)
 
-			// c[5,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)(c + (rs_c * (ir + 5)) + (0 * 16)));
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_5p0 = _mm256_add_epi16(selector1, c_int16_5p0);
+				// c[5,0-15]
+				S8_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				S16_S16_BETA_OP(c_int16_0p0,ir,0,0,selector1,selector2)
+
+				// c[1,0-15]
+				S16_S16_BETA_OP(c_int16_1p0,ir,1,0,selector1,selector2)
+
+				// c[2,0-15]
+				S16_S16_BETA_OP(c_int16_2p0,ir,2,0,selector1,selector2)
+
+				// c[3,0-15]
+				S16_S16_BETA_OP(c_int16_3p0,ir,3,0,selector1,selector2)
+
+				// c[4,0-15]
+				S16_S16_BETA_OP(c_int16_4p0,ir,4,0,selector1,selector2)
+
+				// c[5,0-15]
+				S16_S16_BETA_OP(c_int16_5p0,ir,5,0,selector1,selector2)
+			}
 		}
 
 		// Post Ops
@@ -276,7 +297,7 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 		{
 			selector1 =
 				_mm256_loadu_si256( (__m256i const *)((int16_t *)post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 )) );
+								post_ops_attr.post_op_c_j + ( 0 * 16 )) );
 			
 			// c[0,0-15]
 			c_int16_0p0 = _mm256_add_epi16( selector1, c_int16_0p0 );
@@ -347,6 +368,81 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
+POST_OPS_GELU_TANH_6x16:
+		{
+			__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+			__m256i q;
+
+			// c[0,0-15]
+			GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,0-15]
+			GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,0-15]
+			GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,0-15]
+			GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,0-15]
+			GELU_TANH_S16_AVX2(c_int16_4p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,0-15]
+			GELU_TANH_S16_AVX2(c_int16_5p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x16:
+		{
+			__m256 x, r, y1, y2, x_erf;
+
+			// c[0,0-15]
+			GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+			// c[1,0-15]
+			GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+			// c[2,0-15]
+			GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+			// c[3,0-15]
+			GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+			// c[4,0-15]
+			GELU_ERF_S16_AVX2(c_int16_4p0, y1, y2, r, x, x_erf)
+
+			// c[5,0-15]
+			GELU_ERF_S16_AVX2(c_int16_5p0, y1, y2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x16:
+		{
+			__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+			__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+			// c[0,0-15]
+			CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+			// c[1,0-15]
+			CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+			// c[2,0-15]
+			CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+			// c[3,0-15]
+			CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+			// c[4,0-15]
+			CLIP_S16_AVX2(c_int16_4p0, min, max)
+
+			// c[5,0-15]
+			CLIP_S16_AVX2(c_int16_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
 POST_OPS_DOWNSCALE_6x16:
 		{
 			__m128i temp[2];
@@ -354,50 +450,73 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 			__m256 temp_float[2];
 			__m256 scale_1, scale_2;
 			__m256 res_1, res_2;
-			__m256i store_reg;
 
 			/* Load the scale vector values into the register*/
 			scale_1 =
 				_mm256_loadu_ps(
 				(float *)post_ops_list_temp->scale_factor +
-				post_op_c_j + (0 * 8));
+				post_ops_attr.post_op_c_j + (0 * 8));
 			scale_2 =
 				_mm256_loadu_ps(
 				(float *)post_ops_list_temp->scale_factor +
-				post_op_c_j + (1 * 8));
-
-			BLI_MM256_S16_DOWNSCALE2(c_int16_0p0, c_int16_1p0, 0, 1);
-
-			BLI_MM256_S16_DOWNSCALE2(c_int16_2p0, c_int16_3p0, 2, 3);
+				post_ops_attr.post_op_c_j + (1 * 8));
 
-			BLI_MM256_S16_DOWNSCALE2(c_int16_4p0, c_int16_5p0, 4, 5);
+			// Scale first 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
 POST_OPS_6x16_DISABLE:
 		;
 
-		// Store the results.
-		// c[0,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c *  ( ir + 0 ) ) + ( 0 * 16 ) ), c_int16_0p0 );
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Store the results in downscaled type (int8 instead of int16).
+			__m128i temp[2];
 
-		// c[1,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 0 * 16 ) ), c_int16_1p0 );
+			// c[0-1,0-15]
+			CVT_STORE_S16_S8_2ROW(c_int16_0p0, c_int16_1p0, 0, 1, 0);
 
-		// c[2,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 0 * 16 ) ), c_int16_2p0 );
+			// c[2-3,0-15]
+			CVT_STORE_S16_S8_2ROW(c_int16_2p0, c_int16_3p0, 2, 3, 0);
 
-		// c[3,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 0 * 16 ) ), c_int16_3p0 );
+			// c[4-5,0-15]
+			CVT_STORE_S16_S8_2ROW(c_int16_4p0, c_int16_5p0, 4, 5, 0);
+		}
+		// Case where the output C matrix is s16 or is the temp buffer used to
+		// store intermediate s16 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c *  ( ir + 0 ) ) + ( 0 * 16 ) ), c_int16_0p0 );
 
-		// c[4,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 0 * 16 ) ), c_int16_4p0 );
+			// c[1,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 1 ) ) + ( 0 * 16 ) ), c_int16_1p0 );
+
+			// c[2,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 2 ) ) + ( 0 * 16 ) ), c_int16_2p0 );
+
+			// c[3,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 3 ) ) + ( 0 * 16 ) ), c_int16_3p0 );
+
+			// c[4,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 4 ) ) + ( 0 * 16 ) ), c_int16_4p0 );
+
+			// c[5,0-15]
+			_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 0 * 16 ) ), c_int16_5p0 );
+		}
 
-		// c[5,0-15]
-		_mm256_storeu_si256( (__m256i *)(c + ( rs_c * ( ir + 5 ) ) + ( 0 * 16 ) ), c_int16_5p0 );
-		
 		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
+		post_ops_attr.post_op_c_i += MR;
 	}
 
 	if (m_partial_pieces > 0)
@@ -416,14 +535,12 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			// a pointer increment
 			a = a + (4 * ps_a);
 			m_full_pieces_loop_limit += 4;
-			post_op_c_i += 4;
+			post_ops_attr.post_op_c_i += 4;
 		}
 
 		if (m_partial2 == 1)
@@ -434,14 +551,12 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			// a pointer increment
 			a = a + (2 * ps_a);
 			m_full_pieces_loop_limit += 2;
-			post_op_c_i += 2;
+			post_ops_attr.post_op_c_i += 2;
 		}
 
 		if (m_partial == 1)
@@ -452,10 +567,8 @@ LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6x16)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
-			post_op_c_i += 1;
+				post_ops_list, post_ops_attr);
+			post_ops_attr.post_op_c_i += 1;
 		}
 	}
 }
@@ -471,6 +584,9 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 			&&POST_OPS_BIAS_6xlt16,
 			&&POST_OPS_RELU_6xlt16,
 			&&POST_OPS_RELU_SCALE_6xlt16,
+			&&POST_OPS_GELU_TANH_6xlt16,
+			&&POST_OPS_GELU_ERF_6xlt16,
+			&&POST_OPS_CLIP_6xlt16,
 			&&POST_OPS_DOWNSCALE_6xlt16
 		};
 
@@ -649,58 +765,85 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 		__m256i selector1 = _mm256_set1_epi16(alpha);
 		__m256i selector2 = _mm256_set1_epi16(beta);
 
-		// Scale by alpha
-		c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int16_0p0 = _mm256_mullo_epi16(selector1, c_int16_0p0);
 
-		c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
+			c_int16_1p0 = _mm256_mullo_epi16(selector1, c_int16_1p0);
 
-		c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
+			c_int16_2p0 = _mm256_mullo_epi16(selector1, c_int16_2p0);
 
-		c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
+			c_int16_3p0 = _mm256_mullo_epi16(selector1, c_int16_3p0);
 
-		c_int16_4p0 = _mm256_mullo_epi16(selector1, c_int16_4p0);
+			c_int16_4p0 = _mm256_mullo_epi16(selector1, c_int16_4p0);
 
-		c_int16_5p0 = _mm256_mullo_epi16(selector1, c_int16_5p0);
+			c_int16_5p0 = _mm256_mullo_epi16(selector1, c_int16_5p0);
+		}
 
 		// Scale C by beta.
 		if (beta != 0)
 		{
-			memcpy(buf0, (c + (rs_c * (ir + 0))), (n0_rem * sizeof(int16_t)));
-			memcpy(buf1, (c + (rs_c * (ir + 1))), (n0_rem * sizeof(int16_t)));
-			memcpy(buf2, (c + (rs_c * (ir + 2))), (n0_rem * sizeof(int16_t)));
-			memcpy(buf3, (c + (rs_c * (ir + 3))), (n0_rem * sizeof(int16_t)));
-			memcpy(buf4, (c + (rs_c * (ir + 4))), (n0_rem * sizeof(int16_t)));
-			memcpy(buf5, (c + (rs_c * (ir + 5))), (n0_rem * sizeof(int16_t)));
-
-			// c[0,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)buf0);
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_0p0 = _mm256_add_epi16(selector1, c_int16_0p0);
-
-			// c[1,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)buf1);
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_1p0 = _mm256_add_epi16(selector1, c_int16_1p0);
-
-			// c[2,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)buf2);
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_2p0 = _mm256_add_epi16(selector1, c_int16_2p0);
-
-			// c[3,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)buf3);
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_3p0 = _mm256_add_epi16(selector1, c_int16_3p0);
-
-			// c[4,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)buf4);
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_4p0 = _mm256_add_epi16(selector1, c_int16_4p0);
-
-			// c[5,0-15]
-			selector1 = _mm256_loadu_si256((__m256i const *)buf5);
-			selector1 = _mm256_mullo_epi16(selector2, selector1);
-			c_int16_5p0 = _mm256_add_epi16(selector1, c_int16_5p0);
+			// For the downscaled api (C-s8), the output C matrix values
+			// need to be upscaled to s16 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
+
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes);
+				S8_S16_BETA_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes);
+
+				// c[0,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+				// c[1,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+				// c[2,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+				// c[3,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+
+				// c[4,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2)
+
+				// c[5,0-15]
+				S8_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2)
+			}
+			else
+			{
+				dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
+				memcpy( buf0, ( c + ( rs_c * ( ir + 0 ) ) ), n0_rem_bytes );
+				memcpy( buf1, ( c + ( rs_c * ( ir + 1 ) ) ), n0_rem_bytes );
+				memcpy( buf2, ( c + ( rs_c * ( ir + 2 ) ) ), n0_rem_bytes );
+				memcpy( buf3, ( c + ( rs_c * ( ir + 3 ) ) ), n0_rem_bytes );
+				memcpy( buf4, ( c + ( rs_c * ( ir + 4 ) ) ), n0_rem_bytes );
+				memcpy( buf5, ( c + ( rs_c * ( ir + 5 ) ) ), n0_rem_bytes );
+
+				// c[0,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_0p0,buf0,selector1,selector2)
+
+				// c[1,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_1p0,buf1,selector1,selector2)
+
+				// c[2,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_2p0,buf2,selector1,selector2)
+
+				// c[3,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_3p0,buf3,selector1,selector2)
+
+				// c[4,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_4p0,buf4,selector1,selector2)
+
+				// c[5,0-15]
+				S16_S16_BETA_OP_NLT16(c_int16_5p0,buf5,selector1,selector2)
+			}
 		}
 
 		// Post Ops
@@ -709,7 +852,7 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 POST_OPS_BIAS_6xlt16:
 		{
 			memcpy( buf0, ( ( int16_t* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) ),
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) ),
 							( n0_rem * sizeof( int16_t ) ) );
 
 			selector1 =
@@ -784,74 +927,182 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
-		POST_OPS_DOWNSCALE_6xlt16:
+POST_OPS_GELU_TANH_6xlt16:
+		{
+			__m256 dn, z, x, r2, r, y1, y2, x_tanh;
+			__m256i q;
+
+			// c[0,0-15]
+			GELU_TANH_S16_AVX2(c_int16_0p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1,0-15]
+			GELU_TANH_S16_AVX2(c_int16_1p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2,0-15]
+			GELU_TANH_S16_AVX2(c_int16_2p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3,0-15]
+			GELU_TANH_S16_AVX2(c_int16_3p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4,0-15]
+			GELU_TANH_S16_AVX2(c_int16_4p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5,0-15]
+			GELU_TANH_S16_AVX2(c_int16_5p0, y1, y2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6xlt16:
+		{
+			__m256 x, r, y1, y2, x_erf;
+
+			// c[0,0-15]
+			GELU_ERF_S16_AVX2(c_int16_0p0, y1, y2, r, x, x_erf)
+
+			// c[1,0-15]
+			GELU_ERF_S16_AVX2(c_int16_1p0, y1, y2, r, x, x_erf)
+
+			// c[2,0-15]
+			GELU_ERF_S16_AVX2(c_int16_2p0, y1, y2, r, x, x_erf)
+
+			// c[3,0-15]
+			GELU_ERF_S16_AVX2(c_int16_3p0, y1, y2, r, x, x_erf)
+
+			// c[4,0-15]
+			GELU_ERF_S16_AVX2(c_int16_4p0, y1, y2, r, x, x_erf)
+
+			// c[5,0-15]
+			GELU_ERF_S16_AVX2(c_int16_5p0, y1, y2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6xlt16:
+		{
+			__m256i min = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args2 );
+			__m256i max = _mm256_set1_epi16( *( int16_t* )post_ops_list_temp->op_args3 );
+
+			// c[0,0-15]
+			CLIP_S16_AVX2(c_int16_0p0, min, max)
+
+			// c[1,0-15]
+			CLIP_S16_AVX2(c_int16_1p0, min, max)
+
+			// c[2,0-15]
+			CLIP_S16_AVX2(c_int16_2p0, min, max)
+
+			// c[3,0-15]
+			CLIP_S16_AVX2(c_int16_3p0, min, max)
+
+			// c[4,0-15]
+			CLIP_S16_AVX2(c_int16_4p0, min, max)
+
+			// c[5,0-15]
+			CLIP_S16_AVX2(c_int16_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6xlt16:
 		{
 			__m128i temp[2];
 			__m256i temp_32[2];
 			__m256 temp_float[2];
 			__m256 scale_1, scale_2;
 			__m256 res_1, res_2;
-			__m256i store_reg;
 
 			float float_buf[16];
-			int8_t store_buf[16];
 
 			memcpy( float_buf, ( ( float* )post_ops_list_temp->scale_factor +
-					post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+					post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
 
 			// Load the scale vector values into the register
 			scale_1 = _mm256_loadu_ps(float_buf + (0 * 8));
 			scale_2 = _mm256_loadu_ps(float_buf + (1 * 8));
 
-			BLI_MM256_S16_DOWNSCALE2_LT16(c_int16_0p0, c_int16_1p0, 0, 1)
-
-			BLI_MM256_S16_DOWNSCALE2_LT16(c_int16_2p0, c_int16_3p0, 2, 3)
-
-			BLI_MM256_S16_DOWNSCALE2_LT16(c_int16_4p0, c_int16_5p0, 4, 5)
+			// Scale first 16 columns of the 6 rows.
+			CVT_MULRND_CVT16(c_int16_0p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_1p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_2p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_3p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_4p0, scale_1, scale_2)
+			CVT_MULRND_CVT16(c_int16_5p0, scale_1, scale_2)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
 POST_OPS_6xlt16_DISABLE:
 		;
 
-		// Store the results.
-		// c[0,0-15]
-		_mm256_storeu_si256((__m256i *)buf0, c_int16_0p0);
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Store the results in downscaled type (int8 instead of int16).
+			__m128i temp[2];
+
+			// c[0-1,0-15]
+			CVT_STORE_S16_S8_2ROW_NLT16(c_int16_0p0, c_int16_1p0, buf0, buf1);
 
-		// c[1,0-15]
-		_mm256_storeu_si256((__m256i *)buf1, c_int16_1p0);
+			// c[2-3,0-15]
+			CVT_STORE_S16_S8_2ROW_NLT16(c_int16_2p0, c_int16_3p0, buf2, buf3);
 
-		// c[2,0-15]
-		_mm256_storeu_si256((__m256i *)buf2, c_int16_2p0);
+			// c[4-5,0-15]
+			CVT_STORE_S16_S8_2ROW_NLT16(c_int16_4p0, c_int16_5p0, buf4, buf5);
 
-		// c[3,0-15]
-		_mm256_storeu_si256((__m256i *)buf3, c_int16_3p0);
+			dim_t n0_rem_dscale_bytes = n0_rem * sizeof( int8_t );
 
-		// c[4,0-15]
-		_mm256_storeu_si256((__m256i *)buf4, c_int16_4p0);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf0, 0, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf1, 1, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf2, 2, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf3, 3, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf4, 4, n0_rem_dscale_bytes);
+			CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf5, 5, n0_rem_dscale_bytes);
+		}
+		// Case where the output C matrix is s16 or is the temp buffer used to
+		// store intermediate s16 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf0, c_int16_0p0 );
 
-		// c[5,0-15]
-		_mm256_storeu_si256((__m256i *)buf5, c_int16_5p0);
+			// c[1,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf1, c_int16_1p0 );
 
-		memcpy(c + (rs_c * (ir + 0)) + (0 * 16), buf0, (n0_rem * sizeof(int16_t)));
+			// c[2,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf2, c_int16_2p0 );
 
-		// c[1,0-15]
-		memcpy(c + (rs_c * (ir + 1)) + (0 * 16), buf1, (n0_rem * sizeof(int16_t)));
+			// c[3,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf3, c_int16_3p0 );
 
-		// c[2,0-15]
-		memcpy(c + (rs_c * (ir + 2)) + (0 * 16), buf2, (n0_rem * sizeof(int16_t)));
+			// c[4,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf4, c_int16_4p0 );
+
+			// c[5,0-15]
+			_mm256_storeu_si256( ( __m256i* )buf5, c_int16_5p0 );
 
-		// c[3,0-15]
-		memcpy(c + (rs_c * (ir + 3)) + (0 * 16), buf3, (n0_rem * sizeof(int16_t)));
+			dim_t n0_rem_bytes = n0_rem * sizeof( int16_t );
 
-		// c[4,0-15]
-		memcpy(c + (rs_c * (ir + 4)) + (0 * 16), buf4, (n0_rem * sizeof(int16_t)));
+			memcpy( c + ( rs_c * ( ir + 0 ) ) + ( 0 * 16 ), buf0, n0_rem_bytes );
 
-		// c[5,0-15]
-		memcpy(c + (rs_c * (ir + 5)) + (0 * 16), buf5, (n0_rem * sizeof(int16_t)));
+			// c[1,0-15]
+			memcpy( c + ( rs_c * ( ir + 1 ) ) + ( 0 * 16 ), buf1, n0_rem_bytes );
+
+			// c[2,0-15]
+			memcpy( c + ( rs_c * ( ir + 2 ) ) + ( 0 * 16 ), buf2, n0_rem_bytes );
+
+			// c[3,0-15]
+			memcpy( c + ( rs_c * ( ir + 3 ) ) + ( 0 * 16 ), buf3, n0_rem_bytes );
+
+			// c[4,0-15]
+			memcpy( c + ( rs_c * ( ir + 4 ) ) + ( 0 * 16 ), buf4, n0_rem_bytes );
+
+			// c[5,0-15]
+			memcpy( c + ( rs_c * ( ir + 5 ) ) + ( 0 * 16 ), buf5, n0_rem_bytes );
+		}
 
 		a = a + (MR * ps_a);
-		post_op_c_i += MR;
+		post_ops_attr.post_op_c_i += MR;
 	}
 
 	if (m_partial_pieces > 0)
@@ -870,14 +1121,12 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta, n0_rem,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			// a pointer increment
 			a = a + (4 * ps_a);
 			m_full_pieces_loop_limit += 4;
-			post_op_c_i += 4;
+			post_ops_attr.post_op_c_i += 4;
 		}
 
 		if (m_partial2 == 1)
@@ -888,14 +1137,12 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta, n0_rem,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
+				post_ops_list, post_ops_attr);
 
 			// a pointer increment
 			a = a + (2 * ps_a);
 			m_full_pieces_loop_limit += 2;
-			post_op_c_i += 2;
+			post_ops_attr.post_op_c_i += 2;
 		}
 
 		if (m_partial == 1)
@@ -906,10 +1153,9 @@ LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int16_t,u8s8s16o16_6xlt16)
 				b, rs_b, cs_b,
 				(c + (rs_c * m_full_pieces_loop_limit)), rs_c,
 				alpha, beta, n0_rem,
-				is_last_k,
-				post_op_c_i, post_op_c_j,
-				post_ops_list, rs_c_downscale);
-			post_op_c_i += 1;
+				post_ops_list, post_ops_attr);
+			post_ops_attr.post_op_c_i += 1;
 		}
 	}
 }
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_amd256.c b/kernels/zen/lpgemm/u8s8s16/lpgemm_packb_amd256.c
similarity index 96%
rename from addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_amd256.c
rename to kernels/zen/lpgemm/u8s8s16/lpgemm_packb_amd256.c
index ac9cb469e3..ef629707f1 100644
--- a/addon/aocl_gemm/kernels/u8s8s16/lpgemm_packb_amd256.c
+++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_packb_amd256.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,20 +33,9 @@
 */
 
 #include <immintrin.h>
-
 #include "blis.h"
-#include "lpgemm_packb_s16.h"
-#include "lpgemm_config.h"
 
-void get_packb_nr32_u8s8s16o16_strides
-	(
-		dim_t *rs_b,
-		dim_t *cs_b
-	)
-{
-	*rs_b = lpgemm_get_block_size_NR_global_cntx( U8S8S16OS16 ) * 2;
-	*cs_b = lpgemm_get_block_size_NR_global_cntx( U8S8S16OS16 );
-}
+#ifdef BLIS_ADDON_LPGEMM
 
 void packb_nrlt16_u8s8s16o16
 	(
@@ -265,4 +254,5 @@ void packb_nr32_u8s8s16o16(
 
 	*rs_b = NR * 2;
 	*cs_b = NR;
-}
\ No newline at end of file
+}
+#endif
diff --git a/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h
new file mode 100644
index 0000000000..1ce68ed498
--- /dev/null
+++ b/kernels/zen/lpgemm/u8s8s16/lpgemm_s16_kern_macros.h
@@ -0,0 +1,285 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_S16_KERN_MACROS_H
+#define LPGEMM_S16_KERN_MACROS_H
+
+#include "../gelu_avx2.h"
+#include "../math_utils_avx2.h"
+
+#define S8_MIN  (-128)
+#define S8_MAX  (+127)
+
+/* ReLU scale (Parametric ReLU): f(x) = x when x > 0, and f(x) = a*x when x <= 0 (a is taken from selector2; selector1 and b0 are clobbered) */
+#define RELU_SCALE_OP_S16_AVX2(reg) \
+	selector1 = _mm256_setzero_si256();\
+	selector1 = _mm256_cmpgt_epi16 ( selector1, reg ); \
+ \
+	/* Only < 0 elements in b0 (selector1 is the reg < 0 lane mask). */ \
+	b0 = _mm256_and_si256 ( selector1, reg ); \
+\
+	/* Only >= 0 elements remain in reg. */ \
+	reg = _mm256_andnot_si256( selector1, reg ); \
+ \
+	/* Only scaling for < 0 elements. */ \
+	b0 = _mm256_mullo_epi16( b0, selector2 ); \
+ \
+	/* Combine the scaled < 0 and >= 0 elements. */ \
+	reg = _mm256_or_si256( b0, reg ); \
+
+// s16 fma macro: reg += scratch2 * scratch1 per 16-bit lane (low 16 bits of each product); scratch1 is overwritten
+#define S16_BETA_FMA(reg,scratch1,scratch2) \
+	scratch1 = _mm256_mullo_epi16( scratch2, scratch1 ); \
+	reg = _mm256_add_epi16( scratch1, reg ); \
+
+// Beta scale macro, scratch2=beta: loads 16 s16 values from row (m_ir + m_ind), 16-column block n_ind of c, and adds beta * C to reg
+#define S16_S16_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = \
+	_mm256_loadu_si256 \
+	( \
+	  ( __m256i const* )( c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ) \
+	); \
+	S16_BETA_FMA(reg,scratch1,scratch2) \
+
+// Beta n < 16 scale macro, scratch2=beta: C values come from the local buffer buf_ (a full 256-bit load) rather than directly from c
+#define S16_S16_BETA_OP_NLT16(reg,buf_,scratch1,scratch2) \
+	scratch1 = _mm256_loadu_si256( ( __m256i const* )buf_ ); \
+	S16_BETA_FMA(reg,scratch1,scratch2) \
+
+// Downscale beta scale macro, scratch2=beta: loads 16 s8 values from post_ops_attr.buf_downscale, sign-extends them to s16 and adds beta * C to reg
+#define S8_S16_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = \
+	_mm256_cvtepi8_epi16 \
+	( \
+	  _mm_loadu_si128 \
+	  ( \
+	    ( __m128i const* )( ( int8_t* )post_ops_attr.buf_downscale + \
+	    ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	    post_ops_attr.post_op_c_j + ( n_ind * 16 ) )\
+	  ) \
+	); \
+	S16_BETA_FMA(reg,scratch1,scratch2) \
+
+// Downscale beta n < 16 scale macro, scratch2=beta: reads 16 bytes of s8 values from the local buffer buf_ and sign-extends them to s16
+#define S8_S16_BETA_OP_NLT16(reg,buf_,scratch1,scratch2) \
+	scratch1 = _mm256_cvtepi8_epi16( _mm_loadu_si128( ( __m128i const* )buf_ ) ); \
+	S16_BETA_FMA(reg,scratch1,scratch2) \
+
+#define S8_S16_BETA_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) /* copies `bytes` bytes from row (post_op_c_i + m_ind), column post_op_c_j of buf_downscale into buf_ */ \
+	memcpy \
+	( \
+	  buf_, \
+	  ( ( int8_t* )post_ops_attr.buf_downscale + \
+		( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+		post_ops_attr.post_op_c_j ), bytes \
+	); \
+ 
+// Downscale macro: scales the 16 s16 values in reg by scale0 (low 8) / scale1 (high 8) in f32, rounds, clips to [S8_MIN, S8_MAX], repacks to s16. Uses caller-scope temp, temp_32, temp_float, res_1, res_2.
+#define CVT_MULRND_CVT16(reg, scale0, scale1) \
+ \
+	/* Extract the first 128 bits of the register*/ \
+	temp[0] = _mm256_extractf128_si256( reg, 0 ); \
+	/* Extract the second 128 bits of the register*/ \
+	temp[1] = _mm256_extractf128_si256( reg, 1 ); \
+ \
+	temp_32[0] = _mm256_cvtepi16_epi32( temp[0] ); \
+	temp_32[1] = _mm256_cvtepi16_epi32( temp[1] ); \
+	temp_float[0] = _mm256_cvtepi32_ps( temp_32[0] ); \
+	temp_float[1] = _mm256_cvtepi32_ps( temp_32[1] ); \
+ \
+	/* Multiply the C matrix by the scale value*/ \
+	res_1 = _mm256_mul_ps( temp_float[0], scale0 ); \
+	res_2 = _mm256_mul_ps( temp_float[1], scale1 ); \
+ \
+	/* Round the scaled values to the nearest integer and clip them to [-128, 127] */ \
+	res_1 = \
+	_mm256_min_ps \
+	( \
+	  _mm256_max_ps \
+	  ( \
+	    _mm256_round_ps \
+	    ( \
+	      res_1, ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \
+	    ), \
+	    _mm256_set1_ps( ( float )S8_MIN ) \
+	  ), \
+	  _mm256_set1_ps( ( float )S8_MAX ) \
+	);\
+	res_2 = \
+	_mm256_min_ps \
+	( \
+	  _mm256_max_ps \
+	  ( \
+	    _mm256_round_ps \
+	    ( \
+	      res_2, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) \
+	    ), \
+	    _mm256_set1_ps( ( float )S8_MIN ) \
+	  ), \
+	  _mm256_set1_ps( ( float )S8_MAX ) \
+	);\
+ \
+	/* Convert the clipped float32 scaled rounded value to int32 */ \
+	temp_32[0] = _mm256_cvtps_epi32( res_1 ); \
+	temp_32[1] = _mm256_cvtps_epi32( res_2 ); \
+ \
+	/* Convert the s32 to s16 */ \
+	reg = _mm256_packs_epi32( temp_32[0], temp_32[1] ); \
+ \
+	/* The pack works per 128-bit lane, so permute the 64-bit quarters (0xD8) to restore element order */ \
+	reg = _mm256_permute4x64_epi64( reg, 0XD8 ); \
+
+// Downscale store macro: packs reg0 and reg1 (s16) into 32 saturated s8 values, stored at row (post_op_c_i + m_ind) of buf_downscale. No trailing ';' here: the call site supplies it.
+#define CVT_STORE_S16_S8(reg0, reg1, m_ind, n_ind) \
+   /* Convert the s16 to s8 */ \
+	reg0 = _mm256_packs_epi16( reg0, reg1 ); \
+	reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \
+ \
+	_mm256_storeu_si256 \
+	( \
+	  ( __m256i* )( ( int8_t* )post_ops_attr.buf_downscale + \
+	  ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	  post_ops_attr.post_op_c_j + ( n_ind * 32 ) ), \
+	  reg0 \
+	) \
+
+// Downscale store macro for fringe cases: reg0 and reg1 each hold one 16-column row; their s8 results go to rows m_ind0 and m_ind1 of buf_downscale (uses caller-scope temp)
+#define CVT_STORE_S16_S8_2ROW(reg0, reg1, m_ind0, m_ind1, n_ind) \
+	/* Convert the s16 to s8 */ \
+	reg0 = _mm256_packs_epi16( reg0, reg1 ); \
+	reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \
+ \
+	/* Extract the first 128 bits of the register*/ \
+	temp[0] = _mm256_extractf128_si256( reg0, 0 ); \
+	/* Extract the second 128 bits of the register*/ \
+	temp[1] = _mm256_extractf128_si256( reg0, 1 ); \
+ \
+	_mm_storeu_si128 \
+	( \
+	  ( __m128i* )( ( int8_t* )post_ops_attr.buf_downscale + \
+	  ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind0 ) ) + \
+	  post_ops_attr.post_op_c_j + ( n_ind * 16 ) ), \
+	  temp[0] \
+	); \
+	_mm_storeu_si128 \
+	( \
+	  ( __m128i* )( ( int8_t* )post_ops_attr.buf_downscale + \
+	  ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind1 ) ) + \
+	  post_ops_attr.post_op_c_j + ( n_ind * 16 ) ), \
+	  temp[1] \
+	); \
+
+// Downscale store macro for fringe cases: only the low 128 bits (the 16 s8 values produced from reg0) are stored, to row m_ind0 of buf_downscale
+#define CVT_STORE_S16_S8_1ROW(reg0, reg1, m_ind0, n_ind) \
+	/* Convert the s16 to s8 */ \
+	reg0 = _mm256_packs_epi16( reg0, reg1 ); \
+	reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \
+ \
+	/* Extract the first 128 bits of the register*/ \
+	temp[0] = _mm256_extractf128_si256( reg0, 0 ); \
+ \
+	_mm_storeu_si128 \
+	( \
+	  ( __m128i* )( ( int8_t* )post_ops_attr.buf_downscale + \
+	  ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind0 ) ) + \
+	  post_ops_attr.post_op_c_j + ( n_ind * 16 ) ), \
+	  temp[0] \
+	); \
+
+// Downscale store macro for n < 16 fringe cases: the two rows' s8 results are written (16 bytes each) to the local buffers buf0 and buf1, not to buf_downscale
+#define CVT_STORE_S16_S8_2ROW_NLT16(reg0, reg1, buf0, buf1) \
+	/* Convert the s16 to s8 */ \
+	reg0 = _mm256_packs_epi16( reg0, reg1 ); \
+	reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \
+ \
+	/* Extract the first 128 bits of the register*/ \
+	temp[0] = _mm256_extractf128_si256( reg0, 0 ); \
+	/* Extract the second 128 bits of the register*/ \
+	temp[1] = _mm256_extractf128_si256( reg0, 1 ); \
+ \
+	_mm_storeu_si128( ( __m128i* )buf0, temp[0] ); \
+	_mm_storeu_si128( ( __m128i* )buf1, temp[1] ); \
+
+// Downscale store macro for n < 16 fringe cases: single-row variant, the 16 s8 values produced from reg0 are written to the local buffer buf0
+#define CVT_STORE_S16_S8_1ROW_NLT16(reg0, reg1, buf0) \
+	/* Convert the s16 to s8 */ \
+	reg0 = _mm256_packs_epi16( reg0, reg1 ); \
+	reg0 = _mm256_permute4x64_epi64( reg0, 0XD8 ); \
+ \
+	/* Extract the first 128 bits of the register*/ \
+	temp[0] = _mm256_extractf128_si256( reg0, 0 ); \
+ \
+	_mm_storeu_si128( ( __m128i* )buf0, temp[0] ); \
+
+#define CVT_STORE_S16_S8_NLT16_MEMCP_UTIL(buf_,m_ind,bytes) /* copies `bytes` bytes from buf_ to row (post_op_c_i + m_ind), column post_op_c_j of buf_downscale */ \
+	memcpy \
+	( \
+	  ( ( int8_t* )post_ops_attr.buf_downscale + \
+		( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+		post_ops_attr.post_op_c_j ), buf_, bytes \
+	); \
+
+//--------------------------------------------------------------------------
+/* GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ). The 16 s16 lanes of reg are widened to two f32 vectors (y1, y2), passed through GELU_TANH_F32_AVX2_DEF, then converted back and packed to s16 with saturation. */
+#define GELU_TANH_S16_AVX2(reg, y1, y2, r, r2, x, z, dn, x_tanh, q) \
+\
+	y1 = _mm256_cvtepi32_ps( _mm256_cvtepi16_epi32(_mm256_extractf128_si256(reg, 0)) ); \
+	y2 = _mm256_cvtepi32_ps( _mm256_cvtepi16_epi32(_mm256_extractf128_si256(reg, 1)) ); \
+\
+	GELU_TANH_F32_AVX2_DEF(y1, r, r2, x, z, dn, x_tanh, q); \
+\
+	GELU_TANH_F32_AVX2_DEF(y2, r, r2, x, z, dn, x_tanh, q); \
+\
+	reg = _mm256_packs_epi32(_mm256_cvtps_epi32(y1), _mm256_cvtps_epi32(y2));\
+	reg = _mm256_permute4x64_epi64(reg, 0XD8);\
+
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 )). The 16 s16 lanes of reg are widened to two f32 vectors (y1, y2), passed through GELU_ERF_F32_AVX2_DEF, then converted back and packed to s16 with saturation. */
+#define GELU_ERF_S16_AVX2(reg, y1, y2, r, x, x_erf) \
+\
+	y1 = _mm256_cvtepi32_ps( _mm256_cvtepi16_epi32(_mm256_extractf128_si256(reg, 0)) ); \
+	y2 = _mm256_cvtepi32_ps( _mm256_cvtepi16_epi32(_mm256_extractf128_si256(reg, 1)) ); \
+\
+	GELU_ERF_F32_AVX2_DEF(y1, r, x, x_erf); \
+\
+	GELU_ERF_F32_AVX2_DEF(y2, r, x, x_erf); \
+\
+	reg = _mm256_packs_epi32(_mm256_cvtps_epi32(y1), _mm256_cvtps_epi32(y2));\
+	reg = _mm256_permute4x64_epi64(reg, 0XD8);\
+
+#define CLIP_S16_AVX2(reg, min, max) \
+/* Clamp every signed 16-bit lane of reg to the range [min, max]. */\
+	reg = _mm256_min_epi16( _mm256_max_epi16( reg, min ), max ); \
+
+#endif //LPGEMM_S16_KERN_MACROS_H
diff --git a/kernels/zen4/1/CMakeLists.txt b/kernels/zen4/1/CMakeLists.txt
index 7bd499efb6..9bfb5d650e 100644
--- a/kernels/zen4/1/CMakeLists.txt
+++ b/kernels/zen4/1/CMakeLists.txt
@@ -1,6 +1,14 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(zen4_1
+     OBJECT
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_amaxv_zen_int_avx512.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_scalv_zen_int_avx512.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dotv_zen_int_avx512.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_axpyv_zen_int_avx512.c
     )
+# Build settings for the object library: MSVC-style /arch flags (private to these sources); BLIS_IS_BUILDING_LIBRARY is defined only for shared-library builds.
+target_compile_options(zen4_1 PRIVATE /arch:AVX2 /arch:AVX512)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen4_1 PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/zen4/1/bli_amaxv_zen_int_avx512.c b/kernels/zen4/1/bli_amaxv_zen_int_avx512.c
index 9e32f955a8..85c3f0d356 100644
--- a/kernels/zen4/1/bli_amaxv_zen_int_avx512.c
+++ b/kernels/zen4/1/bli_amaxv_zen_int_avx512.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -627,8 +627,8 @@ void bli_samaxv_zen_int_avx512(
 
     // Issue vzeroupper instruction to clear upper lanes of ymm registers.
     // This avoids a performance penalty caused by false dependencies when
-    // transitioning from from AVX to SSE instructions (which may occur
-    // later, especially if BLIS is compiled with -mfpmath=sse).
+    // transitioning from AVX to SSE instructions (which may occur later,
+    // especially if BLIS is compiled with -mfpmath=sse).
     _mm256_zeroupper();
 
     /* Store final index to output variable. */
@@ -959,8 +959,8 @@ void bli_damaxv_zen_int_avx512(
 
     // Issue vzeroupper instruction to clear upper lanes of ymm registers.
     // This avoids a performance penalty caused by false dependencies when
-    // transitioning from from AVX to SSE instructions (which may occur
-    // later, especially if BLIS is compiled with -mfpmath=sse).
+    // transitioning from AVX to SSE instructions (which may occur later,
+    // especially if BLIS is compiled with -mfpmath=sse).
     _mm256_zeroupper();
 
     // Return value
diff --git a/kernels/zen4/1/bli_axpyv_zen_int_avx512.c b/kernels/zen4/1/bli_axpyv_zen_int_avx512.c
new file mode 100644
index 0000000000..23b1f2f039
--- /dev/null
+++ b/kernels/zen4/1/bli_axpyv_zen_int_avx512.c
@@ -0,0 +1,447 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+// -----------------------------------------------------------------------------
+
+/*
+    Functionality
+    -------------
+
+    This function calculates y := y + alpha * x, where alpha, x and y are all of type float.
+
+    Function Signature
+    -------------------
+
+    This function takes three float pointers as input, along with the corresponding vectors'
+    strides and length. The result is written back through the 'y' pointer.
+
+    * 'conjx' - Info about conjugation of x (This variable is not used in the kernel)
+    * 'n' - Length of the array passed
+    * 'alpha' - Float pointer to a scalar value
+    * 'x' - Float pointer to an array
+    * 'incx' - Stride to point to the next element in the array
+    * 'y' - Float pointer to an array (updated in place)
+    * 'incy' - Stride to point to the next element in the array
+    * 'cntx' - BLIS context object (This variable is not used in the kernel)
+
+    Exception
+    ----------
+
+    None
+
+    Deviation from BLAS
+    --------------------
+
+    None
+
+    Undefined behaviour
+    -------------------
+
+    1. The kernel results in undefined behaviour when n <= 0, incx <= 0 and incy <= 0. These are
+       standard BLAS exceptions and are expected to be handled in a higher layer.
+*/
+void bli_saxpyv_zen_int_avx512
+     (
+       conj_t           conjx,
+       dim_t            n,
+       float*  restrict alpha,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       cntx_t* restrict cntx
+     )
+{
+    // Pairs with the AOCL_DTL_TRACE_EXIT issued at the end of the kernel.
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_4)
+
+    const int n_elem_per_reg = 16;
+    dim_t i = 0;
+
+    // Initialize local pointers.
+    float *restrict x0 = x;
+    float *restrict y0 = y;
+
+    if (incx == 1 && incy == 1)
+    {
+        __m512 xv[8], yv[8], alphav;
+
+        // Broadcast the alpha scalar to all elements of a vector register.
+        alphav = _mm512_set1_ps(*alpha);
+
+        for (i = 0; (i + 127) < n; i += 128)
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_ps(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_ps(x0 + 3 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_ps(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_ps(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm512_loadu_ps(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm512_loadu_ps(y0 + 3 * n_elem_per_reg);
+
+            // Perform y += alpha * x
+            yv[0] = _mm512_fmadd_ps(xv[0], alphav, yv[0]);
+            yv[1] = _mm512_fmadd_ps(xv[1], alphav, yv[1]);
+            yv[2] = _mm512_fmadd_ps(xv[2], alphav, yv[2]);
+            yv[3] = _mm512_fmadd_ps(xv[3], alphav, yv[3]);
+
+            // Store updated y
+            _mm512_storeu_ps((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm512_storeu_ps((y0 + 1 * n_elem_per_reg), yv[1]);
+            _mm512_storeu_ps((y0 + 2 * n_elem_per_reg), yv[2]);
+            _mm512_storeu_ps((y0 + 3 * n_elem_per_reg), yv[3]);
+
+            xv[4] = _mm512_loadu_ps(x0 + 4 * n_elem_per_reg);
+            xv[5] = _mm512_loadu_ps(x0 + 5 * n_elem_per_reg);
+            xv[6] = _mm512_loadu_ps(x0 + 6 * n_elem_per_reg);
+            xv[7] = _mm512_loadu_ps(x0 + 7 * n_elem_per_reg);
+
+            yv[4] = _mm512_loadu_ps(y0 + 4 * n_elem_per_reg);
+            yv[5] = _mm512_loadu_ps(y0 + 5 * n_elem_per_reg);
+            yv[6] = _mm512_loadu_ps(y0 + 6 * n_elem_per_reg);
+            yv[7] = _mm512_loadu_ps(y0 + 7 * n_elem_per_reg);
+
+            yv[4] = _mm512_fmadd_ps(xv[4], alphav, yv[4]);
+            yv[5] = _mm512_fmadd_ps(xv[5], alphav, yv[5]);
+            yv[6] = _mm512_fmadd_ps(xv[6], alphav, yv[6]);
+            yv[7] = _mm512_fmadd_ps(xv[7], alphav, yv[7]);
+
+            _mm512_storeu_ps((y0 + 7 * n_elem_per_reg), yv[7]);
+            _mm512_storeu_ps((y0 + 6 * n_elem_per_reg), yv[6]);
+            _mm512_storeu_ps((y0 + 5 * n_elem_per_reg), yv[5]);
+            _mm512_storeu_ps((y0 + 4 * n_elem_per_reg), yv[4]);
+
+            // Increment the pointer
+            x0 += 8 * n_elem_per_reg;
+            y0 += 8 * n_elem_per_reg;
+        }
+
+        for (; (i + 63) < n; i += 64)
+        {
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_ps(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_ps(x0 + 3 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_ps(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_ps(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm512_loadu_ps(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm512_loadu_ps(y0 + 3 * n_elem_per_reg);
+
+            yv[0] = _mm512_fmadd_ps(xv[0], alphav, yv[0]);
+            yv[1] = _mm512_fmadd_ps(xv[1], alphav, yv[1]);
+            yv[2] = _mm512_fmadd_ps(xv[2], alphav, yv[2]);
+            yv[3] = _mm512_fmadd_ps(xv[3], alphav, yv[3]);
+
+            _mm512_storeu_ps((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm512_storeu_ps((y0 + 1 * n_elem_per_reg), yv[1]);
+            _mm512_storeu_ps((y0 + 2 * n_elem_per_reg), yv[2]);
+            _mm512_storeu_ps((y0 + 3 * n_elem_per_reg), yv[3]);
+
+            x0 += 4 * n_elem_per_reg;
+            y0 += 4 * n_elem_per_reg;
+        }
+
+        for (; (i + 31) < n; i += 32)
+        {
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_ps(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_ps(y0 + 1 * n_elem_per_reg);
+
+            yv[0] = _mm512_fmadd_ps(xv[0], alphav, yv[0]);
+            yv[1] = _mm512_fmadd_ps(xv[1], alphav, yv[1]);
+
+            _mm512_storeu_ps((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm512_storeu_ps((y0 + 1 * n_elem_per_reg), yv[1]);
+
+            x0 += 2 * n_elem_per_reg;
+            y0 += 2 * n_elem_per_reg;
+        }
+
+        for (; (i + 15) < n; i += 16)
+        {
+            xv[0] = _mm512_loadu_ps(x0);
+
+            yv[0] = _mm512_loadu_ps(y0);
+
+            yv[0] = _mm512_fmadd_ps(xv[0], alphav, yv[0]);
+
+            _mm512_storeu_ps(y0, yv[0]);
+
+            x0 += n_elem_per_reg;
+            y0 += n_elem_per_reg;
+        }
+
+        // This loop uses AVX2 instructions (8 floats per 256-bit register)
+        for (; (i + 7) < n; i += 8)
+        {
+            __m256 x_vec = _mm256_loadu_ps(x0);
+
+            __m256 y_vec = _mm256_loadu_ps(y0);
+
+            y_vec = _mm256_fmadd_ps(x_vec, _mm256_set1_ps(*alpha), y_vec);
+
+            _mm256_storeu_ps(y0, y_vec);
+
+            x0 += 8;
+            y0 += 8;
+        }
+    }
+
+    /*
+        This loop has two functions:
+        1. Handles the remainder of n / 8 when incx and incy are 1.
+        2. Performs the complete compute when incx or incy != 1
+    */
+    for (; i < n; i += 1)
+    {
+        *y0 += (*alpha) * (*x0);
+
+        x0 += incx;
+        y0 += incy;
+    }
+
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4)
+}
+
+// -----------------------------------------------------------------------------
+
+/*
+    Functionality
+    -------------
+
+    This function calculates y := y + alpha * x where all three variables are of type
+    double.
+
+    Function Signature
+    -------------------
+
+    This function takes three double pointers as input, along with the corresponding
+    vectors' strides and length. It uses the function parameters to return the output.
+
+    * 'conjx' - Info about conjugation of x (This variable is not used in the kernel)
+    * 'n' - Length of the array passed
+    * 'alpha' - Double pointer to a scalar value
+    * 'x' - Double pointer to an array
+    * 'incx' - Stride to point to the next element in the array
+    * 'y' - Double pointer to an array
+    * 'incy' - Stride to point to the next element in the array
+    * 'cntx' - BLIS context object
+
+    Exception
+    ----------
+
+    None
+
+    Deviation from BLAS
+    --------------------
+
+    None
+
+    Undefined behaviour
+    -------------------
+
+    1. The kernel results in undefined behaviour when n <= 0, incx <= 0 and incy <= 0.
+       The expectation is that these are standard BLAS exceptions and should be handled in
+       a higher layer
+*/
+void bli_daxpyv_zen_int_avx512
+     (
+       conj_t           conjx, // not referenced in the body below
+       dim_t            n,
+       double*  restrict alpha,
+       double*  restrict x, inc_t incx,
+       double*  restrict y, inc_t incy, // y is updated in place
+       cntx_t* restrict cntx // not referenced in the body below
+     )
+{
+    const int n_elem_per_reg = 8; // doubles per 512-bit register
+
+    dim_t i = 0; // number of elements processed so far
+
+    // Initialize local pointers.
+    double *restrict x0 = x;
+    double *restrict y0 = y;
+
+    if (incx == 1 && incy == 1) // vectorized path: both vectors are contiguous
+    {
+        __m512d xv[8], yv[8], alphav;
+
+        // Broadcast the alpha scalar to all elements of a vector register.
+        alphav = _mm512_set1_pd(*alpha);
+
+        for (i = 0; (i + 63) < n; i += 64) // 8 registers = 64 doubles per iteration
+        {
+            // Load the first four registers of x and of y
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_pd(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_pd(x0 + 3 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_pd(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm512_loadu_pd(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm512_loadu_pd(y0 + 3 * n_elem_per_reg);
+
+            // Perform y += alpha * x as one fused multiply-add per register
+            yv[0] = _mm512_fmadd_pd(xv[0], alphav, yv[0]);
+            yv[1] = _mm512_fmadd_pd(xv[1], alphav, yv[1]);
+            yv[2] = _mm512_fmadd_pd(xv[2], alphav, yv[2]);
+            yv[3] = _mm512_fmadd_pd(xv[3], alphav, yv[3]);
+
+            // Store updated y (first four registers)
+            _mm512_storeu_pd((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm512_storeu_pd((y0 + 1 * n_elem_per_reg), yv[1]);
+            _mm512_storeu_pd((y0 + 2 * n_elem_per_reg), yv[2]);
+            _mm512_storeu_pd((y0 + 3 * n_elem_per_reg), yv[3]);
+
+            xv[4] = _mm512_loadu_pd(x0 + 4 * n_elem_per_reg); // second half: registers 4..7
+            xv[5] = _mm512_loadu_pd(x0 + 5 * n_elem_per_reg);
+            xv[6] = _mm512_loadu_pd(x0 + 6 * n_elem_per_reg);
+            xv[7] = _mm512_loadu_pd(x0 + 7 * n_elem_per_reg);
+
+            yv[4] = _mm512_loadu_pd(y0 + 4 * n_elem_per_reg);
+            yv[5] = _mm512_loadu_pd(y0 + 5 * n_elem_per_reg);
+            yv[6] = _mm512_loadu_pd(y0 + 6 * n_elem_per_reg);
+            yv[7] = _mm512_loadu_pd(y0 + 7 * n_elem_per_reg);
+
+            yv[4] = _mm512_fmadd_pd(xv[4], alphav, yv[4]);
+            yv[5] = _mm512_fmadd_pd(xv[5], alphav, yv[5]);
+            yv[6] = _mm512_fmadd_pd(xv[6], alphav, yv[6]);
+            yv[7] = _mm512_fmadd_pd(xv[7], alphav, yv[7]);
+
+            _mm512_storeu_pd((y0 + 7 * n_elem_per_reg), yv[7]); // written from highest offset down
+            _mm512_storeu_pd((y0 + 6 * n_elem_per_reg), yv[6]);
+            _mm512_storeu_pd((y0 + 5 * n_elem_per_reg), yv[5]);
+            _mm512_storeu_pd((y0 + 4 * n_elem_per_reg), yv[4]);
+
+            x0 += 8 * n_elem_per_reg; // advance both pointers past the 64 elements just handled
+            y0 += 8 * n_elem_per_reg;
+        }
+
+        for (; (i + 31) < n; i += 32) // 4 registers = 32 doubles per iteration
+        {
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_pd(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_pd(x0 + 3 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_pd(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm512_loadu_pd(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm512_loadu_pd(y0 + 3 * n_elem_per_reg);
+
+            yv[0] = _mm512_fmadd_pd(xv[0], alphav, yv[0]);
+            yv[1] = _mm512_fmadd_pd(xv[1], alphav, yv[1]);
+            yv[2] = _mm512_fmadd_pd(xv[2], alphav, yv[2]);
+            yv[3] = _mm512_fmadd_pd(xv[3], alphav, yv[3]);
+
+            _mm512_storeu_pd((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm512_storeu_pd((y0 + 1 * n_elem_per_reg), yv[1]);
+            _mm512_storeu_pd((y0 + 2 * n_elem_per_reg), yv[2]);
+            _mm512_storeu_pd((y0 + 3 * n_elem_per_reg), yv[3]);
+
+            x0 += 4 * n_elem_per_reg;
+            y0 += 4 * n_elem_per_reg;
+        }
+
+        for (; (i + 15) < n; i += 16) // 2 registers = 16 doubles per iteration
+        {
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_pd(y0 + 1 * n_elem_per_reg);
+
+            yv[0] = _mm512_fmadd_pd(xv[0], alphav, yv[0]);
+            yv[1] = _mm512_fmadd_pd(xv[1], alphav, yv[1]);
+
+            _mm512_storeu_pd((y0 + 0 * n_elem_per_reg), yv[0]);
+            _mm512_storeu_pd((y0 + 1 * n_elem_per_reg), yv[1]);
+
+            x0 += 2 * n_elem_per_reg;
+            y0 += 2 * n_elem_per_reg;
+        }
+
+        for (; (i + 7) < n; i += 8) // 1 register = 8 doubles per iteration
+        {
+            xv[0] = _mm512_loadu_pd(x0);
+
+            yv[0] = _mm512_loadu_pd(y0);
+
+            yv[0] = _mm512_fmadd_pd(xv[0], alphav, yv[0]);
+
+            _mm512_storeu_pd(y0, yv[0]);
+
+            x0 += n_elem_per_reg;
+            y0 += n_elem_per_reg;
+        }
+
+        // This loop uses 256-bit (AVX2/FMA) intrinsics to process 4 doubles at a time
+        for (; (i + 3) < n; i += 4)
+        {
+            __m256d x_vec = _mm256_loadu_pd(x0);
+
+            __m256d y_vec = _mm256_loadu_pd(y0);
+
+            y_vec = _mm256_fmadd_pd(x_vec, _mm256_set1_pd(*alpha), y_vec); // alpha is re-broadcast at 256-bit width
+
+            _mm256_storeu_pd(y0, y_vec);
+
+            x0 += 4;
+            y0 += 4;
+        }
+    }
+
+    /*
+        This loop has two functions:
+        1. Handles the trailing elements (n modulo 4) when incx and incy are 1.
+        2. Performs the complete compute when incx or incy != 1
+    */
+    for (; i < n; i += 1)
+    {
+        *y0 += (*alpha) * (*x0);
+
+        x0 += incx;
+        y0 += incy;
+    }
+
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_4) // NOTE(review): no matching AOCL_DTL_TRACE_ENTRY is visible in this function - confirm
+}
diff --git a/kernels/zen4/1/bli_dotv_zen_int_avx512.c b/kernels/zen4/1/bli_dotv_zen_int_avx512.c
new file mode 100644
index 0000000000..681e4bda5b
--- /dev/null
+++ b/kernels/zen4/1/bli_dotv_zen_int_avx512.c
@@ -0,0 +1,414 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2016 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "immintrin.h"
+#include "blis.h"
+
+/*
+    Functionality
+    -------------
+
+    This function calculates the dot product of two vectors for
+    type float.
+
+    rho := conjx(x)^T * conjy(y)
+
+    Function Signature
+    -------------------
+
+    * 'conjx' - Variable specified if x needs to be conjugated
+    * 'conjy' - Variable specified if y needs to be conjugated
+    * 'n' - Length of the array passed
+    * 'x' - Float pointer pointing to an array
+    * 'y' - Float pointer pointing to an array
+    * 'incx' - Stride to point to the next element in x array
+    * 'incy' - Stride to point to the next element in y array
+    * 'cntx' - BLIS context object
+
+    Exception
+    ----------
+
+    None
+
+    Deviation from BLAS
+    --------------------
+
+    None
+
+    Undefined behaviour
+    -------------------
+
+    1. The kernel results in undefined behaviour when n <= 0, incx <= 0 and incy <= 0.
+       The expectation is that these are standard BLAS exceptions and should be handled in
+       a higher layer
+*/
+void bli_sdotv_zen_int_avx512
+     (
+       conj_t           conjx, // not referenced in the body below
+       conj_t           conjy, // not referenced in the body below
+       dim_t            n,
+       float*  restrict x, inc_t incx,
+       float*  restrict y, inc_t incy,
+       float*  restrict rho, // output: receives the computed dot product
+       cntx_t* restrict cntx // not referenced in the body below
+     )
+{
+    dim_t i = 0; // number of elements processed so far
+
+    // Initialize local pointers.
+    float *restrict x0 = x;
+    float *restrict y0 = y;
+
+    float rho0 = 0.0f; // scalar accumulator for the final result
+
+    if (incx == 1 && incy == 1) // vectorized path: both vectors are contiguous
+    {
+        const dim_t n_elem_per_reg = 16; // floats per 512-bit register
+
+        __m512 xv[5];
+        __m512 yv[5];
+        __m512 rhov[5]; // five independent partial-sum registers
+
+        rhov[0] = _mm512_setzero_ps();
+        rhov[1] = _mm512_setzero_ps();
+        rhov[2] = _mm512_setzero_ps();
+        rhov[3] = _mm512_setzero_ps();
+        rhov[4] = _mm512_setzero_ps();
+
+        for (i = 0; (i + 79) < n; i += 80) // 5 registers = 80 floats per iteration
+        {
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_ps(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_ps(x0 + 3 * n_elem_per_reg);
+            xv[4] = _mm512_loadu_ps(x0 + 4 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_ps(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_ps(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm512_loadu_ps(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm512_loadu_ps(y0 + 3 * n_elem_per_reg);
+            yv[4] = _mm512_loadu_ps(y0 + 4 * n_elem_per_reg);
+
+            rhov[0] = _mm512_fmadd_ps(xv[0], yv[0], rhov[0]); // rhov[k] += xv[k] * yv[k]
+            rhov[1] = _mm512_fmadd_ps(xv[1], yv[1], rhov[1]);
+            rhov[2] = _mm512_fmadd_ps(xv[2], yv[2], rhov[2]);
+            rhov[3] = _mm512_fmadd_ps(xv[3], yv[3], rhov[3]);
+            rhov[4] = _mm512_fmadd_ps(xv[4], yv[4], rhov[4]);
+
+            x0 += 5 * n_elem_per_reg;
+            y0 += 5 * n_elem_per_reg;
+        }
+
+        for (; (i + 31) < n; i += 32) // 2 registers = 32 floats per iteration
+        {
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_ps(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_ps(y0 + 1 * n_elem_per_reg);
+
+            rhov[0] = _mm512_fmadd_ps(xv[0], yv[0], rhov[0]);
+            rhov[1] = _mm512_fmadd_ps(xv[1], yv[1], rhov[1]);
+
+            x0 += 2 * n_elem_per_reg;
+            y0 += 2 * n_elem_per_reg;
+        }
+
+        for (; (i + 15) < n; i += 16) // 1 register = 16 floats per iteration
+        {
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_ps(y0 + 0 * n_elem_per_reg);
+
+            rhov[0] = _mm512_fmadd_ps(xv[0], yv[0], rhov[0]);
+
+            x0 += n_elem_per_reg;
+            y0 += n_elem_per_reg;
+        }
+
+        __m256 temp[2]; // temp[0]: 256-bit accumulator, temp[1]: scratch used by the reduction
+        temp[0] = _mm256_setzero_ps();
+
+        for (; (i + 7) < n; i += 8) // 8 floats per iteration using 256-bit registers
+        {
+            __m256 x_vec = _mm256_loadu_ps(x0 + 0 * n_elem_per_reg); // offset term is zero
+
+            __m256 y_vec = _mm256_loadu_ps(y0 + 0 * n_elem_per_reg);
+
+            temp[0] = _mm256_fmadd_ps(x_vec, y_vec, temp[0]);
+
+            x0 += 8;
+            y0 += 8;
+        }
+
+        __m128 temp_128[2]; // temp_128[0]: 128-bit accumulator, temp_128[1]: scratch used by the reduction
+        temp_128[0] = _mm_setzero_ps();
+
+        for (; (i + 3) < n; i += 4) // 4 floats per iteration using 128-bit registers
+        {
+            __m128 x_vec = _mm_loadu_ps(x0 + 0 * n_elem_per_reg); // offset term is zero
+
+            __m128 y_vec = _mm_loadu_ps(y0 + 0 * n_elem_per_reg);
+
+            temp_128[0] = _mm_fmadd_ps(x_vec, y_vec, temp_128[0]);
+
+            x0 += 4;
+            y0 += 4;
+        }
+
+        // Fold the five 512-bit partial sums into rhov[0].
+        rhov[0] = _mm512_add_ps(rhov[0], rhov[2]);
+        rhov[1] = _mm512_add_ps(rhov[1], rhov[3]);
+
+        rhov[0] = _mm512_add_ps(rhov[0], rhov[1]);
+        rhov[0] = _mm512_add_ps(rhov[0], rhov[4]);
+
+        temp[1] = _mm512_extractf32x8_ps(rhov[0], 0); // lower 256 bits of rhov[0]
+        temp[0] = _mm256_add_ps(temp[0], temp[1]);
+
+        temp[1] = _mm512_extractf32x8_ps(rhov[0], 1); // upper 256 bits of rhov[0]
+        temp[0] = _mm256_add_ps(temp[0], temp[1]);
+
+        temp_128[1] = _mm256_extractf32x4_ps(temp[0], 0); // lower 128 bits of temp[0]
+        temp_128[0] = _mm_add_ps(temp_128[0], temp_128[1]);
+        temp_128[1] = _mm256_extractf32x4_ps(temp[0], 1); // upper 128 bits of temp[0]
+        temp_128[0] = _mm_add_ps(temp_128[0], temp_128[1]);
+
+        rho0 = temp_128[0][0] + temp_128[0][1] + temp_128[0][2] + temp_128[0][3]; // sum of the four lanes; NOTE(review): subscripting __m128 presumably relies on a compiler vector extension - confirm
+    }
+
+    for (; i < n; ++i) // leftover elements of the unit-stride path, or every element otherwise
+    {
+        const float x0c = *x0;
+        const float y0c = *y0;
+
+        rho0 += x0c * y0c;
+
+        x0 += incx;
+        y0 += incy;
+    }
+
+    // Copy the final result into the output variable.
+    PASTEMAC(s, copys)(rho0, *rho);
+}
+
+// -----------------------------------------------------------------------------
+
+/*
+    Functionality
+    -------------
+
+    This function calculates the dot product of two vectors for
+    type double.
+
+    rho := conjx(x)^T * conjy(y)
+
+    Function Signature
+    -------------------
+
+    * 'conjx' - Variable specified if x needs to be conjugated
+    * 'conjy' - Variable specified if y needs to be conjugated
+    * 'n' - Length of the array passed
+    * 'x' - Double pointer pointing to an array
+    * 'y' - Double pointer pointing to an array
+    * 'incx' - Stride to point to the next element in x array
+    * 'incy' - Stride to point to the next element in y array
+    * 'cntx' - BLIS context object
+
+    Exception
+    ----------
+
+    None
+
+    Deviation from BLAS
+    --------------------
+
+    None
+
+    Undefined behaviour
+    -------------------
+
+    1. The kernel results in undefined behaviour when n <= 0, incx <= 0 and incy <= 0.
+       The expectation is that these are standard BLAS exceptions and should be handled in
+       a higher layer
+*/
+void bli_ddotv_zen_int_avx512
+     (
+       conj_t           conjx, // not referenced in the body below
+       conj_t           conjy, // not referenced in the body below
+       dim_t            n,
+       double* restrict x, inc_t incx,
+       double* restrict y, inc_t incy,
+       double* restrict rho, // output: receives the computed dot product
+       cntx_t* restrict cntx // not referenced in the body below
+     )
+{
+    dim_t i = 0; // number of elements processed so far
+
+    // Initialize local pointers.
+    double *restrict x0 = x;
+    double *restrict y0 = y;
+
+    double rho0 = 0.0; // scalar accumulator for the final result
+
+    if (incx == 1 && incy == 1) // vectorized path: both vectors are contiguous
+    {
+        const dim_t n_elem_per_reg = 8; // doubles per 512-bit register
+
+        __m512d xv[5];
+        __m512d yv[5];
+        __m512d rhov[5]; // five independent partial-sum registers
+
+        rhov[0] = _mm512_setzero_pd();
+        rhov[1] = _mm512_setzero_pd();
+        rhov[2] = _mm512_setzero_pd();
+        rhov[3] = _mm512_setzero_pd();
+        rhov[4] = _mm512_setzero_pd();
+
+        for (i = 0; (i + 39) < n; i += 40) // 5 registers = 40 doubles per iteration
+        {
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_pd(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_pd(x0 + 3 * n_elem_per_reg);
+            xv[4] = _mm512_loadu_pd(x0 + 4 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_pd(y0 + 1 * n_elem_per_reg);
+            yv[2] = _mm512_loadu_pd(y0 + 2 * n_elem_per_reg);
+            yv[3] = _mm512_loadu_pd(y0 + 3 * n_elem_per_reg);
+            yv[4] = _mm512_loadu_pd(y0 + 4 * n_elem_per_reg);
+
+            rhov[0] = _mm512_fmadd_pd(xv[0], yv[0], rhov[0]); // rhov[k] += xv[k] * yv[k]
+            rhov[1] = _mm512_fmadd_pd(xv[1], yv[1], rhov[1]);
+            rhov[2] = _mm512_fmadd_pd(xv[2], yv[2], rhov[2]);
+            rhov[3] = _mm512_fmadd_pd(xv[3], yv[3], rhov[3]);
+            rhov[4] = _mm512_fmadd_pd(xv[4], yv[4], rhov[4]);
+
+            x0 += 5 * n_elem_per_reg;
+            y0 += 5 * n_elem_per_reg;
+        }
+
+        for (; (i + 15) < n; i += 16) // 2 registers = 16 doubles per iteration
+        {
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+
+            yv[0] = _mm512_loadu_pd(y0 + 0 * n_elem_per_reg);
+            yv[1] = _mm512_loadu_pd(y0 + 1 * n_elem_per_reg);
+
+            rhov[0] = _mm512_fmadd_pd(xv[0], yv[0], rhov[0]);
+            rhov[1] = _mm512_fmadd_pd(xv[1], yv[1], rhov[1]);
+
+            x0 += 2 * n_elem_per_reg;
+            y0 += 2 * n_elem_per_reg;
+        }
+
+        for (; (i + 7) < n; i += 8) // 1 register = 8 doubles per iteration
+        {
+            xv[0] = _mm512_loadu_pd(x0);
+
+            yv[0] = _mm512_loadu_pd(y0);
+
+            rhov[0] = _mm512_fmadd_pd(xv[0], yv[0], rhov[0]);
+
+            x0 += n_elem_per_reg;
+            y0 += n_elem_per_reg;
+        }
+
+        __m256d temp[2]; // temp[0]: 256-bit accumulator, temp[1]: scratch used by the reduction
+        temp[0] = _mm256_setzero_pd();
+
+        for (; (i + 3) < n; i += 4) // 4 doubles per iteration using 256-bit registers
+        {
+            __m256d x_vec = _mm256_loadu_pd(x0);
+
+            __m256d y_vec = _mm256_loadu_pd(y0);
+
+            temp[0] = _mm256_fmadd_pd(x_vec, y_vec, temp[0]);
+
+            x0 += 4;
+            y0 += 4;
+        }
+
+        __m128d temp_128[2]; // temp_128[0]: 128-bit accumulator, temp_128[1]: scratch used by the reduction
+        temp_128[0] = _mm_setzero_pd();
+
+        for (; (i + 1) < n; i += 2) // 2 doubles per iteration using 128-bit registers
+        {
+            __m128d x_vec = _mm_loadu_pd(x0 + 0 * n_elem_per_reg); // offset term is zero
+
+            __m128d y_vec = _mm_loadu_pd(y0 + 0 * n_elem_per_reg);
+
+            temp_128[0] = _mm_fmadd_pd(x_vec, y_vec, temp_128[0]);
+
+            x0 += 2;
+            y0 += 2;
+        }
+
+        // Fold the five 512-bit partial sums into rhov[0].
+        rhov[0] = _mm512_add_pd(rhov[0], rhov[2]);
+        rhov[1] = _mm512_add_pd(rhov[1], rhov[3]);
+
+        rhov[0] = _mm512_add_pd(rhov[0], rhov[1]);
+        rhov[0] = _mm512_add_pd(rhov[0], rhov[4]);
+
+        temp[1] = _mm512_extractf64x4_pd(rhov[0], 0); // lower 256 bits of rhov[0]
+        temp[0] = _mm256_add_pd(temp[0], temp[1]);
+
+        temp[1] = _mm512_extractf64x4_pd(rhov[0], 1); // upper 256 bits of rhov[0]
+        temp[0] = _mm256_add_pd(temp[0], temp[1]);
+
+        temp_128[1] = _mm256_extractf64x2_pd(temp[0], 0); // lower 128 bits of temp[0]
+        temp_128[0] = _mm_add_pd(temp_128[0], temp_128[1]);
+        temp_128[1] = _mm256_extractf64x2_pd(temp[0], 1); // upper 128 bits of temp[0]
+        temp_128[0] = _mm_add_pd(temp_128[0], temp_128[1]);
+
+        rho0 = temp_128[0][0] + temp_128[0][1]; // sum of the two lanes; NOTE(review): subscripting __m128d presumably relies on a compiler vector extension - confirm
+    }
+
+    for (; i < n; ++i) // leftover element of the unit-stride path, or every element otherwise
+    {
+        const double x0c = *x0;
+        const double y0c = *y0;
+
+        rho0 += x0c * y0c;
+
+        x0 += incx;
+        y0 += incy;
+    }
+
+    // Copy the final result into the output variable.
+    PASTEMAC(d, copys)(rho0, *rho);
+}
diff --git a/kernels/zen4/1/bli_scalv_zen_int_avx512.c b/kernels/zen4/1/bli_scalv_zen_int_avx512.c
new file mode 100644
index 0000000000..2dd355b268
--- /dev/null
+++ b/kernels/zen4/1/bli_scalv_zen_int_avx512.c
@@ -0,0 +1,419 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "blis.h"
+#include <immintrin.h>
+
+/*
+    Functionality
+    -------------
+
+    This function scales a single precision floating-point vector by an element of the
+    same type.
+
+    x := conjalpha(alpha) * x
+
+    Function Signature
+    -------------------
+
+    * 'conjalpha' - Variable specified if alpha needs to be conjugated
+    * 'n' - Length of the array passed
+    * 'alpha' - Pointer to the element by which the vector is to be scaled
+    * 'x' - Float pointer pointing to an array
+    * 'incx' - Stride to point to the next element in the array
+    * 'cntx' - BLIS context object
+
+    Exception
+    ----------
+
+    None
+
+    Deviation from BLAS
+    --------------------
+
+    None
+
+    Undefined behaviour
+    -------------------
+
+    1. The kernel results in undefined behaviour when n <= 0 and incx <= 0. The expectation
+       is that these are standard BLAS exceptions and should be handled in a higher layer.
+*/
+void bli_sscalv_zen_int_avx512
+        (
+          conj_t conjalpha, // not referenced in the body below
+          dim_t n,
+          float *restrict alpha,
+          float *restrict x, inc_t incx, // x is scaled in place
+          cntx_t *restrict cntx // not referenced in the body below
+        )
+{
+    dim_t i = 0; // number of elements processed so far
+    float *restrict x0 = x;
+
+    if (incx == 1) // vectorized path: x is contiguous
+    {
+        // Number of float in AVX-512
+        const dim_t n_elem_per_reg = 16;
+
+        __m512 xv[8], alphav;
+        alphav = _mm512_set1_ps(*alpha); // broadcast alpha to all 16 lanes
+
+        for (i = 0; (i + 127) < n; i += 128) // 8 registers = 128 floats per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_ps(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_ps(x0 + 3 * n_elem_per_reg);
+            xv[4] = _mm512_loadu_ps(x0 + 4 * n_elem_per_reg);
+            xv[5] = _mm512_loadu_ps(x0 + 5 * n_elem_per_reg);
+            xv[6] = _mm512_loadu_ps(x0 + 6 * n_elem_per_reg);
+            xv[7] = _mm512_loadu_ps(x0 + 7 * n_elem_per_reg);
+
+            // perform : x := alpha * x; (first four registers)
+            xv[0] = _mm512_mul_ps(alphav, xv[0]);
+            xv[1] = _mm512_mul_ps(alphav, xv[1]);
+            xv[2] = _mm512_mul_ps(alphav, xv[2]);
+            xv[3] = _mm512_mul_ps(alphav, xv[3]);
+
+            _mm512_storeu_ps((x0 + 0 * n_elem_per_reg), xv[0]);
+            _mm512_storeu_ps((x0 + 1 * n_elem_per_reg), xv[1]);
+            _mm512_storeu_ps((x0 + 2 * n_elem_per_reg), xv[2]);
+            _mm512_storeu_ps((x0 + 3 * n_elem_per_reg), xv[3]);
+
+            xv[4] = _mm512_mul_ps(alphav, xv[4]); // remaining four registers
+            xv[5] = _mm512_mul_ps(alphav, xv[5]);
+            xv[6] = _mm512_mul_ps(alphav, xv[6]);
+            xv[7] = _mm512_mul_ps(alphav, xv[7]);
+
+            _mm512_storeu_ps((x0 + 4 * n_elem_per_reg), xv[4]);
+            _mm512_storeu_ps((x0 + 5 * n_elem_per_reg), xv[5]);
+            _mm512_storeu_ps((x0 + 6 * n_elem_per_reg), xv[6]);
+            _mm512_storeu_ps((x0 + 7 * n_elem_per_reg), xv[7]);
+
+            x0 += 8 * n_elem_per_reg;
+        }
+
+        for (; (i + 63) < n; i += 64) // 4 registers = 64 floats per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_ps(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_ps(x0 + 3 * n_elem_per_reg);
+
+            // perform : x := alpha * x;
+            xv[0] = _mm512_mul_ps(alphav, xv[0]);
+            xv[1] = _mm512_mul_ps(alphav, xv[1]);
+            xv[2] = _mm512_mul_ps(alphav, xv[2]);
+            xv[3] = _mm512_mul_ps(alphav, xv[3]);
+
+            _mm512_storeu_ps((x0 + 0 * n_elem_per_reg), xv[0]);
+            _mm512_storeu_ps((x0 + 1 * n_elem_per_reg), xv[1]);
+            _mm512_storeu_ps((x0 + 2 * n_elem_per_reg), xv[2]);
+            _mm512_storeu_ps((x0 + 3 * n_elem_per_reg), xv[3]);
+
+            x0 += 4 * n_elem_per_reg;
+        }
+
+        for (; (i + 31) < n; i += 32) // 2 registers = 32 floats per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_ps(x0 + 1 * n_elem_per_reg);
+
+            // perform : x := alpha * x;
+            xv[0] = _mm512_mul_ps(alphav, xv[0]);
+            xv[1] = _mm512_mul_ps(alphav, xv[1]);
+
+            _mm512_storeu_ps((x0 + 0 * n_elem_per_reg), xv[0]);
+            _mm512_storeu_ps((x0 + 1 * n_elem_per_reg), xv[1]);
+
+            x0 += 2 * n_elem_per_reg;
+        }
+
+        for (; (i + 15) < n; i += 16) // 1 register = 16 floats per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_ps(x0 + 0 * n_elem_per_reg);
+
+            // perform : x := alpha * x;
+            xv[0] = _mm512_mul_ps(alphav, xv[0]);
+
+            _mm512_storeu_ps((x0 + 0 * n_elem_per_reg), xv[0]);
+
+            x0 += n_elem_per_reg;
+        }
+
+        for (; (i + 7) < n; i += 8) // 8 floats per iteration using 256-bit registers
+        {
+            // Loading the input values
+            __m256 x_vec = _mm256_loadu_ps(x0);
+
+            // perform : x := alpha * x;
+            x_vec = _mm256_mul_ps(_mm256_set1_ps(*alpha), x_vec);
+
+            // Store the output.
+            _mm256_storeu_ps(x0, x_vec);
+
+            x0 += 8;
+        }
+
+        /*
+            Issue vzeroupper instruction to clear upper lanes of ymm registers.
+            This avoids a performance penalty caused by false dependencies when
+            transitioning from AVX to SSE instructions (which may occur
+            later, especially if BLIS is compiled with -mfpmath=sse).
+        */
+        _mm256_zeroupper();
+
+        for (; (i + 3) < n; i += 4) // 4 floats per iteration using 128-bit registers
+        {
+            // Loading the input values
+            __m128 x_vec = _mm_loadu_ps(x0);
+
+            // perform : x := alpha * x;
+            x_vec = _mm_mul_ps(_mm_set1_ps(*alpha), x_vec);
+
+            // Store the output.
+            _mm_storeu_ps(x0, x_vec);
+
+            x0 += 4;
+        }
+    }
+
+    const float alphac = *alpha; // scalar copy of alpha for the element-wise loop
+
+    for (; i < n; ++i) // leftover elements of the unit-stride path, or every element otherwise
+    {
+        *x0 *= alphac;
+
+        x0 += incx;
+    }
+}
+
+// --------------------------------------------------------------------------------------
+
+/*
+    Functionality
+    -------------
+
+    This function scales a double precision floating-point vector by an element of the
+    same type.
+
+    x := conjalpha(alpha) * x
+
+    Function Signature
+    -------------------
+
+    * 'conjalpha' - Variable specified if alpha needs to be conjugated
+    * 'n' - Length of the array passed
+    * 'alpha' - Pointer to the element by which the vector is to be scaled
+    * 'x' - Double pointer pointing to an array
+    * 'incx' - Stride to point to the next element in the array
+    * 'cntx' - BLIS context object
+
+    Exception
+    ----------
+
+    None
+
+    Deviation from BLAS
+    --------------------
+
+    None
+
+    Undefined behaviour
+    -------------------
+
+    1. The kernel results in undefined behaviour when n <= 0 and incx <= 0. The expectation
+       is that these are standard BLAS exceptions and should be handled in a higher layer.
+*/
+void bli_dscalv_zen_int_avx512
+        (
+          conj_t conjalpha, // not referenced in the body below
+          dim_t n,
+          double *restrict alpha,
+          double *restrict x, inc_t incx, // x is scaled in place
+          cntx_t *restrict cntx // not referenced in the body below
+        )
+{
+    dim_t i = 0; // number of elements processed so far
+    double *restrict x0;
+
+    // Initialize local pointers.
+    x0 = x;
+
+    if (incx == 1) // vectorized path: x is contiguous
+    {
+        // Number of double in AVX-512
+        const dim_t n_elem_per_reg = 8;
+
+        __m512d alphav;
+        alphav = _mm512_set1_pd(*alpha); // broadcast alpha to all 8 lanes
+        __m512d xv[8];
+
+        for (i = 0; (i + 63) < n; i += 64) // 8 registers = 64 doubles per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_pd(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_pd(x0 + 3 * n_elem_per_reg);
+            xv[4] = _mm512_loadu_pd(x0 + 4 * n_elem_per_reg);
+            xv[5] = _mm512_loadu_pd(x0 + 5 * n_elem_per_reg);
+            xv[6] = _mm512_loadu_pd(x0 + 6 * n_elem_per_reg);
+            xv[7] = _mm512_loadu_pd(x0 + 7 * n_elem_per_reg);
+
+            // perform : x := alpha * x; (first four registers)
+            xv[0] = _mm512_mul_pd(alphav, xv[0]);
+            xv[1] = _mm512_mul_pd(alphav, xv[1]);
+            xv[2] = _mm512_mul_pd(alphav, xv[2]);
+            xv[3] = _mm512_mul_pd(alphav, xv[3]);
+
+            _mm512_storeu_pd((x0 + 0 * n_elem_per_reg), xv[0]);
+            _mm512_storeu_pd((x0 + 1 * n_elem_per_reg), xv[1]);
+            _mm512_storeu_pd((x0 + 2 * n_elem_per_reg), xv[2]);
+            _mm512_storeu_pd((x0 + 3 * n_elem_per_reg), xv[3]);
+
+            xv[4] = _mm512_mul_pd(alphav, xv[4]); // remaining four registers
+            xv[5] = _mm512_mul_pd(alphav, xv[5]);
+            xv[6] = _mm512_mul_pd(alphav, xv[6]);
+            xv[7] = _mm512_mul_pd(alphav, xv[7]);
+
+            _mm512_storeu_pd((x0 + 4 * n_elem_per_reg), xv[4]);
+            _mm512_storeu_pd((x0 + 5 * n_elem_per_reg), xv[5]);
+            _mm512_storeu_pd((x0 + 6 * n_elem_per_reg), xv[6]);
+            _mm512_storeu_pd((x0 + 7 * n_elem_per_reg), xv[7]);
+
+            x0 += 8 * n_elem_per_reg;
+        }
+
+        for (; (i + 31) < n; i += 32) // 4 registers = 32 doubles per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+            xv[2] = _mm512_loadu_pd(x0 + 2 * n_elem_per_reg);
+            xv[3] = _mm512_loadu_pd(x0 + 3 * n_elem_per_reg);
+
+            // perform : x := alpha * x;
+            xv[0] = _mm512_mul_pd(alphav, xv[0]);
+            xv[1] = _mm512_mul_pd(alphav, xv[1]);
+            xv[2] = _mm512_mul_pd(alphav, xv[2]);
+            xv[3] = _mm512_mul_pd(alphav, xv[3]);
+
+            _mm512_storeu_pd((x0 + 0 * n_elem_per_reg), xv[0]);
+            _mm512_storeu_pd((x0 + 1 * n_elem_per_reg), xv[1]);
+            _mm512_storeu_pd((x0 + 2 * n_elem_per_reg), xv[2]);
+            _mm512_storeu_pd((x0 + 3 * n_elem_per_reg), xv[3]);
+
+            x0 += 4 * n_elem_per_reg;
+        }
+
+        for (; (i + 15) < n; i += 16) // 2 registers = 16 doubles per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+            xv[1] = _mm512_loadu_pd(x0 + 1 * n_elem_per_reg);
+
+            // perform : x := alpha * x;
+            xv[0] = _mm512_mul_pd(alphav, xv[0]);
+            xv[1] = _mm512_mul_pd(alphav, xv[1]);
+
+            _mm512_storeu_pd((x0 + 0 * n_elem_per_reg), xv[0]);
+            _mm512_storeu_pd((x0 + 1 * n_elem_per_reg), xv[1]);
+
+            x0 += 2 * n_elem_per_reg;
+        }
+
+        for (; (i + 7) < n; i += 8) // 1 register = 8 doubles per iteration
+        {
+            // Loading the input values
+            xv[0] = _mm512_loadu_pd(x0 + 0 * n_elem_per_reg);
+
+            // perform : x := alpha * x;
+            xv[0] = _mm512_mul_pd(alphav, xv[0]);
+
+            _mm512_storeu_pd((x0 + 0 * n_elem_per_reg), xv[0]);
+
+            x0 += n_elem_per_reg;
+        }
+
+        for (; (i + 3) < n; i += 4) // 4 doubles per iteration using 256-bit registers
+        {
+            // Loading the input values
+            __m256d x_vec = _mm256_loadu_pd(x0);
+
+            // perform : x := alpha * x;
+            x_vec = _mm256_mul_pd(_mm256_set1_pd(*alpha), x_vec);
+
+            // Store the output.
+            _mm256_storeu_pd(x0, x_vec);
+
+            x0 += 4;
+        }
+
+        /*
+           Issue vzeroupper instruction to clear upper lanes of ymm registers.
+           This avoids a performance penalty caused by false dependencies when
+           transitioning from AVX to SSE instructions (which may occur
+           later, especially if BLIS is compiled with -mfpmath=sse).
+       */
+        _mm256_zeroupper();
+
+        for (; (i + 1) < n; i += 2) // 2 doubles per iteration using 128-bit registers
+        {
+            // Loading the input values
+            __m128d x_vec = _mm_loadu_pd(x0);
+
+            // perform : x := alpha * x;
+            x_vec = _mm_mul_pd(_mm_set1_pd(*alpha), x_vec);
+
+            // Store the output.
+            _mm_storeu_pd(x0, x_vec);
+
+            x0 += 2;
+        }
+    }
+
+    const double alphac = *alpha; // scalar copy of alpha for the element-wise loop
+
+    for (; i < n; ++i) // leftover element of the unit-stride path, or every element otherwise
+    {
+        *x0 *= alphac;
+
+        x0 += incx;
+    }
+}
diff --git a/kernels/zen4/1m/CMakeLists.txt b/kernels/zen4/1m/CMakeLists.txt
new file mode 100644
index 0000000000..9dfbefc458
--- /dev/null
+++ b/kernels/zen4/1m/CMakeLists.txt
@@ -0,0 +1,16 @@
+##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
+
+# Object library containing the zen4 packm (level-1m) kernels.
+add_library(zen4_1m OBJECT
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d8xk.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d16xk.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d24xk.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_d32xk.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z12xk.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_packm_zen4_asm_z4xk.c"
+)
+target_compile_options(zen4_1m
+    PRIVATE /U__PRFCHW__ /arch:AVX2 /arch:AVX512)
+if (BUILD_SHARED_LIBS)
+    target_compile_definitions(zen4_1m PUBLIC -DBLIS_IS_BUILDING_LIBRARY)
+endif ()
diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d16xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d16xk.c
new file mode 100644
index 0000000000..5ecc5403f7
--- /dev/null
+++ b/kernels/zen4/1m/bli_packm_zen4_asm_d16xk.c
@@ -0,0 +1,252 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <x86intrin.h>
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+
+void bli_dpackm_zen4_asm_16xk
+     (
+       conj_t              conja,
+       pack_t              schema,
+       dim_t               cdim0,
+       dim_t               k0,
+       dim_t               k0_max,
+       double*    restrict kappa,
+       double*    restrict a, inc_t inca0, inc_t lda0,
+       double*    restrict p,              inc_t ldp0,
+       cntx_t*    restrict cntx
+     )
+{
+	// Packs a 16 x k0 micropanel of the double-precision matrix A into P,
+	// scaling by kappa; P has unit row stride and column stride ldp0. When
+	// cdim0 < 16 the rows [cdim0,16) of P are zero-filled, and when
+	// k0 < k0_max the columns [k0,k0_max) of P are zero-filled. conja selects
+	// the conjugating copy/scale macros; schema is not used by this kernel
+	// and cntx is only forwarded to the dscal2m fallback. Despite the "asm"
+	// in its name, this kernel is implemented with plain C loops.
+	// Returns nothing; the result is written to P.
+
+	// This is the panel dimension assumed by the packm kernel.
+	const dim_t      mnr   = 16;
+
+	// This is the "packing" dimension assumed by the packm kernel.
+	// This should be equal to ldp.
+	//const dim_t    packmnr = 16;
+
+	// NOTE: For the purposes of the comments in this packm kernel, we
+	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
+	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
+	// this packm kernel, you should think of the operation as packing an
+	// m x n micropanel, where m and n are tiny and large, respectively, and
+	// where elements of each column of the packed matrix P are contiguous.
+	// (This packm kernel can still be used to pack micropanels of matrix B
+	// in a gemm operation.)
+	const uint64_t inca   = inca0;
+	const uint64_t lda    = lda0;
+	const uint64_t ldp    = ldp0;
+
+	// NOTE: If/when this kernel ever supports scaling by kappa within the
+	// assembly region, this constraint should be lifted.
+	const bool     unitk  = bli_deq1( *kappa );
+	// Advance local cursors, not a/p: the edge zero-fill below indexes from p.
+	double* restrict a_cur = a;
+	double* restrict p_cur = p;
+
+	if ( cdim0 == mnr )
+	{
+		if ( unitk )
+		{
+			if ( bli_is_conj( conja ) )
+			{
+				if ( inca == 1 )
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dcopyjs( *(a_cur + i), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+				else
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dcopyjs( *(a_cur + i*inca), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+			}
+			else
+			{
+				if ( inca == 1 )
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						_mm_prefetch( a_cur + (8*lda), _MM_HINT_T0 );
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dcopys( *(a_cur + i), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+				else
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dcopys( *(a_cur + i*inca), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+			}
+		}
+		else
+		{
+			if ( bli_is_conj( conja ) )
+			{
+				if ( inca == 1 )
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dscal2js( *kappa, *(a_cur + i), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+				else
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dscal2js( *kappa, *(a_cur + i*inca), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+			}
+			else
+			{
+				if ( inca == 1 )
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dscal2s( *kappa, *(a_cur + i), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+				else
+				{
+					for ( dim_t k = k0; k != 0; --k )
+					{
+						for ( dim_t i = 0 ; i < 16 ; i++ ) {
+							bli_dscal2s( *kappa, *(a_cur + i*inca), *(p_cur + i) );
+						}
+						a_cur += lda;
+						p_cur += ldp;
+					}
+				}
+			}
+		}
+	}
+	else // if ( cdim0 < mnr )
+	{
+		PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
+		(
+		  0,
+		  BLIS_NONUNIT_DIAG,
+		  BLIS_DENSE,
+		  ( trans_t )conja,
+		  cdim0,
+		  k0,
+		  kappa,
+		  a, inca0, lda0,
+		  p,     1, ldp0,
+		  cntx,
+		  NULL
+		);
+
+		if ( cdim0 < mnr )
+		{
+			// Handle zero-filling along the "long" edge of the micropanel.
+
+			const dim_t      i      = cdim0;
+			const dim_t      m_edge = mnr - cdim0;
+			const dim_t      n_edge = k0_max;
+			double* restrict p_edge = p + (i  )*1;
+
+			bli_dset0s_mxn
+			(
+			  m_edge,
+			  n_edge,
+			  p_edge, 1, ldp
+			);
+		}
+	}
+
+	if ( k0 < k0_max )
+	{
+		// Handle zero-filling along the "short" (far) edge of the micropanel.
+
+		const dim_t      j      = k0;
+		const dim_t      m_edge = mnr;
+		const dim_t      n_edge = k0_max - k0;
+		double* restrict p_edge = p + (j  )*ldp;
+
+		bli_dset0s_mxn
+		(
+		  m_edge,
+		  n_edge,
+		  p_edge, 1, ldp
+		);
+	}
+}
+
diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c
new file mode 100644
index 0000000000..ee9e128e41
--- /dev/null
+++ b/kernels/zen4/1m/bli_packm_zen4_asm_d24xk.c
@@ -0,0 +1,787 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/**
+ * Emits four VSHUFF64X2 shuffles: (S1,S2) -> D1 with imm 0x88 and D2 with
+ * imm 0xDD, and likewise (S3,S4) -> D3, D4. Example for one source pair:
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Emits vunpcklpd/vunpckhpd on (S1,S2) into D1/D2 and on (S3,S4) into D3/D4,
+ * interleaving the low (D1, D3) and high (D2, D4) elements of each 128-bit
+ * lane of the two sources. Arguments are zmm register numbers.
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+
+void bli_dpackm_zen4_asm_24xk
+     (
+       conj_t              conja,
+       pack_t              schema,
+       dim_t               cdim0,
+       dim_t               k0,
+       dim_t               k0_max,
+       double*    restrict kappa,
+       double*    restrict a, inc_t inca0, inc_t lda0,
+       double*    restrict p,              inc_t ldp0,
+       cntx_t*    restrict cntx
+     )
+{
+	// Packs a 24 x k0 panel of doubles from A into P (unit row stride, column
+	// stride ldp0), zero-filling P out to 24 x k0_max. The asm path needs
+	// cdim0 == 24, unit kappa and a unit inca or lda; everything else falls
+	// back to dscal2m. schema is unused; conja only reaches the fallback.
+
+	// Panel dimension assumed by this kernel; the asm row path also assumes
+	// ldp == 24 (it stores with a fixed 192-byte column stride).
+	const dim_t      mnr   = 24;
+
+	// Local uint64_t copies, since the asm loads these with 64-bit moves.
+	const uint64_t k_iter = k0 / 8;
+
+	/**
+	 * Mask for the k_left (k0 % 8) leftover columns: low k_left bits set, moved
+	 * into k2 for the masked edge loads. Kept 64 bits wide because the asm
+	 * reads it with a 64-bit mov (a uint8_t would be read past its end).
+	 */
+	const uint64_t k_left = k0 % 8;
+	uint64_t mask = 0xff >> (0x8 - (k_left & 7));
+	if (mask == 0) mask = 0xff;
+
+	// NOTE: For the purposes of the comments in this packm kernel, we
+	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
+	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
+	// this packm kernel, you should think of the operation as packing an
+	// m x n micropanel, where m and n are tiny and large, respectively, and
+	// where elements of each column of the packed matrix P are contiguous.
+	// (This packm kernel can still be used to pack micropanels of matrix B
+	// in a gemm operation.)
+	const uint64_t inca   = inca0;
+	const uint64_t lda    = lda0;
+	const uint64_t ldp    = ldp0;
+
+	const bool     gs     = ( inca0 != 1 && lda0 != 1 );
+
+	// NOTE: If/when this kernel ever supports scaling by kappa within the
+	// assembly region, this constraint should be lifted.
+	const bool     unitk  = bli_deq1( *kappa );
+
+	double* restrict a_next = a + cdim0; // passed as an operand, not referenced by the asm
+	// -------------------------------------------------------------------------
+
+	if ( cdim0 == mnr && !gs && unitk )
+	{
+		begin_asm()
+		mov(var(mask), rdx)                // load mask
+		kmovw(edx, k(2))                   // move mask to k2 register
+		mov(var(a), rax)                   // load address of source buffer.
+		mov(var(a), r13)                   // load address of source buffer.
+		mov(var(inca), r8)                 // load inca
+		mov(var(lda), r10)                 // load lda
+		lea(mem(, r8,  8), r8)             // inca *= sizeof(double)
+		lea(mem(, r10, 8), r10)            // lda *= sizeof(double)
+
+		mov(var(p), rbx)                   // load address of destination buffer.
+
+		lea(mem(   , r8, 8), r15)          // r15 = 8*inca (byte span of 8 rows)
+
+		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
+		jz(.DCOLUNIT)                      // jump to column storage case
+
+
+		//kappa unit case
+		//Source buffer is row stored
+
+		label(.DROWUNIT)
+
+		lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+		lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
+		lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DCONKLEFTROWU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+		label(.DKITERROWU)                 // MAIN LOOP (k_iter)
+
+//             Source Buffer                       Destination buffer(packed matrix)
+//             24                                  K
+//      _____________________________________           _________________________________________
+//     | 0  1  2  3  4  5  6  7 *{1}*{2}*{3} |         | 0  8 10 88 20 A0 21 31                  |
+//     | 8  9  A  B  C  D  E  F *...*...*... |         | 1  9 11 99 30 B0 22 32     ....         |
+//     |10 11 22 33 44 55 66 77 *{8}*{8}*{8} |         | 2  A 22 AA 40 C0 23 33  *[next k_iter]* |
+//     |88 99 AA BB CC DD EE FF *{x}*{x}*{x} |         | 3  B 33 BB 50 D0 24 34     ....         |
+//     |20 30 40 50 60 70 80 90 *{8}*{8}*{8} |         | 4  C 44 CC 60 E0 25 35                  |
+//  K  |A0 B0 C0 D0 E0 F0 G0 H0 *  .*...*... |  =>  24 | 5  D 55 DD 70 F0 26 36                  |
+//     |21 22 23 24 25 26 27 28 *{t}*{t}*{t} |         | 6  E 66 EE 80 G0 27 37                  |
+//     |31 32 33 34 35 36 37 38 *{i}*{i}*{i} |         | 7  F 77 FF 90 H0 28 38                  |
+//     |   -                    *{l}*{l}*{l} |         |        ****1 8x8 tile****               |
+//     |             .          *{e}*{e}*{e} |         |          .                              |
+//     |                        *   *   *    |         |        ****2 8x8 tile****               |
+//     |             .          *   *   *    |         |          .                              |
+//     |                        *   *   *    |         |        ****3 8x8 tile****               |
+//     |        [next k_iter]   *   *   *    |         |_________________________________________|
+//     |             .          *   *   *    |
+//     |             .          *   *   *    |
+//     |_____________________________________|
+
+		/**
+		 * Accesses source and destination buffer in following manner
+		 * (source_buffer(rax) + i*inca), *(destination_buffer(rbx) + i)
+		 * where i is updated by 1 and rax and rbx updated by lda and ldp.
+		*/
+
+		/**
+		 * Load first 8 rows of matrix.
+		 * Transpose 8x8 tile and store it back to destination buffer.
+		 */
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,  r8, 1, 0), zmm8)
+		vmovupd(mem(rax,  r8, 2, 0), zmm10)
+		vmovupd(mem(rax, r12, 1, 0), zmm12)
+		vmovupd(mem(rax,  r8, 4, 0), zmm14)
+		vmovupd(mem(rax, rcx, 1, 0), zmm16)
+		vmovupd(mem(rax, r12, 2, 0), zmm18)
+		vmovupd(mem(rax, rdx, 1, 0), zmm20)
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		vmovupd(zmm2, mem(rbx, 2*192))
+		vmovupd(zmm6, mem(rbx, 3*192))
+		vmovupd(zmm1, mem(rbx, 4*192))
+		vmovupd(zmm5, mem(rbx, 5*192))
+		vmovupd(zmm3, mem(rbx, 6*192))
+		vmovupd(zmm8, mem(rbx, 7*192))
+
+		add(r15, rax)
+
+		/**
+		 * Load another 8 rows of matrix.
+		 * Transpose 8x8 tile and store it back to destination buffer.
+		 */
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,  r8, 1, 0), zmm8)
+		vmovupd(mem(rax,  r8, 2, 0), zmm10)
+		vmovupd(mem(rax, r12, 1, 0), zmm12)
+		vmovupd(mem(rax,  r8, 4, 0), zmm14)
+		vmovupd(mem(rax, rcx, 1, 0), zmm16)
+		vmovupd(mem(rax, r12, 2, 0), zmm18)
+		vmovupd(mem(rax, rdx, 1, 0), zmm20)
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		vmovupd(zmm2, mem(rbx, 2*192 + 64))
+		vmovupd(zmm6, mem(rbx, 3*192 + 64))
+		vmovupd(zmm1, mem(rbx, 4*192 + 64))
+		vmovupd(zmm5, mem(rbx, 5*192 + 64))
+		vmovupd(zmm3, mem(rbx, 6*192 + 64))
+		vmovupd(zmm8, mem(rbx, 7*192 + 64))
+
+		add(r15, rax)
+
+		/**
+		 * Load another 8 rows of matrix.
+		 * Transpose 8x8 tile and store it back to destination buffer.
+		 */
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,  r8, 1, 0), zmm8)
+		vmovupd(mem(rax,  r8, 2, 0), zmm10)
+		vmovupd(mem(rax, r12, 1, 0), zmm12)
+		vmovupd(mem(rax,  r8, 4, 0), zmm14)
+		vmovupd(mem(rax, rcx, 1, 0), zmm16)
+		vmovupd(mem(rax, r12, 2, 0), zmm18)
+		vmovupd(mem(rax, rdx, 1, 0), zmm20)
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		vmovupd(zmm0, mem(rbx, 0*192 + 128))
+		vmovupd(zmm4, mem(rbx, 1*192 + 128))
+		vmovupd(zmm2, mem(rbx, 2*192 + 128))
+		vmovupd(zmm6, mem(rbx, 3*192 + 128))
+		vmovupd(zmm1, mem(rbx, 4*192 + 128))
+		vmovupd(zmm5, mem(rbx, 5*192 + 128))
+		vmovupd(zmm3, mem(rbx, 6*192 + 128))
+		vmovupd(zmm8, mem(rbx, 7*192 + 128))
+
+		add(imm(8*8), r13)
+		mov(r13, rax)
+		add(imm(8*8*24), rbx)              // p += 8 columns of 24 doubles
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKITERROWU)                   // iterate again if i != 0.
+
+		label(.DCONKLEFTROWU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+		label(.DKLEFTROWU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,         0), zmm6 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 1, 0), zmm8 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 2, 0), zmm10 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 4, 0), zmm14 MASK_KZ(2))
+		vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2))
+		vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2))
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		add(r15, rax)
+		cmp(imm(7), rsi)
+		JZ(.UPDATE7L1)
+		cmp(imm(6), rsi)
+		JZ(.UPDATE6L1)
+		cmp(imm(5), rsi)
+		JZ(.UPDATE5L1)
+		cmp(imm(4), rsi)
+		JZ(.UPDATE4L1)
+		cmp(imm(3), rsi)
+		JZ(.UPDATE3L1)
+		cmp(imm(2), rsi)
+		JZ(.UPDATE2L1)
+		cmp(imm(1), rsi)
+		JZ(.UPDATE1L1)
+		cmp(imm(0), rsi)
+		JZ(.UPDATEDONE)
+
+		LABEL(.UPDATE7L1)
+		//Update 8x7 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		vmovupd(zmm2, mem(rbx, 2*192))
+		vmovupd(zmm6, mem(rbx, 3*192))
+		vmovupd(zmm1, mem(rbx, 4*192))
+		vmovupd(zmm5, mem(rbx, 5*192))
+		vmovupd(zmm3, mem(rbx, 6*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE6L1)
+		//Update 8x6 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		vmovupd(zmm2, mem(rbx, 2*192))
+		vmovupd(zmm6, mem(rbx, 3*192))
+		vmovupd(zmm1, mem(rbx, 4*192))
+		vmovupd(zmm5, mem(rbx, 5*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE5L1)
+		//Update 8x5 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		vmovupd(zmm2, mem(rbx, 2*192))
+		vmovupd(zmm6, mem(rbx, 3*192))
+		vmovupd(zmm1, mem(rbx, 4*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE4L1)
+		//Update 8x4 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		vmovupd(zmm2, mem(rbx, 2*192))
+		vmovupd(zmm6, mem(rbx, 3*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE3L1)
+		//Update 8x3 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		vmovupd(zmm2, mem(rbx, 2*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE2L1)
+		//Update 8x2 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		vmovupd(zmm4, mem(rbx, 1*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE1L1)
+		//Update 8x1 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATEDONE)
+
+		vmovupd(mem(rax,         0), zmm6 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 1, 0), zmm8 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 2, 0), zmm10 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 4, 0), zmm14 MASK_KZ(2))
+		vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2))
+		vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2))
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		add(r15, rax)
+
+		cmp(imm(7), rsi)
+		JZ(.UPDATE7L2)
+		cmp(imm(6), rsi)
+		JZ(.UPDATE6L2)
+		cmp(imm(5), rsi)
+		JZ(.UPDATE5L2)
+		cmp(imm(4), rsi)
+		JZ(.UPDATE4L2)
+		cmp(imm(3), rsi)
+		JZ(.UPDATE3L2)
+		cmp(imm(2), rsi)
+		JZ(.UPDATE2L2)
+		cmp(imm(1), rsi)
+		JZ(.UPDATE1L2)
+		cmp(imm(0), rsi)
+		JZ(.UPDATEDONEL2)
+
+		LABEL(.UPDATE7L2)
+		//Update 8x7 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		vmovupd(zmm2, mem(rbx, 2*192 + 64))
+		vmovupd(zmm6, mem(rbx, 3*192 + 64))
+		vmovupd(zmm1, mem(rbx, 4*192 + 64))
+		vmovupd(zmm5, mem(rbx, 5*192 + 64))
+		vmovupd(zmm3, mem(rbx, 6*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATE6L2)
+		//Update 8x6 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		vmovupd(zmm2, mem(rbx, 2*192 + 64))
+		vmovupd(zmm6, mem(rbx, 3*192 + 64))
+		vmovupd(zmm1, mem(rbx, 4*192 + 64))
+		vmovupd(zmm5, mem(rbx, 5*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATE5L2)
+		//Update 8x5 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		vmovupd(zmm2, mem(rbx, 2*192 + 64))
+		vmovupd(zmm6, mem(rbx, 3*192 + 64))
+		vmovupd(zmm1, mem(rbx, 4*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATE4L2)
+		//Update 8x4 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		vmovupd(zmm2, mem(rbx, 2*192 + 64))
+		vmovupd(zmm6, mem(rbx, 3*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATE3L2)
+		//Update 8x3 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		vmovupd(zmm2, mem(rbx, 2*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATE2L2)
+		//Update 8x2 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		vmovupd(zmm4, mem(rbx, 1*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATE1L2)
+		//Update 8x1 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 64))
+		jmp(.UPDATEDONEL2)
+
+		LABEL(.UPDATEDONEL2)
+
+		vmovupd(mem(rax,         0), zmm6 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 1, 0), zmm8 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 2, 0), zmm10 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 4, 0), zmm14 MASK_KZ(2))
+		vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2))
+		vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2))
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		cmp(imm(7), rsi)
+		JZ(.UPDATE7L3)
+		cmp(imm(6), rsi)
+		JZ(.UPDATE6L3)
+		cmp(imm(5), rsi)
+		JZ(.UPDATE5L3)
+		cmp(imm(4), rsi)
+		JZ(.UPDATE4L3)
+		cmp(imm(3), rsi)
+		JZ(.UPDATE3L3)
+		cmp(imm(2), rsi)
+		JZ(.UPDATE2L3)
+		cmp(imm(1), rsi)
+		JZ(.UPDATE1L3)
+		cmp(imm(0), rsi)
+		JZ(.UPDATEDONEL3)
+
+		LABEL(.UPDATE7L3)
+		//Update 8x7 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 128))
+		vmovupd(zmm4, mem(rbx, 1*192 + 128))
+		vmovupd(zmm2, mem(rbx, 2*192 + 128))
+		vmovupd(zmm6, mem(rbx, 3*192 + 128))
+		vmovupd(zmm1, mem(rbx, 4*192 + 128))
+		vmovupd(zmm5, mem(rbx, 5*192 + 128))
+		vmovupd(zmm3, mem(rbx, 6*192 + 128))
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATE6L3)
+		//Update 8x6 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 128))
+		vmovupd(zmm4, mem(rbx, 1*192 + 128))
+		vmovupd(zmm2, mem(rbx, 2*192 + 128))
+		vmovupd(zmm6, mem(rbx, 3*192 + 128))
+		vmovupd(zmm1, mem(rbx, 4*192 + 128))
+		vmovupd(zmm5, mem(rbx, 5*192 + 128))
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATE5L3)
+		//Update 8x5 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 128))
+		vmovupd(zmm4, mem(rbx, 1*192 + 128))
+		vmovupd(zmm2, mem(rbx, 2*192 + 128))
+		vmovupd(zmm6, mem(rbx, 3*192 + 128))
+		vmovupd(zmm1, mem(rbx, 4*192 + 128))
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATE4L3)
+		//Update 8x4 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 128))
+		vmovupd(zmm4, mem(rbx, 1*192 + 128))
+		vmovupd(zmm2, mem(rbx, 2*192 + 128))
+		vmovupd(zmm6, mem(rbx, 3*192 + 128))
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATE3L3)
+		//Update 8x3 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 128))
+		vmovupd(zmm4, mem(rbx, 1*192 + 128))
+		vmovupd(zmm2, mem(rbx, 2*192 + 128))
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATE2L3)
+		//Update 8x2 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192 + 128) )
+		vmovupd(zmm4, mem(rbx, 1*192 + 128) )
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATE1L3)
+		//Update 8x1 tile to destination buffer
+		vmovupd(zmm0, mem(rbx, 0*192  + 128))
+		jmp(.UPDATEDONEL3)
+
+		LABEL(.UPDATEDONEL3)
+		jmp(.DDONE)                        // jump to end.
+
+		//kappa unit case
+		//source buffer is column stored.
+		label(.DCOLUNIT)
+		mov(var(ldp), r8)                  // load ldp
+		lea(mem(, r8,  8), r8)             // ldp *= sizeof(double)
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DCONKLEFTCOLU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+
+		label(.DKITERCOLU)                 // MAIN LOOP (k_iter)
+
+//    Source buffer                                       Destination buffer(packed matrix)
+//    K                                                   K
+//     _________________________________________           _________________________________________
+//    | 0  8 10 88 20 A0 21 31                  |          | 0  8 10 88 20 A0 21 31                  |
+//    | 1  9 11 99 30 B0 22 32     ....         |          | 1  9 11 99 30 B0 22 32     ....         |
+//    | 2  A 22 AA 40 C0 23 33  *[next k_iter]* |          | 2  A 22 AA 40 C0 23 33  *[next k_iter]* |
+//    | 3  B 33 BB 50 D0 24 34     ....         |          | 3  B 33 BB 50 D0 24 34     ....         |
+//    | 4  C 44 CC 60 E0 25 35                  |          | 4  C 44 CC 60 E0 25 35                  |
+// 24 | 5  D 55 DD 70 F0 26 36                  |   =>  24 | 5  D 55 DD 70 F0 26 36                  |
+//    | 6  E 66 EE 80 G0 27 37                  |          | 6  E 66 EE 80 G0 27 37                  |
+//    | 7  F 77 FF 90 H0 28 38                  |          | 7  F 77 FF 90 H0 28 38                  |
+//    |        ****1 8x8 tile****               |          |        ****1 8x8 tile****               |
+//    |          .                              |          |          .                              |
+//    |        ****2 8x8 tile****               |          |        ****2 8x8 tile****               |
+//    |          .                              |          |          .                              |
+//    |        ****3 8x8 tile****               |          |        ****3 8x8 tile****               |
+//    |_________________________________________|          |_________________________________________|
+//
+		/**
+		 * Accesses source and destination buffer in following manner
+		 * (source_buffer(rax) + i), *(destination_buffer(rbx) + i)
+		 * where i is updated by 1 and rax and rbx updated by lda and ldp.
+		*/
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKITERCOLU)                   // iterate again if i != 0.
+
+		label(.DCONKLEFTCOLU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+		label(.DKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(mem(rax,         64), zmm8)
+		vmovupd(mem(rax,        128), zmm10)
+		vmovupd(zmm6, mem(rbx,  0*64+ 0))
+		vmovupd(zmm8, mem(rbx,  0*64+ 64))
+		vmovupd(zmm10, mem(rbx, 0*64+ 128))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKLEFTCOLU)                   // iterate again if i != 0.
+		label(.DDONE)
+
+		end_asm(
+		: // output operands (none)
+		: // input operands
+		  [mask] "m" (mask),
+		  [k_iter] "m" (k_iter),
+		  [k_left] "m" (k_left),
+		  [a]      "m" (a),
+		  [inca]   "m" (inca),
+		  [lda]    "m" (lda),
+		  [p]      "m" (p),
+		  [ldp]    "m" (ldp),
+		  [a_next] "m" (a_next)
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi",
+		  "r8", "r10", "r12", "r13", "r15",
+		  "zmm0", "zmm1", "zmm2", "zmm3",
+		  "zmm4", "zmm5", "zmm6", "zmm7",
+		  "zmm8", "zmm9", "zmm10", "zmm11",
+		  "zmm12", "zmm13", "zmm14", "zmm15",
+		  "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "k2", "memory"
+		)
+	}
+	else // if ( cdim0 < mnr || gs || !unitk )
+	{
+		PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
+		(
+		  0,
+		  BLIS_NONUNIT_DIAG,
+		  BLIS_DENSE,
+		  ( trans_t )conja,
+		  cdim0,
+		  k0,
+		  kappa,
+		  a, inca0, lda0,
+		  p,     1, ldp0,
+		  cntx,
+		  NULL
+		);
+
+		if ( cdim0 < mnr )
+		{
+			// Handle zero-filling along the "long" edge of the micropanel.
+
+			const dim_t      i      = cdim0;
+			const dim_t      m_edge = mnr - cdim0;
+			const dim_t      n_edge = k0_max;
+			double* restrict p_edge = p + (i  )*1;
+
+			bli_dset0s_mxn
+			(
+			  m_edge,
+			  n_edge,
+			  p_edge, 1, ldp
+			);
+		}
+	}
+	if ( k0 < k0_max )
+	{
+		// Handle zero-filling along the "short" (far) edge of the micropanel.
+
+		const dim_t      j      = k0;
+		const dim_t      m_edge = mnr;
+		const dim_t      n_edge = k0_max - k0;
+		double* restrict p_edge = p + (j  )*ldp;
+
+		bli_dset0s_mxn
+		(
+		  m_edge,
+		  n_edge,
+		  p_edge, 1, ldp
+		);
+	}
+}
diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d32xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d32xk.c
new file mode 100644
index 0000000000..1ff964069a
--- /dev/null
+++ b/kernels/zen4/1m/bli_packm_zen4_asm_d32xk.c
@@ -0,0 +1,257 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <x86intrin.h>
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+// Prototype reference packm kernels.
+//PACKM_KER_PROT( double,   d, packm_8xk_zen4_ref )
+
+void bli_dpackm_zen4_asm_32xk
+     (
+       conj_t              conja,
+       pack_t              schema,
+       dim_t               cdim0,
+       dim_t               k0,
+       dim_t               k0_max,
+       double*    restrict kappa,
+       double*    restrict a, inc_t inca0, inc_t lda0,
+       double*    restrict p,              inc_t ldp0,
+       cntx_t*    restrict cntx
+     )
+{
+	// Packs a cdim0 x k0 panel of A into P, whose rows are contiguous and
+	// whose columns are ldp elements apart, scaling by *kappa on the way.
+	//  - cdim0 == 32: handled by the C loops below, which are specialized
+	//    on kappa being one, on conja, and on inca being one.
+	//  - any other cdim0: delegated to dscal2m; if cdim0 < 32, rows
+	//    cdim0..31 of P are then zeroed across all k0_max columns.
+	// In both cases columns k0..k0_max-1 of P are zeroed at the end.
+	// schema is not used here; cntx is only forwarded to dscal2m.
+
+	// Number of rows in a full micropanel handled by this kernel.
+	const dim_t      mnr   = 32;
+
+	// The packed panel's leading dimension (ldp) is supplied by the
+	// caller; this kernel does not assume a particular value for it,
+	// it simply steps the destination by ldp per packed column.
+
+	// Reading guide: treat inca and lda as the row and column strides of
+	// A (rs_a, cs_a), and ldp as the column stride of P (cs_p), with the
+	// row stride of P being one. The operation is then "pack an m x n
+	// micropanel" with m small (at most mnr) and n = k0 potentially
+	// large, every packed column being stored contiguously. The same
+	// code serves micropanels of B in a gemm operation.
+	// The strides are copied into unsigned 64-bit locals and used for
+	// all pointer stepping in the loops below.
+	const uint64_t lda    = lda0;
+	const uint64_t inca   = inca0;
+	const uint64_t ldp    = ldp0;
+
+	// kappa == 1 lets the loops below copy elements instead of scaling
+	// them.
+	const bool     unitk  = bli_deq1( *kappa );
+
+	double* restrict p_col = p;
+
+	// -------------------------------------------------------------------------
+
+	if ( cdim0 == mnr )
+	{
+		if ( !unitk ) // general kappa: scale while packing
+		{
+			if ( !bli_is_conj( conja ) )
+			{
+				if ( inca != 1 ) // strided rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dscal2s( *kappa, a[ r*inca ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+				else // contiguous rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dscal2s( *kappa, a[ r ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+			}
+			else // conjugating variant of the scaling macro
+			{
+				if ( inca != 1 ) // strided rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dscal2js( *kappa, a[ r*inca ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+				else // contiguous rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dscal2js( *kappa, a[ r ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+			}
+		}
+		else // kappa is one: plain copies
+		{
+			if ( !bli_is_conj( conja ) )
+			{
+				if ( inca != 1 ) // strided rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dcopys( a[ r*inca ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+				else // contiguous rows of A; the only loop that prefetches
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						_mm_prefetch( a + 8*lda, _MM_HINT_T0 );
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dcopys( a[ r ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+			}
+			else // conjugating variant of the copy macro
+			{
+				if ( inca != 1 ) // strided rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dcopyjs( a[ r*inca ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+				else // contiguous rows of A
+				{
+					for ( dim_t cols = k0; cols != 0; cols-- )
+					{
+						for ( dim_t r = 0; r < mnr; ++r ) {
+							bli_dcopyjs( a[ r ], p_col[ r ] );
+						}
+						p_col += ldp;
+						a     += lda;
+					}
+				}
+			}
+		}
+	}
+	else // cdim0 != mnr: let the generic scal2m do the copy/scale
+	{
+		PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
+		(
+		  0,                   // diagonal offset
+		  BLIS_NONUNIT_DIAG,
+		  BLIS_DENSE,
+		  ( trans_t )conja,    // conjugation request, passed on as a trans_t
+		  cdim0,               // rows
+		  k0,                  // columns
+		  kappa,
+		  a, inca0, lda0,      // source and its row/column strides
+		  p,     1, ldp0,      // destination: unit row stride, column stride ldp0
+		  cntx,
+		  NULL
+		);
+
+		if ( cdim0 < mnr )
+		{
+			// Zero rows cdim0..mnr-1 of every one of the k0_max packed columns.
+
+			const dim_t      first_row = cdim0;
+			const dim_t      n_rows    = mnr - cdim0;
+			const dim_t      n_cols    = k0_max;
+			double* restrict p_base    = p;
+			double* restrict p_zero    = p_base + first_row;
+
+			bli_dset0s_mxn
+			(
+			  n_rows,
+			  n_cols,
+			  p_zero, 1, ldp
+			);
+		}
+	}
+
+	if ( k0 < k0_max )
+	{
+		// Zero the trailing packed columns k0..k0_max-1, all mnr rows of each.
+
+		const dim_t      first_col = k0;
+		const dim_t      n_rows    = mnr;
+		const dim_t      n_cols    = k0_max - k0;
+		double* restrict p_base    = p;
+		double* restrict p_zero    = p_base + first_col*ldp;
+
+		bli_dset0s_mxn
+		(
+		  n_rows,
+		  n_cols,
+		  p_zero, 1, ldp
+		);
+	}
+}
+
diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c
new file mode 100644
index 0000000000..ff18838aab
--- /dev/null
+++ b/kernels/zen4/1m/bli_packm_zen4_asm_d8xk.c
@@ -0,0 +1,439 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/**
+ * Shuffle 128-bit lanes (pairs of doubles) of two registers (vshuff64x2).
+ * NOTE(review): assuming operands are forwarded in AT&T order (this file
+ * defines BLIS_ASM_SYNTAX_ATT), the low half of each result comes from S2:
+ * D1 = S2.lane0, S2.lane2, S1.lane0, S1.lane2    (imm 0x88)
+ * D2 = S2.lane1, S2.lane3, S1.lane1, S1.lane3    (imm 0xDD)
+ * D3 and D4 are built the same way from S4 (low half) and S3 (high half).
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Interleave the low doubles (vunpcklpd -> D1, D3) and the high doubles
+ * (vunpckhpd -> D2, D4) of every 128-bit lane of two registers.
+ * NOTE(review): assuming AT&T operand order, S2 supplies the even slots:
+ * S2 : 1  2  3  4  5  6  7  8
+ * S1 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+void bli_dpackm_zen4_asm_8xk
+     (
+       conj_t              conja,
+       pack_t              schema,
+       dim_t               cdim0,
+       dim_t               k0,
+       dim_t               k0_max,
+       double*    restrict kappa,
+       double*    restrict a, inc_t inca0, inc_t lda0,
+       double*    restrict p,              inc_t ldp0,
+       cntx_t*    restrict cntx
+     )
+{
+	// Packs a cdim0 x k0 panel of A into P (unit row stride, column stride
+	// ldp) and zero-fills P out to mnr x k0_max. Full panels with unit kappa
+	// and a unit stride in A use the AVX-512 assembly below, everything else
+	// goes through dscal2m. NOTE: the row-stored assembly path writes packed
+	// columns 64 bytes apart, i.e. it assumes ldp == mnr == 8.
+	const dim_t      mnr   = 8;
+
+	// Local copies widened to 64 bits, because the assembly reads each
+	// operand from memory with a 64-bit mov.
+	const uint64_t k_iter = k0 /8;
+	const uint64_t k_left = k0 % 8;
+	/**
+	 * Lane mask for the row-stored k_left tail: the low k_left bits are set
+	 * so that only the remaining columns are loaded. It is kept 64 bits wide
+	 * because "mov(var(mask), rdx)" below reads a full 8 bytes from it.
+	 */
+	uint64_t mask = 0xff >> (0x8 - (k_left));
+	if (mask == 0) mask = 0xff;        // k_left == 0: the shift by 8 yields 0
+
+	// NOTE: For the purposes of the comments in this packm kernel, we
+	// interpret inca and lda as rs_a and cs_a, respectively, and similarly
+	// interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
+	// this packm kernel, you should think of the operation as packing an
+	// m x n micropanel, where m and n are tiny and large, respectively, and
+	// where elements of each column of the packed matrix P are contiguous.
+	// (This packm kernel can still be used to pack micropanels of matrix B
+	// in a gemm operation.)
+	const uint64_t inca   = inca0;
+	const uint64_t lda    = lda0;
+	const uint64_t ldp    = ldp0;
+
+	const bool     gs     = ( inca0 != 1 && lda0 != 1 ); // neither stride of A is one
+
+	// NOTE: If/when this kernel ever supports scaling by kappa within the
+	// assembly region, this constraint should be lifted.
+	const bool     unitk  = bli_deq1( *kappa );
+
+	double* restrict a_next = a + cdim0;
+
+	// -------------------------------------------------------------------------
+
+	if ( cdim0 == mnr && !gs && unitk )
+	{
+		begin_asm()
+		mov(var(mask), rdx)                // load the 64-bit k_left lane mask
+		kmovw(edx, k(2))                   // move mask to k2 register
+		mov(var(a), rax)                   // load address of a.
+		mov(var(a), r13)                   // second copy of a, advanced by the row path
+		mov(var(inca), r8)                 // load inca
+		mov(var(lda), r10)                 // load lda
+		lea(mem(, r8,  8), r8)             // inca *= sizeof(double)
+		lea(mem(, r10, 8), r10)            // lda *= sizeof(double)
+		mov(var(p), rbx)                   // load address of p.
+		cmp(imm(8), r8)                    // set ZF if (8*inca) == 8.
+		jz(.DCOLUNIT)                      // jump to column storage case
+
+		// -- kappa unit, row storage on A -----------------------------------------
+
+		label(.DROWUNIT)
+
+		lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+		lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca
+		lea(mem(r12, r8,  4), rdx)         // rdx = 7*inca
+
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DCONKLEFTROWU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+		label(.DKITERROWU)                 // MAIN LOOP (k_iter)
+
+		/**
+		 * Load first 8 rows of matrix.
+		 * Transpose 8x8 tile and store it back to destination buffer.
+		 */
+		vmovupd(mem(rax,         0), zmm6)   // row 0
+		vmovupd(mem(rax,  r8, 1, 0), zmm8)   // row 1
+		vmovupd(mem(rax,  r8, 2, 0), zmm10)  // row 2
+		vmovupd(mem(rax, r12, 1, 0), zmm12)  // row 3
+		vmovupd(mem(rax,  r8, 4, 0), zmm14)  // row 4
+		vmovupd(mem(rax, rcx, 1, 0), zmm16)  // row 5
+		vmovupd(mem(rax, r12, 2, 0), zmm18)  // row 6
+		vmovupd(mem(rax, rdx, 1, 0), zmm20)  // row 7
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		vmovupd(zmm0, mem(rbx, 0*64))      // eight stores, 64 bytes apart
+		vmovupd(zmm4, mem(rbx, 1*64))
+		vmovupd(zmm2, mem(rbx, 2*64))
+		vmovupd(zmm6, mem(rbx, 3*64))
+		vmovupd(zmm1, mem(rbx, 4*64))
+		vmovupd(zmm5, mem(rbx, 5*64))
+		vmovupd(zmm3, mem(rbx, 6*64))
+		vmovupd(zmm8, mem(rbx, 7*64))
+
+		add(imm(8*8), r13)                 // advance the saved a by 8 doubles
+		mov(r13, rax)
+		add(imm(8*8*8), rbx)               // p += 8*ldp (with ldp == 8)
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKITERROWU)                   // iterate again if i != 0.
+
+		label(.DCONKLEFTROWU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+
+		label(.DKLEFTROWU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,         0), zmm6 MASK_KZ(2))   // masked loads: lanes
+		vmovupd(mem(rax,  r8, 1, 0), zmm8 MASK_KZ(2))   // outside k2 are zeroed
+		vmovupd(mem(rax,  r8, 2, 0), zmm10 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 1, 0), zmm12 MASK_KZ(2))
+		vmovupd(mem(rax,  r8, 4, 0), zmm14 MASK_KZ(2))
+		vmovupd(mem(rax, rcx, 1, 0), zmm16 MASK_KZ(2))
+		vmovupd(mem(rax, r12, 2, 0), zmm18 MASK_KZ(2))
+		vmovupd(mem(rax, rdx, 1, 0), zmm20 MASK_KZ(2))
+
+		UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+		SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+		UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+		SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+		SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+		SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+		cmp(imm(7), rsi)                   // dispatch on k_left: store only
+		JZ(.UPDATE7L1)                     // the first k_left packed columns
+		cmp(imm(6), rsi)
+		JZ(.UPDATE6L1)
+		cmp(imm(5), rsi)
+		JZ(.UPDATE5L1)
+		cmp(imm(4), rsi)
+		JZ(.UPDATE4L1)
+		cmp(imm(3), rsi)
+		JZ(.UPDATE3L1)
+		cmp(imm(2), rsi)
+		JZ(.UPDATE2L1)
+		cmp(imm(1), rsi)
+		JZ(.UPDATE1L1)
+		cmp(imm(0), rsi)
+		JZ(.UPDATEDONE)
+
+		LABEL(.UPDATE7L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		vmovupd(zmm4, mem(rbx, 1*64))
+		vmovupd(zmm2, mem(rbx, 2*64))
+		vmovupd(zmm6, mem(rbx, 3*64))
+		vmovupd(zmm1, mem(rbx, 4*64))
+		vmovupd(zmm5, mem(rbx, 5*64))
+		vmovupd(zmm3, mem(rbx, 6*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE6L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		vmovupd(zmm4, mem(rbx, 1*64))
+		vmovupd(zmm2, mem(rbx, 2*64))
+		vmovupd(zmm6, mem(rbx, 3*64))
+		vmovupd(zmm1, mem(rbx, 4*64))
+		vmovupd(zmm5, mem(rbx, 5*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE5L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		vmovupd(zmm4, mem(rbx, 1*64))
+		vmovupd(zmm2, mem(rbx, 2*64))
+		vmovupd(zmm6, mem(rbx, 3*64))
+		vmovupd(zmm1, mem(rbx, 4*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE4L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		vmovupd(zmm4, mem(rbx, 1*64))
+		vmovupd(zmm2, mem(rbx, 2*64))
+		vmovupd(zmm6, mem(rbx, 3*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE3L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		vmovupd(zmm4, mem(rbx, 1*64))
+		vmovupd(zmm2, mem(rbx, 2*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE2L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		vmovupd(zmm4, mem(rbx, 1*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATE1L1)
+		vmovupd(zmm0, mem(rbx, 0*64))
+		jmp(.UPDATEDONE)
+
+		LABEL(.UPDATEDONE)
+
+		jmp(.DDONE)                        // jump to end.
+
+		// -- kappa unit, column storage on A -----------------------------------------
+		label(.DCOLUNIT)
+		mov(var(a_next), rcx)              // rcx = a + cdim0; not used afterwards
+		mov(var(ldp), r8)                  // load ldp
+		lea(mem(, r8,  8), r8)             // ldp *= sizeof(double)
+		mov(var(k_iter), rsi)              // i = k_iter;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DCONKLEFTCOLU)                 // if i == 0, jump to code that
+		                                   // contains the k_left loop.
+
+		// Each iteration copies eight columns of 8 doubles from a to p.
+		label(.DKITERCOLU)                 // MAIN LOOP (k_iter)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+		add(r10, rax)                      // a += lda
+		add(r8, rbx)                       // p += ldp
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		vmovupd(mem(rax,         0), zmm6)
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKITERCOLU)                   // iterate again if i != 0.
+
+		label(.DCONKLEFTCOLU)
+
+		mov(var(k_left), rsi)              // i = k_left;
+		test(rsi, rsi)                     // check i via logical AND.
+		je(.DDONE)                         // if i == 0, we're done; jump to end.
+		                                   // else, we prepare to enter k_left loop.
+		label(.DKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+		vmovupd(mem(rax,         0), zmm6) // one column of 8 doubles per pass
+		vmovupd(zmm6, mem(rbx))
+
+		add(r10, rax)
+		add(r8, rbx)
+
+		dec(rsi)                           // i -= 1;
+		jne(.DKLEFTCOLU)                   // iterate again if i != 0.
+
+		label(.DDONE)
+
+		end_asm(
+		: // output operands (none)
+		: // input operands
+		  [mask] "m" (mask),
+		  [k_iter] "m" (k_iter),
+		  [k_left] "m" (k_left),
+		  [a]      "m" (a),
+		  [inca]   "m" (inca),
+		  [lda]    "m" (lda),
+		  [p]      "m" (p),
+		  [ldp]    "m" (ldp),
+		  [kappa]  "m" (kappa),
+		  [a_next]    "m" (a_next)
+		: // register clobber list
+		  "rax", "rbx", "rcx", "rdx", "rsi",
+		  "r8", "r10", "r12", "r13",
+		  "zmm0", "zmm1", "zmm2", "zmm3",
+		  "zmm4", "zmm5", "zmm6", "zmm7",
+		  "zmm8", "zmm9", "zmm10", "zmm11",
+		  "zmm12", "zmm13", "zmm14", "zmm15",
+		  "zmm16", "zmm18", "zmm20", "zmm30", "zmm31", "k2", "memory"
+		)
+	}
+	else // if ( cdim0 < mnr || gs || !unitk )
+	{
+		PASTEMAC(dscal2m,BLIS_TAPI_EX_SUF)
+		(
+		  0,
+		  BLIS_NONUNIT_DIAG,
+		  BLIS_DENSE,
+		  ( trans_t )conja,
+		  cdim0,
+		  k0,
+		  kappa,
+		  a, inca0, lda0,
+		  p,     1, ldp0,
+		  cntx,
+		  NULL
+		);
+
+		if ( cdim0 < mnr )
+		{
+			// Handle zero-filling along the "long" edge of the micropanel:
+			// rows cdim0..mnr-1 of all k0_max packed columns.
+			const dim_t      i      = cdim0;
+			const dim_t      m_edge = mnr - cdim0;
+			const dim_t      n_edge = k0_max;
+			double* restrict p_edge = p + (i  )*1;
+
+			bli_dset0s_mxn
+			(
+			  m_edge,
+			  n_edge,
+			  p_edge, 1, ldp
+			);
+		}
+	}
+	if ( k0 < k0_max )
+	{
+		// Handle zero-filling along the "short" (far) edge of the micropanel:
+		// all mnr rows of packed columns k0..k0_max-1.
+		const dim_t      j      = k0;
+		const dim_t      m_edge = mnr;
+		const dim_t      n_edge = k0_max - k0;
+		double* restrict p_edge = p + (j  )*ldp;
+
+		bli_dset0s_mxn
+		(
+		  m_edge,
+		  n_edge,
+		  p_edge, 1, ldp
+		);
+	}
+}
diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c
new file mode 100644
index 0000000000..3145801e11
--- /dev/null
+++ b/kernels/zen4/1m/bli_packm_zen4_asm_z12xk.c
@@ -0,0 +1,372 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/******************************************************/
+/* Transpose contents of R0, R1, R2, R3 and store     */
+/* the result to same register                        */
+/* 4x4 dcomplex transpose; ZMM12-15 are clobbered     */
+/* Input R0 = Ar0 Ai0 Ar1 Ai1 Ar2 Ai2 Ar3 Ai3         */
+/* Input R1 = Ar4 Ai4 Ar5 Ai5 Ar6 Ai6 Ar7 Ai7         */
+/* Input R2 = Ar8 Ai8 Ar9 Ai9 Ar10 Ai10 Ar11 Ai11     */
+/* Input R3 = Ar12 Ai12 Ar13 Ai13 Ar14 Ai14 Ar15 Ai15 */
+/* ZMM12 = Ar0 Ai0 Ar2 Ai2 Ar4 Ai4 Ar6 Ai6            */
+/* ZMM13 = Ar1 Ai1 Ar3 Ai3 Ar5 Ai5 Ar7 Ai7            */
+/* ZMM14 = Ar8 Ai8 Ar10 Ai10 Ar12 Ai12 Ar14 Ai14      */
+/* ZMM15 = Ar9 Ai9 Ar11 Ai11 Ar13 Ai13 Ar15 Ai15      */
+/* Output R0 = Ar0 Ai0 Ar4 Ai4 Ar8 Ai8 Ar12 Ai12      */
+/* Output R1 = Ar1 Ai1 Ar5 Ai5 Ar9 Ai9 Ar13 Ai13      */
+/* Output R2 = Ar2 Ai2 Ar6 Ai6 Ar10 Ai10 Ar14 Ai14    */
+/* Output R3 = Ar3 Ai3 Ar7 Ai7 Ar11 Ai11 Ar15 Ai15    */
+/******************************************************/
+#define TRANSPOSE(R0, R1, R2, R3) \
+    VSHUFF64X2(IMM(0x88), ZMM(R1), ZMM(R0), ZMM(12)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R1), ZMM(R0), ZMM(13)) \
+    VSHUFF64X2(IMM(0x88), ZMM(R3), ZMM(R2), ZMM(14)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R3), ZMM(R2), ZMM(15)) \
+    VSHUFF64X2(IMM(0x88), ZMM(14), ZMM(12), ZMM(R0)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(14), ZMM(12), ZMM(R2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(15), ZMM(13), ZMM(R1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(15), ZMM(13), ZMM(R3))
+
+void bli_zpackm_zen4_asm_12xk
+     (
+       conj_t              conja,
+       pack_t              schema,
+       dim_t               cdim0,
+       dim_t               k0,
+       dim_t               k0_max,
+       dcomplex*  restrict kappa,
+       dcomplex*  restrict a, inc_t inca0, inc_t lda0,
+       dcomplex*  restrict p,              inc_t ldp0,
+       cntx_t*    restrict cntx
+     )
+{
+    // Panel dimension handled by this kernel: 12 dcomplex rows per column.
+    const dim_t      mnr   = 12;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    const uint64_t k_iter =  k0 / 4;
+    const uint64_t k_left = k0 % 4;
+
+    // Overview: packs a cdim0 x k0 panel of dcomplex A into P and zero-fills
+    // P out to mnr x k0_max. inca and lda are read as the row and column
+    // strides of A (rs_a, cs_a); P has unit row stride and column stride ldp.
+    // A full panel (cdim0 == mnr) with unit kappa, no conjugation and a unit
+    // stride in A is packed by the assembly below, four columns per main-loop
+    // iteration; every other case is delegated to zscal2m.
+    // NOTE: the assembly advances P by 12*16 bytes per packed column, i.e. it
+    // assumes ldp == 12; the assembly itself never reads ldp.
+    const uint64_t inca   = inca0;
+    const uint64_t lda    = lda0;
+    const uint64_t ldp    = ldp0;
+
+    const bool     gs     = ( inca0 != 1 && lda0 != 1 );
+
+    // NOTE: If/when this kernel ever supports scaling by kappa within the
+    // assembly region, this constraint should be lifted.
+    const bool     unitk  = bli_zeq1( *kappa );
+
+    // -------------------------------------------------------------------------
+    if (cdim0 == mnr && !gs && !conja && unitk)
+    {
+        begin_asm()
+
+        mov(var(a), rax)                   // load address of a.
+
+        mov(var(inca), r8)                 // load inca
+        mov(var(lda), r10)                 // load lda
+        lea(mem(   , r8,  2), r8)
+        lea(mem(   , r8,  8), r8)          // inca *= sizeof(dcomplex)
+        lea(mem(   , r10, 2), r10)
+        lea(mem(   , r10, 8), r10)         // lda *= sizeof(dcomplex)
+
+        mov(var(p), rbx)                   // load address of p.
+
+        lea(mem(   , r10, 4), r14)         // r14 = 4*lda
+
+        cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
+        jz(.ZCOLUNIT)                      // jump to column storage case
+
+        // -- row storage on A -----------------------------------------
+
+        label(.ZROWUNIT)
+
+        lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+        lea(mem(r12, r8,  2), rcx)         // rcx = 5*inca  (3+2)
+        lea(mem(rcx, r8,  2), rdx)         // rdx = 7*inca  (5+2)
+        lea(mem(rdx, r8,  2), r9)          // r9  = 9*inca  (7+2)
+        lea(mem(r12, r8,  8), r15)         // r15 = 11*inca (3+8)
+
+        mov(var(k_iter), rsi)              // i = k_iter;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZCONKLEFTROWU)                 // if i == 0, jump to code that
+                                           // contains the k_left loop.
+
+        label(.ZKITERROWU)                 // MAIN LOOP (k_iter)
+
+        vmovupd(mem(rax,          0), zmm0)    // row 0, four dcomplex
+        vmovupd(mem(rax,  r8,  1, 0), zmm1)    // row 1
+        vmovupd(mem(rax,  r8,  2, 0), zmm2)    // row 2
+        vmovupd(mem(rax,  r12, 1, 0), zmm3)    // row 3
+        vmovupd(mem(rax,  r8,  4, 0), zmm4)    // row 4
+        vmovupd(mem(rax,  rcx, 1, 0), zmm5)    // row 5
+        vmovupd(mem(rax,  r12, 2, 0), zmm6)    // row 6
+        vmovupd(mem(rax,  rdx, 1, 0), zmm7)    // row 7
+        vmovupd(mem(rax,  r8,  8, 0), zmm8)    // row 8
+        vmovupd(mem(rax,  r9,  1, 0), zmm9)    // row 9
+        vmovupd(mem(rax,  rcx, 2, 0), zmm10)   // row 10
+        vmovupd(mem(rax,  r15, 1, 0), zmm11)   // row 11
+
+        TRANSPOSE(0, 1, 2,  3)
+        TRANSPOSE(4, 5, 6,  7)
+        TRANSPOSE(8, 9, 10, 11)
+
+        add(r14, rax)                      // a += 4*lda;
+
+        vmovupd(zmm0, mem(rbx, 0*64))
+        vmovupd(zmm4, mem(rbx, 1*64))
+        vmovupd(zmm8, mem(rbx, 2*64))
+        vmovupd(zmm1, mem(rbx, 3*64))
+        vmovupd(zmm5, mem(rbx, 4*64))
+        vmovupd(zmm9, mem(rbx, 5*64))
+        vmovupd(zmm2, mem(rbx, 6*64))
+        vmovupd(zmm6, mem(rbx, 7*64))
+        vmovupd(zmm10, mem(rbx, 8*64))
+        vmovupd(zmm3, mem(rbx, 9*64))
+        vmovupd(zmm7, mem(rbx, 10*64))
+        vmovupd(zmm11, mem(rbx, 11*64))
+
+        add(imm(4*12*16), rbx)              // p += 4*ldp, with ldp == 12;
+
+        dec(rsi)                           // i -= 1;
+
+        jne(.ZKITERROWU)                   // iterate again if i != 0.
+
+        label(.ZCONKLEFTROWU)
+
+        mov(var(k_left), rsi)              // i = k_left;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZDONE)                         // if i == 0, we're done; jump to end.
+                                           // else, we prepare to enter k_left loop.
+
+        label(.ZKLEFTROWU)                 // EDGE LOOP (k_left)
+
+        vmovupd(mem(rax,          0), xmm0)    // one dcomplex from each of
+        vmovupd(mem(rax,  r8,  1, 0), xmm1)    // the 12 rows
+        vmovupd(mem(rax,  r8,  2, 0), xmm2)
+        vmovupd(mem(rax,  r12, 1, 0), xmm3)
+        vmovupd(mem(rax,  r8,  4, 0), xmm4)
+        vmovupd(mem(rax,  rcx, 1, 0), xmm5)
+        vmovupd(mem(rax,  r12, 2, 0), xmm6)
+        vmovupd(mem(rax,  rdx, 1, 0), xmm7)
+        vmovupd(mem(rax,  r8,  8, 0), xmm8)
+        vmovupd(mem(rax,  r9,  1, 0), xmm9)
+        vmovupd(mem(rax,  rcx, 2, 0), xmm10)
+        vmovupd(mem(rax,  r15, 1, 0), xmm11)
+
+        add(r10, rax)                      // a += lda;
+
+        vmovupd(xmm0, mem(rbx))
+        vmovupd(xmm1, mem(rbx,   1*16))
+        vmovupd(xmm2, mem(rbx,   2*16))
+        vmovupd(xmm3, mem(rbx,   3*16))
+        vmovupd(xmm4, mem(rbx,   4*16))
+        vmovupd(xmm5, mem(rbx,   5*16))
+        vmovupd(xmm6, mem(rbx,   6*16))
+        vmovupd(xmm7, mem(rbx,   7*16))
+        vmovupd(xmm8, mem(rbx,   8*16))
+        vmovupd(xmm9, mem(rbx,   9*16))
+        vmovupd(xmm10, mem(rbx, 10*16))
+        vmovupd(xmm11, mem(rbx, 11*16))
+
+        add(imm(12*16), rbx)                // p += ldp, with ldp == 12;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKLEFTROWU)                   // iterate again if i != 0.
+
+        jmp(.ZDONE)                        // jump to end.
+
+        // --column storage on A --------------------------------------
+
+        label(.ZCOLUNIT)
+
+        lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
+
+        mov(var(k_iter), rsi)              // i = k_iter;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZCONKLEFTCOLU)                 // if i == 0, jump to code that
+                                           // contains the k_left loop.
+
+        label(.ZKITERCOLU)                 // MAIN LOOP (k_iter)
+
+        vmovupd(mem(rax,          0),  zmm0)   // column 0: 3 x 64 bytes
+        vmovupd(mem(rax,         64),  zmm1)
+        vmovupd(mem(rax,         128), zmm2)
+        vmovupd(zmm0, mem(rbx,  0))
+        vmovupd(zmm1, mem(rbx, 64))
+        vmovupd(zmm2, mem(rbx, 128))
+
+        vmovupd(mem(rax, r10, 1,  0),  zmm3)   // column 1
+        vmovupd(mem(rax, r10, 1, 64),  zmm4)
+        vmovupd(mem(rax, r10, 1, 128), zmm5)
+        vmovupd(zmm3, mem(rbx, 1*192+ 0))
+        vmovupd(zmm4, mem(rbx, 1*192+64))
+        vmovupd(zmm5, mem(rbx, 1*192+128))
+
+        vmovupd(mem(rax, r10, 2,  0),  zmm6)   // column 2
+        vmovupd(mem(rax, r10, 2, 64),  zmm7)
+        vmovupd(mem(rax, r10, 2, 128), zmm8)
+        vmovupd(zmm6, mem(rbx, 2*192+ 0))
+        vmovupd(zmm7, mem(rbx, 2*192+64))
+        vmovupd(zmm8, mem(rbx, 2*192+128))
+
+        vmovupd(mem(rax, r13, 1,  0),  zmm9)   // column 3
+        vmovupd(mem(rax, r13, 1, 64),  zmm10)
+        vmovupd(mem(rax, r13, 1, 128), zmm11)
+        add(r14, rax)                      // a += 4*lda;
+        vmovupd(zmm9,  mem(rbx, 3*192+ 0))
+        vmovupd(zmm10, mem(rbx, 3*192+64))
+        vmovupd(zmm11, mem(rbx, 3*192+128))
+        add(imm(4*12*16), rbx)               // p += 4*ldp, with ldp == 12;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKITERCOLU)                   // iterate again if i != 0.
+
+        label(.ZCONKLEFTCOLU)
+
+        mov(var(k_left), rsi)              // i = k_left;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZDONE)                         // if i == 0, we're done; jump to end.
+                                           // else, we prepare to enter k_left loop.
+        label(.ZKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+        vmovupd(mem(rax,   0), zmm0)       // one column of 12 dcomplex
+        vmovupd(mem(rax,  64), zmm1)
+        vmovupd(mem(rax, 128), zmm2)
+        add(r10, rax)                      // a += lda;
+
+        vmovupd(zmm0, mem(rbx,  0))
+        vmovupd(zmm1, mem(rbx, 64))
+        vmovupd(zmm2, mem(rbx, 128))
+
+        add(imm(12*16), rbx)                // p += ldp, with ldp == 12;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKLEFTCOLU)                   // iterate again if i != 0.
+
+        label(.ZDONE)
+
+        end_asm(
+        : // output operands (none)
+        : // input operands
+          [k_iter] "m" (k_iter),
+          [k_left] "m" (k_left),
+          [a]      "m" (a),
+          [inca]   "m" (inca),
+          [lda]    "m" (lda),
+          [p]      "m" (p),
+          [ldp]    "m" (ldp),
+          [k0]     "m" (k0)
+        : // register clobber list
+          "rax", "rbx", "rcx", "rdx", "rsi",
+          "r8", "r9", "r10", "r12", "r13", "r14", "r15",
+          "xmm0", "xmm1", "xmm2", "xmm3",
+          "xmm4", "xmm5", "xmm6", "xmm7",
+          "xmm8", "xmm9", "xmm10", "xmm11",
+          "zmm0", "zmm1", "zmm2", "zmm3",
+          "zmm4", "zmm5", "zmm6", "zmm7",
+          "zmm8", "zmm9", "zmm10", "zmm11",
+          "zmm12", "zmm13", "zmm14", "zmm15",
+          "memory"
+        )
+    }
+    else // if ( cdim0 < mnr || gs || bli_does_conj( conja ) || !unitk )
+    {
+        PASTEMAC(zscal2m,BLIS_TAPI_EX_SUF)
+        (
+          0,
+          BLIS_NONUNIT_DIAG,
+          BLIS_DENSE,
+          ( trans_t )conja,
+          cdim0,
+          k0,
+          kappa,
+          a, inca0, lda0,
+          p,     1, ldp0,
+          cntx,
+          NULL
+        );
+
+        if ( cdim0 < mnr )
+        {
+            // Handle zero-filling along the "long" edge of the micropanel:
+            // rows cdim0..mnr-1 of all k0_max packed columns.
+            const dim_t        i      = cdim0;
+            const dim_t        m_edge = mnr - cdim0;
+            const dim_t        n_edge = k0_max;
+            dcomplex* restrict p_edge = p + (i  )*1;
+
+            bli_zset0s_mxn
+            (
+              m_edge,
+              n_edge,
+              p_edge, 1, ldp
+            );
+        }
+    }
+
+    if ( k0 < k0_max )
+    {
+        // Handle zero-filling along the "short" (far) edge of the micropanel:
+        // all mnr rows of packed columns k0..k0_max-1.
+        const dim_t        j      = k0;
+        const dim_t        m_edge = mnr;
+        const dim_t        n_edge = k0_max - k0;
+        dcomplex* restrict p_edge = p + (j  )*ldp;
+
+        bli_zset0s_mxn
+        (
+          m_edge,
+          n_edge,
+          p_edge, 1, ldp
+        );
+    }
+}
+
diff --git a/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c b/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c
new file mode 100644
index 0000000000..02f2776c17
--- /dev/null
+++ b/kernels/zen4/1m/bli_packm_zen4_asm_z4xk.c
@@ -0,0 +1,306 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+/******************************************************/
+/* In-place 4x4 transpose of (real, imag) double      */
+/* pairs held in ZMM(R0)..ZMM(R3), one row each.      */
+/* ZMM4-ZMM7 are scratch; R0..R3 must not be 4-7.     */
+/* Input R0 = Ar0 Ai0 Ar1 Ai1 Ar2 Ai2 Ar3 Ai3         */
+/* Input R1 = Ar4 Ai4 Ar5 Ai5 Ar6 Ai6 Ar7 Ai7         */
+/* Input R2 = Ar8 Ai8 Ar9 Ai9 Ar10 Ai10 Ar11 Ai11     */
+/* Input R3 = Ar12 Ai12 Ar13 Ai13 Ar14 Ai14 Ar15 Ai15 */
+/* ZMM4 = Ar0 Ai0 Ar2 Ai2 Ar4 Ai4 Ar6 Ai6            */
+/* ZMM5 = Ar1 Ai1 Ar3 Ai3 Ar5 Ai5 Ar7 Ai7            */
+/* ZMM6 = Ar8 Ai8 Ar10 Ai10 Ar12 Ai12 Ar14 Ai14      */
+/* ZMM7 = Ar9 Ai9 Ar11 Ai11 Ar13 Ai13 Ar15 Ai15      */
+/* Output R0 = Ar0 Ai0 Ar4 Ai4 Ar8 Ai8 Ar12 Ai12      */
+/* Output R1 = Ar1 Ai1 Ar5 Ai5 Ar9 Ai9 Ar13 Ai13      */
+/* Output R2 = Ar2 Ai2 Ar6 Ai6 Ar10 Ai10 Ar14 Ai14    */
+/* Output R3 = Ar3 Ai3 Ar7 Ai7 Ar11 Ai11 Ar15 Ai15    */
+/******************************************************/
+#define TRANSPOSE(R0, R1, R2, R3) \
+    VSHUFF64X2(IMM(0x88), ZMM(R1), ZMM(R0), ZMM(4)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R1), ZMM(R0), ZMM(5)) \
+    VSHUFF64X2(IMM(0x88), ZMM(R3), ZMM(R2), ZMM(6)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R3), ZMM(R2), ZMM(7)) \
+    VSHUFF64X2(IMM(0x88), ZMM(6), ZMM(4), ZMM(R0)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(6), ZMM(4), ZMM(R2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(7), ZMM(5), ZMM(R1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(7), ZMM(5), ZMM(R3))
+
+void bli_zpackm_zen4_asm_4xk              // Pack a 4 x k0 dcomplex micropanel of A into P (unit row stride in P).
+     (
+       conj_t              conja,         // conjugation to apply; the asm path is taken only when none is requested
+       pack_t              schema,        // not referenced by this kernel
+       dim_t               cdim0,         // rows of A actually present; short panels are zero-padded up to 4 rows
+       dim_t               k0,            // number of columns of A to pack
+       dim_t               k0_max,        // columns of P to fill; columns k0..k0_max-1 are zeroed
+       dcomplex*  restrict kappa,         // scalar applied while packing; the asm path requires kappa == 1
+       dcomplex*  restrict a, inc_t inca0, inc_t lda0,  // source and its row / column strides (in elements)
+       dcomplex*  restrict p,              inc_t ldp0,  // destination and its column stride (in elements)
+       cntx_t*    restrict cntx           // forwarded to the reference scal2m fallback
+     )
+{
+    // This is the panel dimension assumed by the packm kernel.
+    const dim_t      mnr   = 4;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    const uint64_t k_iter = k0 / 4;
+    const uint64_t k_left = k0 % 4;
+
+    // NOTE: For the purposes of the comments in this packm kernel, we
+    // interpret inca and lda as rs_a and cs_a, respectively, and similarly
+    // interpret ldp as cs_p (with rs_p implicitly unit). Thus, when reading
+    // this packm kernel, you should think of the operation as packing an
+    // m x n micropanel, where m and n are tiny and large, respectively, and
+    // where elements of each column of the packed matrix P are contiguous.
+    // (This packm kernel can still be used to pack micropanels of matrix B
+    // in a gemm operation.)
+    const uint64_t inca   = inca0;
+    const uint64_t lda    = lda0;
+    const uint64_t ldp    = ldp0;
+
+    const bool     gs     = ( inca0 != 1 && lda0 != 1 );
+
+    // NOTE: If/when this kernel ever supports scaling by kappa within the
+    // assembly region, this constraint should be lifted.
+    const bool     unitk  = bli_zeq1( *kappa );
+    // The asm path hard-codes a P column stride of mnr elements, so it is only taken when ldp0 == mnr.
+    // -------------------------------------------------------------------------
+    if ( cdim0 == mnr && ldp0 == mnr && !gs && !conja && unitk )
+    {
+        begin_asm()
+
+        mov(var(a), rax)                   // load address of a.
+
+        mov(var(inca), r8)                 // load inca
+        mov(var(lda), r10)                 // load lda
+        lea(mem(   , r8,  2), r8)
+        lea(mem(   , r8,  8), r8)          // inca *= sizeof(dcomplex)
+        lea(mem(   , r10, 2), r10)
+        lea(mem(   , r10, 8), r10)         // lda *= sizeof(dcomplex)
+
+        mov(var(p), rbx)                   // load address of p.
+
+        lea(mem(   , r10, 4), r14)         // r14 = 4*lda
+
+        cmp(imm(16), r8)                   // set ZF if (16*inca) == 16.
+        jz(.ZCOLUNIT)                      // jump to column storage case
+
+        // -- row storage on A -----------------------------------------
+
+        label(.ZROWUNIT)
+
+        lea(mem(r8,  r8,  2), r12)         // r12 = 3*inca
+
+        mov(var(k_iter), rsi)              // i = k_iter;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZCONKLEFTROWU)                 // if i == 0, jump to code that
+                                           // contains the k_left loop.
+        label(.ZKITERROWU)                 // MAIN LOOP (k_iter)
+
+        vmovupd(mem(rax,         0), zmm0)
+        vmovupd(mem(rax,  r8, 1, 0), zmm1)
+        vmovupd(mem(rax,  r8, 2, 0), zmm2)
+        vmovupd(mem(rax, r12, 1, 0), zmm3)
+
+        TRANSPOSE(0, 1, 2, 3)
+
+        vmovupd(zmm0, mem(rbx, 0*64))
+        vmovupd(zmm1, mem(rbx, 1*64))
+        vmovupd(zmm2, mem(rbx, 2*64))
+        vmovupd(zmm3, mem(rbx, 3*64))
+
+        add(r14, rax)                      // a += 4*lda;
+
+        add(imm(4*4*16), rbx)              // p += 4*ldp = 4*4;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKITERROWU)                   // iterate again if i != 0.
+
+        label(.ZCONKLEFTROWU)
+
+        mov(var(k_left), rsi)              // i = k_left;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZDONE)                         // if i == 0, we're done; jump to end.
+                                           // else, we prepare to enter k_left loop.
+
+        label(.ZKLEFTROWU)                 // EDGE LOOP (k_left)
+
+        vmovups(mem(rax,         0), xmm0)
+        vmovups(mem(rax,  r8, 1, 0), xmm1)
+        vmovups(mem(rax,  r8, 2, 0), xmm2)
+        vmovups(mem(rax, r12, 1, 0), xmm3)
+
+        add(r10, rax)                      // a += lda;
+
+        vmovups(xmm0, mem(rbx, 0*16+0*64))
+        vmovups(xmm1, mem(rbx, 1*16+0*64))
+        vmovups(xmm2, mem(rbx, 2*16+0*64))
+        vmovups(xmm3, mem(rbx, 3*16+0*64))
+
+        add(imm(4*16), rbx)                // p += ldp = 4;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKLEFTROWU)                   // iterate again if i != 0.
+
+        jmp(.ZDONE)                        // jump to end.
+
+        // -- column storage on A --------------------------------------
+
+        label(.ZCOLUNIT)
+
+        lea(mem(r10, r10, 2), r13)         // r13 = 3*lda
+
+        mov(var(k_iter), rsi)              // i = k_iter;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZCONKLEFTCOLU)                 // if i == 0, jump to code that
+                                           // contains the k_left loop.
+
+        label(.ZKITERCOLU)                 // MAIN LOOP (k_iter)
+
+        vmovupd(mem(rax,          0), zmm0)
+        vmovupd(zmm0, mem(rbx, 0*64))
+
+        vmovupd(mem(rax, r10, 1,  0), zmm1)
+        vmovupd(zmm1, mem(rbx, 1*64))
+
+        vmovupd(mem(rax, r10, 2,  0), zmm2)
+        vmovupd(zmm2, mem(rbx, 2*64))
+
+        vmovupd(mem(rax, r13, 1,  0), zmm3)
+        add(r14, rax)                      // a += 4*lda;
+        vmovupd(zmm3, mem(rbx, 3*64))
+        add(imm(4*4*16), rbx)               // p += 4*ldp = 4*4;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKITERCOLU)                   // iterate again if i != 0.
+
+        label(.ZCONKLEFTCOLU)
+
+        mov(var(k_left), rsi)              // i = k_left;
+        test(rsi, rsi)                     // check i via logical AND.
+        je(.ZDONE)                         // if i == 0, we're done; jump to end.
+                                           // else, we prepare to enter k_left loop.
+
+        label(.ZKLEFTCOLU)                 // EDGE LOOP (k_left)
+
+        vmovupd(mem(rax,          0), zmm0)
+        add(r10, rax)                      // a += lda;
+        vmovupd(zmm0, mem(rbx))
+        add(imm(4*16), rbx)                // p += ldp = 4;
+
+        dec(rsi)                           // i -= 1;
+        jne(.ZKLEFTCOLU)                   // iterate again if i != 0.
+
+        label(.ZDONE)
+
+        end_asm(
+        : // output operands (none)
+        : // input operands
+          [k_iter] "m" (k_iter),
+          [k_left] "m" (k_left),
+          [a]      "m" (a),
+          [inca]   "m" (inca),
+          [lda]    "m" (lda),
+          [p]      "m" (p),
+          [ldp]    "m" (ldp)
+        : // register clobber list
+          "rax", "rbx", "rcx", "rdx", "rsi",
+          "r8", "r10", "r12", "r13", "r14",
+          "xmm0", "xmm1", "xmm2", "xmm3",
+          "zmm0", "zmm1", "zmm2", "zmm3",
+          "zmm4", "zmm5", "zmm6", "zmm7",
+          "memory"
+        )
+    }
+    else // if ( cdim0 < mnr || ldp0 != mnr || gs || bli_does_conj( conja ) || !unitk )
+    {
+        PASTEMAC(zscal2m,BLIS_TAPI_EX_SUF)
+        (
+          0,
+          BLIS_NONUNIT_DIAG,
+          BLIS_DENSE,
+          ( trans_t )conja,
+          cdim0,
+          k0,
+          kappa,
+          a, inca0, lda0,
+          p,     1, ldp0,
+          cntx,
+          NULL
+        );
+
+        if ( cdim0 < mnr )
+        {
+            // Handle zero-filling along the "long" edge of the micropanel.
+
+            const dim_t        i      = cdim0;
+            const dim_t        m_edge = mnr - cdim0;
+            const dim_t        n_edge = k0_max;
+            dcomplex* restrict p_edge = p + (i  )*1;
+
+            bli_zset0s_mxn
+            (
+              m_edge,
+              n_edge,
+              p_edge, 1, ldp
+            );
+        }
+    }
+
+    if ( k0 < k0_max )
+    {
+        // Handle zero-filling along the "short" (far) edge of the micropanel.
+
+        const dim_t        j      = k0;
+        const dim_t        m_edge = mnr;
+        const dim_t        n_edge = k0_max - k0;
+        dcomplex* restrict p_edge = p + (j  )*ldp;
+
+        bli_zset0s_mxn
+        (
+          m_edge,
+          n_edge,
+          p_edge, 1, ldp
+        );
+    }
+}
+
diff --git a/kernels/zen4/3/CMakeLists.txt b/kernels/zen4/3/CMakeLists.txt
index 381204ae68..0b38920998 100644
--- a/kernels/zen4/3/CMakeLists.txt
+++ b/kernels/zen4/3/CMakeLists.txt
@@ -1,7 +1,20 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.##
 
-target_sources("${PROJECT_NAME}" 
-     PRIVATE
+add_library(zen4_3
+     OBJECT  # separate object library, so the /arch flags set on zen4_3 below cover only these kernels
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen_16x14.c
     ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_zen_16x14.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_l_zen4_8x24.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmtrsm_u_zen4_8x24.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_32x6.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemm_zen4_asm_8x24.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_trsm_small_AVX512.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/bli_zgemm_zen4_asm_12x4.c
     )
+
+target_compile_options(zen4_3 PRIVATE /arch:AVX2 /arch:AVX512)  # MSVC-style ISA switches, applied to this target only
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen4_3 PUBLIC BLIS_IS_BUILDING_LIBRARY)  # defined only for shared-library builds
+endif()
+
+add_subdirectory(sup)  # process the sup/ subdirectory
diff --git a/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c b/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c
new file mode 100644
index 0000000000..c20c0ab898
--- /dev/null
+++ b/kernels/zen4/3/bli_dgemm_zen4_asm_32x6.c
@@ -0,0 +1,483 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022 - 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "bli_x86_asm_macros.h"
+
+#define TAIL_NITER 5 // passes of the last 4x-unrolled loop (LOOP3), run after the C-prefetch loop,
+                     // in units of 4 k iterations; e.g. 5 -> 4*5 k iterations ~= 280 cycles
+
+
+/*
+ * A Registers:  ZMM0, ZMM1, ZMM2, ZMM3
+ * B Registers:  ZMM4, ZMM5 (the disabled "#if 0" SUBITER variant uses ZMM4 and XMM5-XMM7)
+ * C Registers:  ZMM[8-31]
+ */
+
+#define LOOP_ALIGN ALIGN32 // ALIGN32 is emitted immediately before every loop label in the kernel below
+
+#define UPDATE_C(R1,R2,R3,R4) \
+    /* Rn *= ZMM0 (alpha); Rn += ZMM1 (beta) times C; store 4 x 64 bytes at [RCX]; RCX += RBX (cs_c in bytes). */ \
+    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
+    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
+    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
+    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
+    VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX)) \
+    VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,64)) \
+    VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,128)) \
+    VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,192)) \
+    VMOVUPD(MEM(RCX), ZMM(R1)) \
+    VMOVUPD(MEM(RCX,64), ZMM(R2)) \
+    VMOVUPD(MEM(RCX,128), ZMM(R3)) \
+    VMOVUPD(MEM(RCX,192), ZMM(R4)) \
+    LEA(RCX, MEM(RCX,RBX,1))
+
+#define UPDATE_C_BZ(R1,R2,R3,R4) \
+    /* beta == 0 variant: Rn *= ZMM0 (alpha), then store 4 x 64 bytes at [RCX] without reading C; RCX += RBX. */ \
+    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
+    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
+    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
+    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
+    VMOVUPD(MEM(RCX), ZMM(R1)) \
+    VMOVUPD(MEM(RCX,64), ZMM(R2)) \
+    VMOVUPD(MEM(RCX,128), ZMM(R3)) \
+    VMOVUPD(MEM(RCX,192), ZMM(R4)) \
+    LEA(RCX, MEM(RCX,RBX,1))
+
+
+#define UPDATE_C_COL_SCATTERED(R1,R2,R3,R4) \
+    /* Strided C: gather C at [RCX + byte offsets in ZMM4-7], Rn += gathered C times ZMM1 (beta), scatter Rn back; RCX += RBX. K1-K4 are set to all ones before each gather/scatter pair, since gather/scatter clear their mask. */ \
+    KXNORW(K(1), K(0), K(0)) \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    KXNORW(K(4), K(0), K(0)) \
+    VGATHERQPD(ZMM(0) MASK_K(1), MEM(RCX,ZMM(4),1)) \
+    VFMADD231PD(ZMM(R1), ZMM(0), ZMM(1)) \
+    VGATHERQPD(ZMM(0) MASK_K(2), MEM(RCX,ZMM(5),1)) \
+    VFMADD231PD(ZMM(R2), ZMM(0), ZMM(1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(4),1) MASK_K(3), ZMM(R1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(5),1) MASK_K(4), ZMM(R2)) \
+    KXNORW(K(1), K(0), K(0)) \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    KXNORW(K(4), K(0), K(0)) \
+    VGATHERQPD(ZMM(0) MASK_K(1), MEM(RCX,ZMM(6),1)) \
+    VFMADD231PD(ZMM(R3), ZMM(0), ZMM(1)) \
+    VGATHERQPD(ZMM(0) MASK_K(2), MEM(RCX,ZMM(7),1)) \
+    VFMADD231PD(ZMM(R4), ZMM(0), ZMM(1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(6),1) MASK_K(3), ZMM(R3))  \
+    VSCATTERQPD(MEM(RCX,ZMM(7),1) MASK_K(4), ZMM(R4))  \
+    LEA(RCX, MEM(RCX,RBX,1))
+
+#define UPDATE_C_BZ_COL_SCATTERED(R1,R2,R3,R4) \
+    /* beta == 0, strided C: set K1-K4 to all ones and scatter Rn to [RCX + byte offsets in ZMM4-7] without reading C; RCX += RBX. */ \
+    KXNORW(K(1), K(0), K(0)) \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    KXNORW(K(4), K(0), K(0)) \
+    VSCATTERQPD(MEM(RCX,ZMM(4),1) MASK_K(1), ZMM(R1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(5),1) MASK_K(2), ZMM(R2)) \
+    VSCATTERQPD(MEM(RCX,ZMM(6),1) MASK_K(3), ZMM(R3)) \
+    VSCATTERQPD(MEM(RCX,ZMM(7),1) MASK_K(4), ZMM(R4)) \
+    LEA(RCX, MEM(RCX,RBX,1))
+
+
+#if 0 /* disabled alternative SUBITER that loads B as 128-bit pairs; not compiled */
+#define SUBITER(n) \
+\
+    VMOVUPD(XMM( 5), MEM(RBX,(6*n+ 0)*8)) \
+    \
+    VBROADCASTSD(ZMM( 4), XMM( 5)) \
+    VPERMILPD(XMM( 5), XMM( 5), IMM(3)) \
+    VFMADD231PD(ZMM( 8), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM( 9), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(10), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(11), ZMM(3), ZMM(4)) \
+    \
+    VBROADCASTSD(ZMM( 4), XMM( 5)) \
+    VMOVUPD(XMM( 6), MEM(RBX,(6*n+ 2)*8)) \
+    VFMADD231PD(ZMM(12), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(13), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(14), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(15), ZMM(3), ZMM(4)) \
+    \
+    VBROADCASTSD(ZMM( 4), XMM( 6)) \
+    VPERMILPD(XMM( 6), XMM( 6), IMM(3)) \
+    VFMADD231PD(ZMM(16), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(17), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(18), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(19), ZMM(3), ZMM(4)) \
+    \
+    VBROADCASTSD(ZMM( 4), XMM( 6)) \
+    VMOVUPD(XMM( 7), MEM(RBX,(6*n+ 4)*8)) \
+    VFMADD231PD(ZMM(20), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(21), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(22), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(23), ZMM(3), ZMM(4)) \
+    \
+    VBROADCASTSD(ZMM( 4), XMM( 7)) \
+    VPERMILPD(XMM( 7), XMM( 7), IMM(3)) \
+    VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(26), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(27), ZMM(3), ZMM(4)) \
+    \
+    VBROADCASTSD(ZMM( 4), XMM( 7)) \
+    VFMADD231PD(ZMM(28), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(29), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(30), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(31), ZMM(3), ZMM(4)) \
+    \
+    VMOVAPD(ZMM(0), MEM(RAX,(32*n+ 0)*8)) \
+    VMOVAPD(ZMM(1), MEM(RAX,(32*n+ 8)*8)) \
+    VMOVAPD(ZMM(2), MEM(RAX,(32*n+16)*8)) \
+    VMOVAPD(ZMM(3), MEM(RAX,(32*n+24)*8))
+
+
+#else /* active SUBITER: one VBROADCASTSD from memory per B element */
+
+#define SUBITER(n) \
+    /* One k step: ZMM(8+4j)..ZMM(11+4j) += ZMM0..ZMM3 times b[6*n+j] for j = 0..5, with b read relative to RBX. */ \
+    VBROADCASTSD(ZMM( 4), MEM(RBX,(6*n+ 0)*8)) \
+    VBROADCASTSD(ZMM( 5), MEM(RBX,(6*n+ 1)*8)) \
+    VFMADD231PD(ZMM( 8), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM( 9), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(10), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(11), ZMM(3), ZMM(4)) \
+    /* Fetch b[6*n+2] into ZMM4; ZMM5 (b[6*n+1]) updates ZMM12-15. */ \
+    VBROADCASTSD(ZMM( 4), MEM(RBX,(6*n+ 2)*8)) \
+    VFMADD231PD(ZMM(12), ZMM(0), ZMM(5)) \
+    VFMADD231PD(ZMM(13), ZMM(1), ZMM(5)) \
+    VFMADD231PD(ZMM(14), ZMM(2), ZMM(5)) \
+    VFMADD231PD(ZMM(15), ZMM(3), ZMM(5)) \
+    /* Fetch b[6*n+3] into ZMM5; ZMM4 (b[6*n+2]) updates ZMM16-19. */ \
+    VBROADCASTSD(ZMM( 5), MEM(RBX,(6*n+ 3)*8)) \
+    VFMADD231PD(ZMM(16), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(17), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(18), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(19), ZMM(3), ZMM(4)) \
+    /* Fetch b[6*n+4] into ZMM4; ZMM5 (b[6*n+3]) updates ZMM20-23. */ \
+    VBROADCASTSD(ZMM( 4), MEM(RBX,(6*n+ 4)*8)) \
+    VFMADD231PD(ZMM(20), ZMM(0), ZMM(5)) \
+    VFMADD231PD(ZMM(21), ZMM(1), ZMM(5)) \
+    VFMADD231PD(ZMM(22), ZMM(2), ZMM(5)) \
+    VFMADD231PD(ZMM(23), ZMM(3), ZMM(5)) \
+    /* Fetch b[6*n+5] into ZMM5; ZMM4 (b[6*n+4]) updates ZMM24-27. */ \
+    VBROADCASTSD(ZMM( 5), MEM(RBX,(6*n+ 5)*8)) \
+    VFMADD231PD(ZMM(24), ZMM(0), ZMM(4)) \
+    VFMADD231PD(ZMM(25), ZMM(1), ZMM(4)) \
+    VFMADD231PD(ZMM(26), ZMM(2), ZMM(4)) \
+    VFMADD231PD(ZMM(27), ZMM(3), ZMM(4)) \
+    /* ZMM5 (b[6*n+5]) updates ZMM28-31. */ \
+    VFMADD231PD(ZMM(28), ZMM(0), ZMM(5)) \
+    VFMADD231PD(ZMM(29), ZMM(1), ZMM(5)) \
+    VFMADD231PD(ZMM(30), ZMM(2), ZMM(5)) \
+    VFMADD231PD(ZMM(31), ZMM(3), ZMM(5)) \
+    /* Reload ZMM0-3 with the 32 doubles of A at byte offset 32*n*8 from RAX, for the step that follows. */ \
+    VMOVAPD(ZMM(0), MEM(RAX,(32*n+ 0)*8)) \
+    VMOVAPD(ZMM(1), MEM(RAX,(32*n+ 8)*8)) \
+    VMOVAPD(ZMM(2), MEM(RAX,(32*n+16)*8)) \
+    VMOVAPD(ZMM(3), MEM(RAX,(32*n+24)*8))
+#endif
+
+// Read-only row indices 0..31; bli_dgemm_zen4_asm_32x6 multiplies them by rs_c (bytes) to form gather/scatter offsets.
+static const int64_t offsets[32] __attribute__((aligned(64))) =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
+     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+
+void bli_dgemm_zen4_asm_32x6(             // C := beta*C + alpha*A*B for one 32x6 double-precision micro-tile
+                              dim_t            k_,     // number of rank-1 updates (inner dimension)
+                              double* restrict alpha,  // scalar applied to the A*B product
+                              double* restrict a,      // packed A: 32 doubles per k step, read with aligned loads
+                              double* restrict b,      // packed B: 6 doubles per k step
+                              double* restrict beta,   // scalar applied to C; when beta == 0, C is not read
+                              double* restrict c, inc_t rs_c_, inc_t cs_c_,  // C and its row / column strides (in elements)
+                              auxinfo_t*       data,   // unused
+                              cntx_t* restrict cntx    // unused
+                            )
+{
+    (void)data;
+    (void)cntx;
+
+    const int64_t* offsetPtr = &offsets[0];
+    const int64_t k = k_;
+    const int64_t rs_c = rs_c_*8; // stride in bytes
+    const int64_t cs_c = cs_c_*8; // stride in bytes
+
+    BEGIN_ASM()
+    // Zero registers 4-31; registers 8-31 are the 32x6 accumulator tile.
+    VXORPD(YMM( 4), YMM( 4), YMM( 4)) //clear out registers
+    VXORPD(YMM( 5), YMM( 5), YMM( 5)) //clear out registers
+    VMOVAPD(YMM(6) , YMM(4))
+    VMOVAPD(YMM(7) , YMM(4))
+    VMOVAPD(YMM(8) , YMM(4))
+    VMOVAPD(YMM(9) , YMM(4))
+    VXORPD(YMM(10), YMM(10), YMM(10)) //clear out registers
+    VXORPD(YMM(11), YMM(11), YMM(11)) //clear out registers
+    VMOVAPD(YMM(12), YMM(4))
+    VMOVAPD(YMM(13), YMM(4))
+    VMOVAPD(YMM(14), YMM(4))
+    VMOVAPD(YMM(15), YMM(4))
+    VXORPD(YMM(16), YMM(16), YMM(16)) //clear out registers
+    VXORPD(YMM(17), YMM(17), YMM(17)) //clear out registers
+    VMOVAPD(YMM(18), YMM(4))
+    VMOVAPD(YMM(19), YMM(4))
+    VMOVAPD(YMM(20), YMM(4))
+    VMOVAPD(YMM(21), YMM(4))
+    VXORPD(YMM(22), YMM(22), YMM(22)) //clear out registers
+    VXORPD(YMM(23), YMM(23), YMM(23)) //clear out registers
+    VMOVAPD(YMM(24), YMM(4))
+    VMOVAPD(YMM(25), YMM(4))
+    VMOVAPD(YMM(26), YMM(4))
+    VMOVAPD(YMM(27), YMM(4))
+    VXORPD(YMM(28), YMM(28), YMM(28)) //clear out registers
+    VXORPD(YMM(29), YMM(29), YMM(29)) //clear out registers
+    VMOVAPD(YMM(30), YMM(4))
+    VMOVAPD(YMM(31), YMM(4))
+
+    MOV(RSI, VAR(k)) //loop index
+    MOV(RAX, VAR(a)) //load address of a
+    MOV(RBX, VAR(b)) //load address of b
+    MOV(RCX, VAR(c)) //load address of c
+
+    //LEA(R9, MEM(RCX,63)) // c for prefetching
+    MOV(R9, RCX)
+
+    VMOVAPD(ZMM(0), MEM(RAX, 0*8)) //pre-load a
+    VMOVAPD(ZMM(1), MEM(RAX, 8*8)) //pre-load a
+    VMOVAPD(ZMM(2), MEM(RAX,16*8)) //pre-load a
+    VMOVAPD(ZMM(3), MEM(RAX,24*8)) //pre-load a
+    LEA(RAX, MEM(RAX,32*8)) //adjust a for pre-load
+
+    MOV(R12, VAR(rs_c))
+    MOV(R10, VAR(cs_c))
+
+    MOV(RDI, RSI)
+    AND(RSI, IMM(3)) // RSI = k % 4 (single steps handled in TAIL_LOOP)
+    SAR(RDI, IMM(2)) // RDI = k / 4 (4x-unrolled passes)
+
+    SUB(RDI, IMM(6+TAIL_NITER)) // reserve 6 prefetch passes plus TAIL_NITER final passes
+    JLE(K_LE_80)
+
+        LOOP_ALIGN
+        LABEL(LOOP1) // bulk passes: no prefetch of C
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*32*8))
+            LEA(RBX, MEM(RBX,4*6*8))
+
+
+        JNZ(LOOP1)
+
+    LABEL(K_LE_80)
+
+    ADD(RDI, IMM(6))
+    JLE(K_LE_24)
+
+        LOOP_ALIGN
+        LABEL(LOOP2) // each pass prefetches 4 cache lines at R9, then R9 += cs_c
+
+            PREFETCH(0, MEM(R9))
+            SUBITER(0)
+            PREFETCH(0, MEM(R9,64))
+            SUBITER(1)
+            PREFETCH(0, MEM(R9,128))
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            PREFETCH(0, MEM(R9,192))
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*32*8))
+            LEA(RBX, MEM(RBX,4*6*8))
+            LEA(R9, MEM(R9,R10,1))
+
+        JNZ(LOOP2)
+
+    LABEL(K_LE_24)
+
+    ADD(RDI, IMM(0+TAIL_NITER))
+    JLE(TAIL)
+
+        LOOP_ALIGN
+        LABEL(LOOP3) // final 4x-unrolled passes, no prefetch
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*32*8))
+            LEA(RBX, MEM(RBX,4*6*8))
+
+        JNZ(LOOP3)
+
+    LABEL(TAIL)
+
+    TEST(RSI, RSI)
+    JZ(POSTACCUM)
+
+        LOOP_ALIGN
+        LABEL(TAIL_LOOP) // remaining k % 4 steps, one at a time
+
+            SUB(RSI, IMM(1))
+            SUBITER(0)
+            LEA(RAX, MEM(RAX,32*8))
+            LEA(RBX, MEM(RBX,6*8))
+
+        JNZ(TAIL_LOOP)
+
+    LABEL(POSTACCUM)
+
+    MOV(RAX, VAR(alpha))
+    MOV(RBX, VAR(beta))
+    VBROADCASTSD(ZMM(0), MEM(RAX)) // ZMM0 = alpha
+    VBROADCASTSD(ZMM(1), MEM(RBX)) // ZMM1 = beta
+
+    VXORPD(YMM(2), YMM(2), YMM(2)) // zero, compared against beta below
+
+    MOV(RAX, R12) // RAX = rs_c in bytes
+    MOV(RBX, R10) // RBX = cs_c in bytes
+
+    // Check whether C has unit row stride (rs_c == 8 bytes), i.e. contiguous columns.
+    CMP(RAX, IMM(8))
+    JNE(SCATTEREDUPDATE)
+
+        VCOMISD(XMM(1), XMM(2))
+        JE(COLSTORBZ)
+
+            UPDATE_C( 8, 9,10,11)
+            UPDATE_C(12,13,14,15)
+            UPDATE_C(16,17,18,19)
+            UPDATE_C(20,21,22,23)
+            UPDATE_C(24,25,26,27)
+            UPDATE_C(28,29,30,31)
+
+        JMP(END)
+        LABEL(COLSTORBZ)
+
+            UPDATE_C_BZ( 8, 9,10,11)
+            UPDATE_C_BZ(12,13,14,15)
+            UPDATE_C_BZ(16,17,18,19)
+            UPDATE_C_BZ(20,21,22,23)
+            UPDATE_C_BZ(24,25,26,27)
+            UPDATE_C_BZ(28,29,30,31)
+
+    JMP(END)
+    LABEL(SCATTEREDUPDATE)
+
+        VMULPD(ZMM( 8), ZMM( 8), ZMM(0))
+        VMULPD(ZMM( 9), ZMM( 9), ZMM(0))
+        VMULPD(ZMM(10), ZMM(10), ZMM(0))
+        VMULPD(ZMM(11), ZMM(11), ZMM(0))
+        VMULPD(ZMM(12), ZMM(12), ZMM(0))
+        VMULPD(ZMM(13), ZMM(13), ZMM(0))
+        VMULPD(ZMM(14), ZMM(14), ZMM(0))
+        VMULPD(ZMM(15), ZMM(15), ZMM(0))
+        VMULPD(ZMM(16), ZMM(16), ZMM(0))
+        VMULPD(ZMM(17), ZMM(17), ZMM(0))
+        VMULPD(ZMM(18), ZMM(18), ZMM(0))
+        VMULPD(ZMM(19), ZMM(19), ZMM(0))
+        VMULPD(ZMM(20), ZMM(20), ZMM(0))
+        VMULPD(ZMM(21), ZMM(21), ZMM(0))
+        VMULPD(ZMM(22), ZMM(22), ZMM(0))
+        VMULPD(ZMM(23), ZMM(23), ZMM(0))
+        VMULPD(ZMM(24), ZMM(24), ZMM(0))
+        VMULPD(ZMM(25), ZMM(25), ZMM(0))
+        VMULPD(ZMM(26), ZMM(26), ZMM(0))
+        VMULPD(ZMM(27), ZMM(27), ZMM(0))
+        VMULPD(ZMM(28), ZMM(28), ZMM(0))
+        VMULPD(ZMM(29), ZMM(29), ZMM(0))
+        VMULPD(ZMM(30), ZMM(30), ZMM(0))
+        VMULPD(ZMM(31), ZMM(31), ZMM(0))
+
+        VCOMISD(XMM(1), XMM(2)) // compare beta with zero; the result is consumed by JE(SCATTERBZ) below
+
+        MOV(RDI, VAR(offsetPtr))
+        VPBROADCASTQ(ZMM(0), RAX) // ZMM0 = rs_c (bytes) in every lane; alpha is no longer needed
+        VPMULLQ(ZMM(4), ZMM(0), MEM(RDI))
+        VPMULLQ(ZMM(5), ZMM(0), MEM(RDI,64))
+        VPMULLQ(ZMM(6), ZMM(0), MEM(RDI,128))
+        VPMULLQ(ZMM(7), ZMM(0), MEM(RDI,192))
+
+        JE(SCATTERBZ)
+
+            UPDATE_C_COL_SCATTERED( 8, 9,10,11)
+            UPDATE_C_COL_SCATTERED(12,13,14,15)
+            UPDATE_C_COL_SCATTERED(16,17,18,19)
+            UPDATE_C_COL_SCATTERED(20,21,22,23)
+            UPDATE_C_COL_SCATTERED(24,25,26,27)
+            UPDATE_C_COL_SCATTERED(28,29,30,31)
+
+        JMP(END)
+        LABEL(SCATTERBZ)
+
+            UPDATE_C_BZ_COL_SCATTERED( 8, 9,10,11)
+            UPDATE_C_BZ_COL_SCATTERED(12,13,14,15)
+            UPDATE_C_BZ_COL_SCATTERED(16,17,18,19)
+            UPDATE_C_BZ_COL_SCATTERED(20,21,22,23)
+            UPDATE_C_BZ_COL_SCATTERED(24,25,26,27)
+            UPDATE_C_BZ_COL_SCATTERED(28,29,30,31)
+
+    LABEL(END)
+
+    VZEROUPPER()
+
+    END_ASM
+    (
+        : // output operands
+        : // input operands
+          [k]         "m" (k),
+          [a]         "m" (a),
+          [b]         "m" (b),
+          [alpha]     "m" (alpha),
+          [beta]      "m" (beta),
+          [c]         "m" (c),
+          [rs_c]      "m" (rs_c),
+          [cs_c]      "m" (cs_c),
+          [offsetPtr] "m" (offsetPtr)
+        : // register clobber list (k1-k4 are written by the scattered-update macros)
+          "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
+          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
+          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31", "k1", "k2", "k3", "k4", "memory"
+    )
+}
diff --git a/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c b/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c
new file mode 100644
index 0000000000..1f133dfc15
--- /dev/null
+++ b/kernels/zen4/3/bli_dgemm_zen4_asm_8x24.c
@@ -0,0 +1,712 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "bli_x86_asm_macros.h"
+// BLIS_ASM_SYNTAX_INTEL syntax is followed
+
+#define TAIL_NITER 5
+
+#define LOOP_ALIGN ALIGN32
+
+// Update C when C is general stored: gather one row of C, add beta*C to ZMM(R1..R3), scatter the row back
+#define UPDATE_C_SCATTERED(R1,R2,R3) \
+\
+    KXNORW(K(1), K(0), K(0)) /* k1..k3 = ~(k0 ^ k0): every mask bit set */ \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    VGATHERQPD(ZMM(0) MASK_K(1), MEM(RCX,ZMM(2),1)) /* zmm0 = C at RCX + byte offsets held in zmm2 */ \
+    /*scale by beta*/ \
+    VFMADD231PD(ZMM(R1), ZMM(0), ZMM(1)) /* R1 += zmm0 * zmm1 (gathered C times beta) */ \
+    VGATHERQPD(ZMM(0) MASK_K(2), MEM(RCX,ZMM(3),1)) \
+    VFMADD231PD(ZMM(R2), ZMM(0), ZMM(1)) \
+    VGATHERQPD(ZMM(0) MASK_K(3), MEM(RCX,ZMM(4),1)) \
+    VFMADD231PD(ZMM(R3), ZMM(0), ZMM(1)) \
+    KXNORW(K(1), K(0), K(0)) /* set the masks again: a gather clears its mask as elements complete */ \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    /*store c*/ \
+    VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R2)) \
+    VSCATTERQPD(MEM(RCX,ZMM(4),1) MASK_K(3), ZMM(R3))  \
+    LEA(RCX, MEM(RCX,R12,1)) /* RCX += R12 (the kernel loads rs_c into R12) */
+
+// Update C when C is general stored and beta = 0: scatter ZMM(R1..R3) to one row of C without reading C
+#define UPDATE_C_SCATTERED_BZ(R1,R2,R3) \
+\
+    KXNORW(K(1), K(0), K(0)) /* k1..k3 = ~(k0 ^ k0): every mask bit set */ \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R1)) /* store to RCX + byte offsets held in zmm2 */ \
+    VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R2)) \
+    VSCATTERQPD(MEM(RCX,ZMM(4),1) MASK_K(3), ZMM(R3))  \
+    LEA(RCX, MEM(RCX,R12,1)) /* RCX += R12 (the kernel loads rs_c into R12) */
+
+// 8x8 in register transpose, used for column stored C. ZMM0-ZMM7 are scratch; on exit R0..R7 hold columns 0..7.
+#define TRANSPOSE_8X8(R0, R1, R2, R3, R4, R5, R6, R7) \
+\
+    VUNPCKLPD(ZMM(6), ZMM(R0), ZMM(R1)) /* Stage1: interleave even elements of each row pair */ \
+    VUNPCKLPD(ZMM(7), ZMM(R2), ZMM(R3)) \
+    VUNPCKLPD(ZMM(2), ZMM(R4), ZMM(R5)) \
+    VUNPCKLPD(ZMM(3), ZMM(R6), ZMM(R7)) \
+    VMOVUPD(ZMM(0), ZMM(R0)) /* keep copies of R0 and R4: both are overwritten below */ \
+    VMOVUPD(ZMM(1), ZMM(R4)) /* but are still needed as VUNPCKHPD inputs */ \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0x88)) /* 0x88: 128-bit lanes 0 and 2 of each source */ \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0x88)) \
+    /*Stage3  1,5*/ \
+    VSHUFF64X2(ZMM(R0), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R4), ZMM(4), ZMM(5), IMM(0xDD)) /* 0xDD: 128-bit lanes 1 and 3 of each source */ \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0xDD)) \
+    /*Stage3  3,7*/ \
+    VUNPCKHPD(ZMM(6), ZMM(0 ), ZMM(R1)) /* Stage1 for the odd elements, using the saved R0 copy */ \
+    VUNPCKHPD(ZMM(7), ZMM(R2), ZMM(R3)) /* must read R2 before it is overwritten below */ \
+    VUNPCKHPD(ZMM(2), ZMM(1 ), ZMM(R5)) /* uses the saved R4 copy */ \
+    VUNPCKHPD(ZMM(3), ZMM(R6), ZMM(R7)) /* must read R6 before it is overwritten below */ \
+    VSHUFF64X2(ZMM(R2), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R6), ZMM(4), ZMM(5), IMM(0xDD)) \
+    \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0x88)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0x88)) \
+    /*Stage3  2,6*/ \
+    VSHUFF64X2(ZMM(R1), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R5), ZMM(4), ZMM(5), IMM(0xDD)) \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0xDD)) \
+    /*Stage3  4,8*/ \
+    VSHUFF64X2(ZMM(R3), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R7), ZMM(4), ZMM(5), IMM(0xDD)) \
+
+// Update C when C is column stored: R0..R7 are 8 columns; computes alpha*Rj + beta*C and stores column j at RCX + j*R10
+#define UPDATE_C_COL_STORE(R0, R1, R2, R3, R4, R5, R6, R7) \
+    \
+    /* scale by alpha */\
+    VMULPD(ZMM(R0), ZMM(R0), ZMM(0)) \
+    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
+    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
+    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
+    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
+    VMULPD(ZMM(R5), ZMM(R5), ZMM(0)) \
+    VMULPD(ZMM(R6), ZMM(R6), ZMM(0)) \
+    VMULPD(ZMM(R7), ZMM(R7), ZMM(0)) \
+    /*scale by beta*/\
+    VFMADD231PD(ZMM(R0), ZMM(1), MEM(RCX)) /* R0 += zmm1 * C */ \
+    /*store c*/ \
+    VMOVUPD(MEM(RCX), ZMM(R0)) \
+    VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX, R10, 1)) /* the kernel loads cs_c into R10 */ \
+    VMOVUPD(MEM(RCX, R10, 1), ZMM(R1)) \
+    VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX, R10, 2)) \
+    VMOVUPD(MEM(RCX, R10, 2), ZMM(R2)) \
+    VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX, R11, 1)) /* the kernel sets R11 = 3*cs_c */ \
+    VMOVUPD(MEM(RCX, R11, 1), ZMM(R3)) \
+    VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX, R10, 4)) \
+    VMOVUPD(MEM(RCX, R10, 4), ZMM(R4)) \
+    VFMADD231PD(ZMM(R5), ZMM(1), MEM(RCX, R12, 1)) /* the kernel sets R12 = 5*cs_c */ \
+    VMOVUPD(MEM(RCX, R12, 1), ZMM(R5)) \
+    VFMADD231PD(ZMM(R6), ZMM(1), MEM(RCX, R11, 2)) \
+    VMOVUPD(MEM(RCX, R11, 2), ZMM(R6)) \
+    VFMADD231PD(ZMM(R7), ZMM(1), MEM(RCX, R13, 1)) /* the kernel sets R13 = 7*cs_c */ \
+    VMOVUPD(MEM(RCX, R13, 1), ZMM(R7)) \
+    LEA(RCX, MEM(RCX,R10,8)) /* RCX += 8*R10: move on to the next 8 columns */
+
+// Update C when C is column stored and beta = 0: scale columns R0..R7 by alpha (ZMM0) and store column j at RCX + j*cs_c
+#define UPDATE_C_COL_STORE_BZ(R0, R1, R2, R3, R4, R5, R6, R7) \
+    /* scale by alpha */\
+    VMULPD(ZMM(R0), ZMM(R0), ZMM(0)) \
+    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
+    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
+    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
+    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
+    VMULPD(ZMM(R5), ZMM(R5), ZMM(0)) \
+    VMULPD(ZMM(R6), ZMM(R6), ZMM(0)) \
+    VMULPD(ZMM(R7), ZMM(R7), ZMM(0)) \
+    VMOVUPD(MEM(RCX), ZMM(R0)) /* store c; same addressing as UPDATE_C_COL_STORE, C is never read */ \
+    VMOVUPD(MEM(RCX, R10, 1), ZMM(R1)) /* R10 = cs_c */ \
+    VMOVUPD(MEM(RCX, R10, 2), ZMM(R2)) \
+    VMOVUPD(MEM(RCX, R11, 1), ZMM(R3)) /* R11 = 3*cs_c (set at COLUPDATE) */ \
+    VMOVUPD(MEM(RCX, R10, 4), ZMM(R4)) \
+    VMOVUPD(MEM(RCX, R12, 1), ZMM(R5)) /* R12 = 5*cs_c (set at COLUPDATE) */ \
+    VMOVUPD(MEM(RCX, R11, 2), ZMM(R6)) \
+    VMOVUPD(MEM(RCX, R13, 1), ZMM(R7)) /* R13 = 7*cs_c (set at COLUPDATE) */ \
+    LEA(RCX, MEM(RCX,R10,8)) /* RCX += 8*cs_c: move on to the next 8 columns */
+
+#define SUBITER(n) \
+    /* One k step: 8 broadcast values of A times 24 values of B (zmm0-2), accumulated into zmm8-zmm31 */ \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 0)*8)) /* zmm6 = A element 0 of step n */ \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 1)*8)) /* zmm7 = A element 1; zmm6/zmm7 alternate below */ \
+    VFMADD231PD(ZMM( 8), ZMM(0), ZMM(6)) /* row 0 of the tile: zmm8..zmm10 += B * A[0] */ \
+    VFMADD231PD(ZMM( 9), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(10), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 2)*8)) \
+    VFMADD231PD(ZMM(11), ZMM(0), ZMM(7)) /* row 1 */ \
+    VFMADD231PD(ZMM(12), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(13), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 3)*8)) \
+    VFMADD231PD(ZMM(14), ZMM(0), ZMM(6)) /* row 2 */ \
+    VFMADD231PD(ZMM(15), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(16), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 4)*8)) \
+    VFMADD231PD(ZMM(17), ZMM(0), ZMM(7)) /* row 3 */ \
+    VFMADD231PD(ZMM(18), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(19), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 5)*8)) \
+    VFMADD231PD(ZMM(20), ZMM(0), ZMM(6)) /* row 4 */ \
+    VFMADD231PD(ZMM(21), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(22), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 6)*8)) \
+    VFMADD231PD(ZMM(23), ZMM(0), ZMM(7)) /* row 5 */ \
+    VFMADD231PD(ZMM(24), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(25), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 7)*8)) \
+    VFMADD231PD(ZMM(26), ZMM(0), ZMM(6)) /* row 6 */ \
+    VFMADD231PD(ZMM(27), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(28), ZMM(2), ZMM(6)) \
+    \
+    VFMADD231PD(ZMM(29), ZMM(0), ZMM(7)) /* row 7 */ \
+    VFMADD231PD(ZMM(30), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(31), ZMM(2), ZMM(7)) \
+    \
+    VMOVAPD(ZMM(0), MEM(RBX,(24*n+0)*8)) /* load the 24 values of B for the following step; */ \
+    VMOVAPD(ZMM(1), MEM(RBX,(24*n+8)*8)) /* the kernel pre-loads the first 24 and advances RBX past them, */ \
+    VMOVAPD(ZMM(2), MEM(RBX,(24*n+16)*8)) /* so offset 24*n already points one step ahead */ \
+
+// Element indices 0..23; the kernel multiplies them by the byte stride cs_c to build gather/scatter index vectors.
+static int64_t offsets[24] __attribute__((aligned(64))) =
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 };
+
+/*
+ * DGEMM micro-kernel: C (8x24) := beta*C + alpha * A (8 values per k step) * B (24 values per k step).
+ * number of accumulation registers = 24/8 * 8 = 24     zmm8 to zmm31 (row i in zmm(8+3i)..zmm(10+3i))
+ * registers for load B = 24/8 = 3 (zmm0 to zmm2); registers for broadcast A = 2 (zmm6 and zmm7)
+ */
+void bli_dgemm_zen4_asm_8x24(
+                              dim_t            k_,
+                              double* restrict alpha,
+                              double* restrict a,
+                              double* restrict b,
+                              double* restrict beta,
+                              double* restrict c, inc_t rs_c_, inc_t cs_c_,
+                              auxinfo_t*       data,
+                              cntx_t* restrict cntx
+                            )
+{
+    (void)data;
+    (void)cntx;
+
+    const int64_t* offsetPtr = &offsets[0];
+    const int64_t k = k_;
+    const int64_t rs_c = rs_c_*8; //convert strides to bytes
+    const int64_t cs_c = cs_c_*8; //convert strides to bytes
+
+    BEGIN_ASM()
+
+    VXORPD(YMM(8) , YMM(8), YMM(8)) //clear out the 24 accumulators zmm8-zmm31
+    VXORPD(YMM(9) , YMM(9), YMM(9))
+    VXORPD(YMM(10), YMM(10), YMM(10)) //clear out registers
+    VXORPD(YMM(11), YMM(11), YMM(11)) //clear out registers
+    VMOVAPD(YMM(12), YMM(8))
+    VMOVAPD(YMM(13), YMM(8))
+    VMOVAPD(YMM(14), YMM(8))
+    VMOVAPD(YMM(15), YMM(8))
+    VXORPD(YMM(16), YMM(16), YMM(16)) //clear out registers
+    VXORPD(YMM(17), YMM(17), YMM(17)) //clear out registers
+    VMOVAPD(YMM(18), YMM(8))
+    VMOVAPD(YMM(19), YMM(8))
+    VMOVAPD(YMM(20), YMM(8))
+    VMOVAPD(YMM(21), YMM(8))
+    VXORPD(YMM(22), YMM(22), YMM(22)) //clear out registers
+    VXORPD(YMM(23), YMM(23), YMM(23)) //clear out registers
+    VMOVAPD(YMM(24), YMM(8))
+    VMOVAPD(YMM(25), YMM(8))
+    VMOVAPD(YMM(26), YMM(8))
+    VMOVAPD(YMM(27), YMM(8))
+    VXORPD(YMM(28), YMM(28), YMM(28)) //clear out registers
+    VXORPD(YMM(29), YMM(29), YMM(29)) //clear out registers
+    VMOVAPD(YMM(30), YMM(8))
+    VMOVAPD(YMM(31), YMM(8))
+
+    MOV(RSI, VAR(k)) //loop index
+    MOV(RAX, VAR(a)) //load address of a
+    MOV(RBX, VAR(b)) //load address of b
+    MOV(RCX, VAR(c)) //load address of c
+
+    LEA(R9, MEM(RCX,63)) // c for prefetching
+
+    VMOVAPD(ZMM(0), MEM(RBX, 0*8)) //pre-load b
+    VMOVAPD(ZMM(1), MEM(RBX, 8*8)) //pre-load b
+    VMOVAPD(ZMM(2), MEM(RBX,16*8)) //pre-load b
+
+    LEA(RBX, MEM(RBX,24*8)) //adjust b for pre-load
+
+    MOV(R12, VAR(rs_c))
+    MOV(R10, VAR(cs_c))
+
+    MOV(R11, IMM(8))  // prefetch loop count
+                      // r11 = 8  (rows)    for row store
+                      // r11 = 24 (columns) otherwise
+    MOV(R8, R12)      // prefetch loop increment
+                      // r8 = rs_c for row store
+                      // r8 = cs_c otherwise
+    MOV(R13, IMM(64)) // r13 = 64 for row store
+                      // r13 = 0  otherwise
+    CMP(R10, IMM(8))  // jmp if c row stored (cs_c == 8 bytes)
+    JZ(POST_STRIDE)
+        MOV(R8 , R10) // r8 = cs_c  -  prefetch loop increment
+        MOV(R11, IMM(24)) // r11 = 24  -  prefetch loop count
+        MOV(R13, IMM(0)) // r13 = 0
+
+    LABEL(POST_STRIDE)
+
+    MOV(RDI, RSI) // RDI = k
+    AND(RSI, IMM(3)) // RSI = k & 3, RSI = k % 4
+    SAR(RDI, IMM(2)) // RDI = k >> 2, RDI = k / 4
+
+    SUB(RDI, R11) // subtract prefetch loop count
+    SUB(RDI, IMM(0+TAIL_NITER)) // '0+' needed for preprocessor
+    JLE(K_LE_80)
+
+        LOOP_ALIGN
+        LABEL(LOOP1) // main loop, 4 k steps per pass, no prefetch of c
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1)) // sets the flags tested by JNZ below
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+
+        JNZ(LOOP1)
+
+    LABEL(K_LE_80)
+
+    ADD(RDI, R11) // add prefetch loop count
+    JLE(K_LE_26)
+
+        LOOP_ALIGN
+        LABEL(LOOP2) // same as LOOP1 plus prefetch of c
+
+            PREFETCH(0, MEM(R9))
+            SUBITER(0)
+            PREFETCH(0, MEM(R9,R13, 1)) // prefetch R9+64 if row store,
+                                        // prefetch R9+0 otherwise
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            PREFETCH(0, MEM(R9,R13, 2)) // prefetch R9+128 if row store,
+                                        // prefetch R9+0 otherwise
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+            LEA(R9, MEM(R9,R8,1))       // r9 += rs_c if row store,
+                                        // r9 += cs_c otherwise
+
+        JNZ(LOOP2)
+
+    LABEL(K_LE_26)
+
+    ADD(RDI, IMM(0+TAIL_NITER))
+    JLE(TAIL)
+
+        LOOP_ALIGN
+        LABEL(LOOP3) // remaining groups of 4 k steps, no prefetch of c
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+
+        JNZ(LOOP3)
+
+    LABEL(TAIL)
+
+    TEST(RSI, RSI) // RSI = k % 4 leftover steps
+    JZ(POSTACCUM)
+
+        LOOP_ALIGN
+        LABEL(TAIL_LOOP)
+
+            SUB(RSI, IMM(1))
+            SUBITER(0)
+
+            LEA(RAX, MEM(RAX,8*8))
+            LEA(RBX, MEM(RBX,24*8))
+
+        JNZ(TAIL_LOOP)
+
+    LABEL(POSTACCUM)
+
+    MOV(RAX, VAR(alpha))
+    MOV(RBX, VAR(beta))
+    VBROADCASTSD(ZMM(0), MEM(RAX)) // ZMM(0) = alpha
+    VBROADCASTSD(ZMM(1), MEM(RBX)) // zmm(1) = beta
+
+    VXORPD(YMM(2), YMM(2), YMM(2)) // zmm2 = 0, compared against beta below
+
+    MOV(RAX, R12) // rs_c
+    MOV(RBX, R10) // cs_c
+
+    // Check if C is column stride.
+    CMP(RAX, IMM(8))
+    JE(COLUPDATE)
+
+    // Check if C is row stride.
+    CMP(RBX, IMM(8))
+    JE(ROWUPDATE)
+
+    LABEL(SCATTERUPDATE)
+        // if C is general stride
+        VMULPD(ZMM( 8), ZMM( 8), ZMM(0)) //scale by alpha
+        VMULPD(ZMM( 9), ZMM( 9), ZMM(0))
+        VMULPD(ZMM(10), ZMM(10), ZMM(0))
+        VMULPD(ZMM(11), ZMM(11), ZMM(0))
+        VMULPD(ZMM(12), ZMM(12), ZMM(0))
+        VMULPD(ZMM(13), ZMM(13), ZMM(0))
+        VMULPD(ZMM(14), ZMM(14), ZMM(0))
+        VMULPD(ZMM(15), ZMM(15), ZMM(0))
+        VMULPD(ZMM(16), ZMM(16), ZMM(0))
+        VMULPD(ZMM(17), ZMM(17), ZMM(0))
+        VMULPD(ZMM(18), ZMM(18), ZMM(0))
+        VMULPD(ZMM(19), ZMM(19), ZMM(0))
+        VMULPD(ZMM(20), ZMM(20), ZMM(0))
+        VMULPD(ZMM(21), ZMM(21), ZMM(0))
+        VMULPD(ZMM(22), ZMM(22), ZMM(0))
+        VMULPD(ZMM(23), ZMM(23), ZMM(0))
+        VMULPD(ZMM(24), ZMM(24), ZMM(0))
+        VMULPD(ZMM(25), ZMM(25), ZMM(0))
+        VMULPD(ZMM(26), ZMM(26), ZMM(0))
+        VMULPD(ZMM(27), ZMM(27), ZMM(0))
+        VMULPD(ZMM(28), ZMM(28), ZMM(0))
+        VMULPD(ZMM(29), ZMM(29), ZMM(0))
+        VMULPD(ZMM(30), ZMM(30), ZMM(0))
+        VMULPD(ZMM(31), ZMM(31), ZMM(0))
+
+        MOV(RDI, VAR(offsetPtr))
+        VPBROADCASTQ(ZMM(0), R10) // zmm0 = cs_c in every lane (alpha is no longer needed)
+        VPMULLQ(ZMM(2), ZMM(0), MEM(RDI)) // zmm2 = cs_c * {0..7}: byte offsets of columns 0-7
+        VPMULLQ(ZMM(3), ZMM(0), MEM(RDI, 8*8)) // zmm3 = cs_c * {8..15}
+        VPMULLQ(ZMM(4), ZMM(0), MEM(RDI,16*8)) // zmm4 = cs_c * {16..23}
+        VCOMISD(XMM(1), XMM(2)) // beta == 0 ? (low lane of zmm2 is cs_c*0 = 0)
+        JE(GENSTORBZ)
+            UPDATE_C_SCATTERED( 8,  9, 10) // scale by beta and store
+            UPDATE_C_SCATTERED(11, 12, 13)
+            UPDATE_C_SCATTERED(14, 15, 16)
+            UPDATE_C_SCATTERED(17, 18, 19)
+            UPDATE_C_SCATTERED(20, 21, 22)
+            UPDATE_C_SCATTERED(23, 24, 25)
+            UPDATE_C_SCATTERED(26, 27, 28)
+            UPDATE_C_SCATTERED(29, 30, 31)
+            JMP(END)
+        LABEL(GENSTORBZ)
+            UPDATE_C_SCATTERED_BZ( 8,  9, 10)
+            UPDATE_C_SCATTERED_BZ(11, 12, 13)
+            UPDATE_C_SCATTERED_BZ(14, 15, 16)
+            UPDATE_C_SCATTERED_BZ(17, 18, 19)
+            UPDATE_C_SCATTERED_BZ(20, 21, 22)
+            UPDATE_C_SCATTERED_BZ(23, 24, 25)
+            UPDATE_C_SCATTERED_BZ(26, 27, 28)
+            UPDATE_C_SCATTERED_BZ(29, 30, 31)
+            JMP(END)
+
+    LABEL(ROWUPDATE)
+        // if C is row stride
+        // R12 = rs_c
+        LEA(R11, MEM(R12, R12, 2)) // R11 = rs_c * 3, R11 = rs_c + rs_c * 2
+        LEA(R13, MEM(R12, R11, 2)) // R13 = rs_c * 7, R13 = rs_c + R11 * 2
+        LEA(R12, MEM(R12, R12, 4)) // R12 = rs_c * 5, R12 = rs_c + rs_c * 4
+        VCOMISD(XMM(1), XMM(2)) // XMM(1) = beta, XMM(2) = 0
+        JE(ROWSTORBZ)
+            // beta != 0
+
+            // row0
+            VMULPD(ZMM( 8), ZMM( 8), ZMM(0)) // scale by alpha
+            VMULPD(ZMM( 9), ZMM( 9), ZMM(0))
+            VMULPD(ZMM(10), ZMM(10), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM( 8), ZMM(1), MEM(RCX)) //zmm8 = zmm1*C + zmm8, zmm8 = beta*C + zmm8
+            VFMADD231PD(ZMM( 9), ZMM(1), MEM(RCX,64)) //zmm9 = beta*C + zmm9
+            VFMADD231PD(ZMM(10), ZMM(1), MEM(RCX,128)) //zmm10 = beta*C + zmm10
+            /*store c*/
+            VMOVUPD(MEM(RCX    ), ZMM( 8))
+            VMOVUPD(MEM(RCX, 64), ZMM( 9))
+            VMOVUPD(MEM(RCX,128), ZMM(10))
+
+            // row1
+            VMULPD(ZMM(11), ZMM(11), ZMM(0))
+            VMULPD(ZMM(12), ZMM(12), ZMM(0))
+            VMULPD(ZMM(13), ZMM(13), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(11), ZMM(1), MEM(RCX, RAX, 1     ))
+            VFMADD231PD(ZMM(12), ZMM(1), MEM(RCX, RAX, 1, 64 ))
+            VFMADD231PD(ZMM(13), ZMM(1), MEM(RCX, RAX, 1, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, RAX, 1     ), ZMM(11))
+            VMOVUPD(MEM(RCX, RAX, 1, 64 ), ZMM(12))
+            VMOVUPD(MEM(RCX, RAX, 1, 128), ZMM(13))
+
+            // row2
+            VMULPD(ZMM(14), ZMM(14), ZMM(0))
+            VMULPD(ZMM(15), ZMM(15), ZMM(0))
+            VMULPD(ZMM(16), ZMM(16), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(14), ZMM(1), MEM(RCX, RAX, 2     ))
+            VFMADD231PD(ZMM(15), ZMM(1), MEM(RCX, RAX, 2, 64 ))
+            VFMADD231PD(ZMM(16), ZMM(1), MEM(RCX, RAX, 2, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, RAX, 2     ), ZMM(14))
+            VMOVUPD(MEM(RCX, RAX, 2, 64 ), ZMM(15))
+            VMOVUPD(MEM(RCX, RAX, 2, 128), ZMM(16))
+
+            // row3
+            VMULPD(ZMM(17), ZMM(17), ZMM(0))
+            VMULPD(ZMM(18), ZMM(18), ZMM(0))
+            VMULPD(ZMM(19), ZMM(19), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(17), ZMM(1), MEM(RCX, R11, 1     ))
+            VFMADD231PD(ZMM(18), ZMM(1), MEM(RCX, R11, 1, 64 ))
+            VFMADD231PD(ZMM(19), ZMM(1), MEM(RCX, R11, 1, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R11, 1     ), ZMM(17))
+            VMOVUPD(MEM(RCX, R11, 1, 64 ), ZMM(18))
+            VMOVUPD(MEM(RCX, R11, 1, 128), ZMM(19))
+
+            // row4
+            VMULPD(ZMM(20), ZMM(20), ZMM(0))
+            VMULPD(ZMM(21), ZMM(21), ZMM(0))
+            VMULPD(ZMM(22), ZMM(22), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(20), ZMM(1), MEM(RCX, RAX, 4     ))
+            VFMADD231PD(ZMM(21), ZMM(1), MEM(RCX, RAX, 4, 64 ))
+            VFMADD231PD(ZMM(22), ZMM(1), MEM(RCX, RAX, 4, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, RAX, 4     ), ZMM(20))
+            VMOVUPD(MEM(RCX, RAX, 4, 64 ), ZMM(21))
+            VMOVUPD(MEM(RCX, RAX, 4, 128), ZMM(22))
+
+            // row5
+            VMULPD(ZMM(23), ZMM(23), ZMM(0))
+            VMULPD(ZMM(24), ZMM(24), ZMM(0))
+            VMULPD(ZMM(25), ZMM(25), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(23), ZMM(1), MEM(RCX, R12, 1     ))
+            VFMADD231PD(ZMM(24), ZMM(1), MEM(RCX, R12, 1, 64 ))
+            VFMADD231PD(ZMM(25), ZMM(1), MEM(RCX, R12, 1, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R12, 1     ), ZMM(23))
+            VMOVUPD(MEM(RCX, R12, 1, 64 ), ZMM(24))
+            VMOVUPD(MEM(RCX, R12, 1, 128), ZMM(25))
+
+            // row6
+            VMULPD(ZMM(26), ZMM(26), ZMM(0))
+            VMULPD(ZMM(27), ZMM(27), ZMM(0))
+            VMULPD(ZMM(28), ZMM(28), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(26), ZMM(1), MEM(RCX, R11, 2     ))
+            VFMADD231PD(ZMM(27), ZMM(1), MEM(RCX, R11, 2, 64 ))
+            VFMADD231PD(ZMM(28), ZMM(1), MEM(RCX, R11, 2, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R11, 2     ), ZMM(26))
+            VMOVUPD(MEM(RCX, R11, 2, 64 ), ZMM(27))
+            VMOVUPD(MEM(RCX, R11, 2, 128), ZMM(28))
+
+            // row7
+            VMULPD(ZMM(29), ZMM(29), ZMM(0))
+            VMULPD(ZMM(30), ZMM(30), ZMM(0))
+            VMULPD(ZMM(31), ZMM(31), ZMM(0))
+            /*scale by beta*/
+            VFMADD231PD(ZMM(29), ZMM(1), MEM(RCX, R13, 1     ))
+            VFMADD231PD(ZMM(30), ZMM(1), MEM(RCX, R13, 1, 64 ))
+            VFMADD231PD(ZMM(31), ZMM(1), MEM(RCX, R13, 1, 128))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R13, 1     ), ZMM(29))
+            VMOVUPD(MEM(RCX, R13, 1, 64 ), ZMM(30))
+            VMOVUPD(MEM(RCX, R13, 1, 128), ZMM(31))
+
+            JMP(END)
+        LABEL(ROWSTORBZ)
+            // beta == 0
+
+            // row0
+            VMULPD(ZMM( 8), ZMM( 8), ZMM(0))
+            VMULPD(ZMM( 9), ZMM( 9), ZMM(0))
+            VMULPD(ZMM(10), ZMM(10), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX    ), ZMM( 8))
+            VMOVUPD(MEM(RCX, 64), ZMM( 9))
+            VMOVUPD(MEM(RCX,128), ZMM(10))
+
+            // row1
+            VMULPD(ZMM(11), ZMM(11), ZMM(0))
+            VMULPD(ZMM(12), ZMM(12), ZMM(0))
+            VMULPD(ZMM(13), ZMM(13), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, RAX, 1     ), ZMM(11))
+            VMOVUPD(MEM(RCX, RAX, 1, 64 ), ZMM(12))
+            VMOVUPD(MEM(RCX, RAX, 1, 128), ZMM(13))
+
+            // row2
+            VMULPD(ZMM(14), ZMM(14), ZMM(0))
+            VMULPD(ZMM(15), ZMM(15), ZMM(0))
+            VMULPD(ZMM(16), ZMM(16), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, RAX, 2     ), ZMM(14))
+            VMOVUPD(MEM(RCX, RAX, 2, 64 ), ZMM(15))
+            VMOVUPD(MEM(RCX, RAX, 2, 128), ZMM(16))
+
+            // row3
+            VMULPD(ZMM(17), ZMM(17), ZMM(0))
+            VMULPD(ZMM(18), ZMM(18), ZMM(0))
+            VMULPD(ZMM(19), ZMM(19), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R11, 1     ), ZMM(17))
+            VMOVUPD(MEM(RCX, R11, 1, 64 ), ZMM(18))
+            VMOVUPD(MEM(RCX, R11, 1, 128), ZMM(19))
+
+            // row4
+            VMULPD(ZMM(20), ZMM(20), ZMM(0))
+            VMULPD(ZMM(21), ZMM(21), ZMM(0))
+            VMULPD(ZMM(22), ZMM(22), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, RAX, 4     ), ZMM(20))
+            VMOVUPD(MEM(RCX, RAX, 4, 64 ), ZMM(21))
+            VMOVUPD(MEM(RCX, RAX, 4, 128), ZMM(22))
+
+            // row5
+            VMULPD(ZMM(23), ZMM(23), ZMM(0))
+            VMULPD(ZMM(24), ZMM(24), ZMM(0))
+            VMULPD(ZMM(25), ZMM(25), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R12, 1     ), ZMM(23))
+            VMOVUPD(MEM(RCX, R12, 1, 64 ), ZMM(24))
+            VMOVUPD(MEM(RCX, R12, 1, 128), ZMM(25))
+
+            // row6
+            VMULPD(ZMM(26), ZMM(26), ZMM(0))
+            VMULPD(ZMM(27), ZMM(27), ZMM(0))
+            VMULPD(ZMM(28), ZMM(28), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R11, 2     ), ZMM(26))
+            VMOVUPD(MEM(RCX, R11, 2, 64 ), ZMM(27))
+            VMOVUPD(MEM(RCX, R11, 2, 128), ZMM(28))
+
+            // row7
+            VMULPD(ZMM(29), ZMM(29), ZMM(0))
+            VMULPD(ZMM(30), ZMM(30), ZMM(0))
+            VMULPD(ZMM(31), ZMM(31), ZMM(0))
+            /*store c*/
+            VMOVUPD(MEM(RCX, R13, 1     ), ZMM(29))
+            VMOVUPD(MEM(RCX, R13, 1, 64 ), ZMM(30))
+            VMOVUPD(MEM(RCX, R13, 1, 128), ZMM(31))
+            JMP(END)
+
+    LABEL(COLUPDATE)
+        // if C is col stride
+        // R10 = cs_c
+        LEA(R11, MEM(R10, R10, 2)) // R11 = cs_c * 3
+        LEA(R12, MEM(R10, R10, 4)) // R12 = cs_c * 5
+        LEA(R13, MEM(R10, R11, 2)) // R13 = cs_c * 7
+
+        // XMM(1) = beta, XMM(2) = 0
+        VCOMISD(XMM(1), XMM(2))
+        JE(COLSTORBZ)
+            // beta != 0
+
+            TRANSPOSE_8X8( 8, 11, 14, 17, 20, 23, 26, 29)
+            TRANSPOSE_8X8( 9, 12, 15, 18, 21, 24, 27, 30)
+            TRANSPOSE_8X8(10, 13, 16, 19, 22, 25, 28, 31)
+            MOV(RAX, VAR(alpha)) // reload alpha and beta: the transpose used zmm0-zmm7 as scratch
+            MOV(RBX, VAR(beta))
+            VBROADCASTSD(ZMM(0), MEM(RAX))
+            VBROADCASTSD(ZMM(1), MEM(RBX))
+
+            UPDATE_C_COL_STORE( 8, 11, 14, 17, 20, 23, 26, 29)
+            UPDATE_C_COL_STORE( 9, 12, 15, 18, 21, 24, 27, 30)
+            UPDATE_C_COL_STORE(10, 13, 16, 19, 22, 25, 28, 31)
+            JMP(END)
+
+        LABEL(COLSTORBZ)
+            // beta == 0
+
+            TRANSPOSE_8X8( 8, 11, 14, 17, 20, 23, 26, 29)
+            TRANSPOSE_8X8( 9, 12, 15, 18, 21, 24, 27, 30)
+            TRANSPOSE_8X8(10, 13, 16, 19, 22, 25, 28, 31)
+            MOV(RAX, VAR(alpha)) // reload alpha: the transpose used zmm0-zmm7 as scratch
+            VBROADCASTSD(ZMM(0), MEM(RAX))
+
+            UPDATE_C_COL_STORE_BZ( 8, 11, 14, 17, 20, 23, 26, 29)
+            UPDATE_C_COL_STORE_BZ( 9, 12, 15, 18, 21, 24, 27, 30)
+            UPDATE_C_COL_STORE_BZ(10, 13, 16, 19, 22, 25, 28, 31)
+
+    LABEL(END)
+
+    VZEROUPPER()
+
+    END_ASM
+    (
+        : // output operands
+        : // input operands
+          [k]         "m" (k),
+          [a]         "m" (a),
+          [b]         "m" (b),
+          [alpha]     "m" (alpha),
+          [beta]      "m" (beta),
+          [c]         "m" (c),
+          [rs_c]      "m" (rs_c),
+          [cs_c]      "m" (cs_c),
+          [offsetPtr] "m" (offsetPtr)
+        : // register clobber list (k1-k3 are written by the scatter macros)
+          "rax", "rbx", "rcx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
+          "r13", "k1", "k2", "k3", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
+          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
+          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31", "memory"
+    )
+}
diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c b/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c
new file mode 100644
index 0000000000..139edc7ddb
--- /dev/null
+++ b/kernels/zen4/3/bli_gemmtrsm_l_zen4_8x24.c
@@ -0,0 +1,821 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "bli_x86_asm_macros.h"
+// BLIS_ASM_SYNTAX_INTEL syntax is followed
+
+#define TAIL_NITER 5
+
+#define LOOP_ALIGN ALIGN32
+
+// Update C when C is general stored: scatter R1,R2,R3 to RCX + byte offsets in ZMM2,ZMM3,ZMM4, then RCX += R12
+#define UPDATE_C_SCATTERED(R1,R2,R3) \
+    /* writes k1-k3; a scatter zeroes its mask as it completes, so the masks are rebuilt on every use */ \
+    KXNORW(K(1), K(0), K(0)) /* k0 XNOR k0 = all ones: enable every lane */ \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R2)) \
+    VSCATTERQPD(MEM(RCX,ZMM(4),1) MASK_K(3), ZMM(R3))  \
+    LEA(RCX, MEM(RCX,R12,1))
+
+// 8x8 in register transpose, used for column stored C; R0..R7 are overwritten in place, ZMM0-ZMM7 are scratch
+#define TRANSPOSE_8X8(R0, R1, R2, R3, R4, R5, R6, R7) \
+    /*Stage1: interleave the low elements of each register pair*/ \
+    VUNPCKLPD(ZMM(6), ZMM(R0), ZMM(R1)) \
+    VUNPCKLPD(ZMM(7), ZMM(R2), ZMM(R3)) \
+    VUNPCKLPD(ZMM(2), ZMM(R4), ZMM(R5)) \
+    VUNPCKLPD(ZMM(3), ZMM(R6), ZMM(R7)) \
+    VMOVUPD(ZMM(0), ZMM(R0)) /* keep R0: it is overwritten before the high unpack below */ \
+    VMOVUPD(ZMM(1), ZMM(R4)) /* keep R4 for the same reason */ \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0x88)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0x88)) \
+    /*Stage3: outputs 1 and 5 (written to R0, R4)*/ \
+    VSHUFF64X2(ZMM(R0), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R4), ZMM(4), ZMM(5), IMM(0xDD)) \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0xDD)) \
+    /*Stage1 for the high elements (uses the saved R0/R4), then Stage3: outputs 3 and 7 (R2, R6)*/ \
+    VUNPCKHPD(ZMM(6), ZMM(0 ), ZMM(R1)) \
+    VUNPCKHPD(ZMM(7), ZMM(R2), ZMM(R3)) \
+    VUNPCKHPD(ZMM(2), ZMM(1 ), ZMM(R5)) \
+    VUNPCKHPD(ZMM(3), ZMM(R6), ZMM(R7)) \
+    VSHUFF64X2(ZMM(R2), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R6), ZMM(4), ZMM(5), IMM(0xDD)) \
+    \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0x88)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0x88)) \
+    /*Stage3: outputs 2 and 6 (written to R1, R5)*/ \
+    VSHUFF64X2(ZMM(R1), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R5), ZMM(4), ZMM(5), IMM(0xDD)) \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0xDD)) \
+    /*Stage3: outputs 4 and 8 (written to R3, R7)*/ \
+    VSHUFF64X2(ZMM(R3), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R7), ZMM(4), ZMM(5), IMM(0xDD)) \
+
+// Update C when C is column stored: store R0..R7 to eight consecutive columns at RCX, then RCX += 8*R10
+#define UPDATE_C_COL_STORE(R0, R1, R2, R3, R4, R5, R6, R7) \
+    /* R10 = column stride in bytes; the kernel sets R11 = 3*R10, R12 = 5*R10, R13 = 7*R10 beforehand */ \
+    VMOVUPD(MEM(RCX), ZMM(R0)) \
+    VMOVUPD(MEM(RCX, R10, 1), ZMM(R1)) \
+    VMOVUPD(MEM(RCX, R10, 2), ZMM(R2)) \
+    VMOVUPD(MEM(RCX, R11, 1), ZMM(R3)) \
+    VMOVUPD(MEM(RCX, R10, 4), ZMM(R4)) \
+    VMOVUPD(MEM(RCX, R12, 1), ZMM(R5)) \
+    VMOVUPD(MEM(RCX, R11, 2), ZMM(R6)) \
+    VMOVUPD(MEM(RCX, R13, 1), ZMM(R7)) \
+    LEA(RCX, MEM(RCX,R10,8))
+
+#define SUBITER(n) \
+    /* one k-step: zmm(8+3i .. 10+3i) += A[8*n+i] * zmm0..zmm2 for i = 0..7; A read from RAX, broadcasts alternate zmm6/zmm7 */ \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 0)*8)) \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 1)*8)) \
+    VFMADD231PD(ZMM( 8), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM( 9), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(10), ZMM(2), ZMM(6)) \
+    /* each broadcast is issued one row ahead of the FMAs that consume it */ \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 2)*8)) \
+    VFMADD231PD(ZMM(11), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(12), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(13), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 3)*8)) \
+    VFMADD231PD(ZMM(14), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM(15), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(16), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 4)*8)) \
+    VFMADD231PD(ZMM(17), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(18), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(19), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 5)*8)) \
+    VFMADD231PD(ZMM(20), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM(21), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(22), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 6)*8)) \
+    VFMADD231PD(ZMM(23), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(24), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(25), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 7)*8)) \
+    VFMADD231PD(ZMM(26), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM(27), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(28), ZMM(2), ZMM(6)) \
+    \
+    VFMADD231PD(ZMM(29), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(30), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(31), ZMM(2), ZMM(7)) \
+    /* load the 24 B values for the next k-step (aligned loads; RBX already points one step ahead) */ \
+    VMOVAPD(ZMM(0), MEM(RBX,(24*n+0)*8)) \
+    VMOVAPD(ZMM(1), MEM(RBX,(24*n+8)*8)) \
+    VMOVAPD(ZMM(2), MEM(RBX,(24*n+16)*8)) \
+
+//Column indices 0..23 (64-byte aligned); the kernel multiplies them by cs_c in bytes to form the scatter offsets.
+static int64_t offsets[24] __attribute__((aligned(64))) =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
+
+/*
+ * Fused gemm+trsm (lower) 8x24 kernel: b11 = solve(a11, alpha*b11 - a10*b01), also written to c11; data/cntx unused.
+ * number of accumulation registers = 24/8 * 8 = 24 (zmm8 to zmm31); registers used to load B = 24/8 = 3 (zmm0 to zmm2)
+ * number of registers used for broadcast A = 2 (zmm6 and zmm7); rs_c_/cs_c_ are in elements, converted to bytes below
+ */
+void bli_dgemmtrsm_l_zen4_asm_8x24
+     (
+       dim_t               k_,
+       double*    restrict alpha,
+       double*    restrict a10,
+       double*    restrict a11,
+       double*    restrict b01,
+       double*    restrict b11,
+       double*    restrict c11, inc_t rs_c_, inc_t cs_c_,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    const int64_t* offsetPtr = &offsets[0];
+    const int64_t k = k_;
+    const int64_t rs_c = rs_c_*8; //convert strides to bytes
+    const int64_t cs_c = cs_c_*8; //convert strides to bytes
+
+    BEGIN_ASM()
+    // clear zmm8..zmm31: a write to a ymm register also zeroes the upper half of its zmm
+    VXORPD(YMM(8) , YMM(8), YMM(8))
+    VXORPD(YMM(9) , YMM(9), YMM(9))
+    VXORPD(YMM(10), YMM(10), YMM(10)) //clear out registers
+    VXORPD(YMM(11), YMM(11), YMM(11)) //clear out registers
+    VMOVAPD(YMM(12), YMM(8))
+    VMOVAPD(YMM(13), YMM(8))
+    VMOVAPD(YMM(14), YMM(8))
+    VMOVAPD(YMM(15), YMM(8))
+    VXORPD(YMM(16), YMM(16), YMM(16)) //clear out registers
+    VXORPD(YMM(17), YMM(17), YMM(17)) //clear out registers
+    VMOVAPD(YMM(18), YMM(8))
+    VMOVAPD(YMM(19), YMM(8))
+    VMOVAPD(YMM(20), YMM(8))
+    VMOVAPD(YMM(21), YMM(8))
+    VXORPD(YMM(22), YMM(22), YMM(22)) //clear out registers
+    VXORPD(YMM(23), YMM(23), YMM(23)) //clear out registers
+    VMOVAPD(YMM(24), YMM(8))
+    VMOVAPD(YMM(25), YMM(8))
+    VMOVAPD(YMM(26), YMM(8))
+    VMOVAPD(YMM(27), YMM(8))
+    VXORPD(YMM(28), YMM(28), YMM(28)) //clear out registers
+    VXORPD(YMM(29), YMM(29), YMM(29)) //clear out registers
+    VMOVAPD(YMM(30), YMM(8))
+    VMOVAPD(YMM(31), YMM(8))
+
+    MOV(RSI, VAR(k)) //loop index
+    MOV(RAX, VAR(a10)) //load address of a10
+    MOV(RBX, VAR(b01)) //load address of b01
+    MOV(RCX, VAR(b11)) //load address of b11
+    MOV(R15, VAR(c11)) //load address of c11 (base for prefetching)
+
+    LEA(R9, MEM(R15,63)) // c for prefetching
+
+    VMOVAPD(ZMM(0), MEM(RBX, 0*8)) //pre-load b
+    VMOVAPD(ZMM(1), MEM(RBX, 8*8)) //pre-load b
+    VMOVAPD(ZMM(2), MEM(RBX,16*8)) //pre-load b
+
+    LEA(RBX, MEM(RBX,24*8)) //adjust b for pre-load
+
+    MOV(R12, VAR(rs_c))
+    MOV(R10, VAR(cs_c))
+
+    MOV(R11, IMM(8))  // prefetch loop count
+                      // r11 = 8  (MR) when cs_c == 8 bytes (row store)
+                      // r11 = 24 (NR) otherwise
+    MOV(R8, R12)      // prefetch loop increment
+                      // r8 = rs_c when cs_c == 8 bytes (row store)
+                      // r8 = cs_c otherwise
+    MOV(R13, IMM(64)) // r13 = 64 when cs_c == 8 bytes (row store)
+                      // r13 = 0  otherwise
+    CMP(R10, IMM(8))  // keep the defaults above if c is row stored
+    JZ(POST_STRIDE)
+        MOV(R8 , R10) // r8 = cs_c  -  prefetch loop increment
+        MOV(R11, IMM(24)) // r11 = 24  -  prefetch loop count
+        MOV(R13, IMM(0)) // r13 = 0
+
+    LABEL(POST_STRIDE)
+
+    MOV(RDI, RSI) // RDI = k
+    AND(RSI, IMM(3)) // RSI = k & 3, RSI = k % 4
+    SAR(RDI, IMM(2)) // RDI = k >> 2, RDI = k / 4
+
+    SUB(RDI, R11) // subtract prefetch loop count
+    SUB(RDI, IMM(0+TAIL_NITER)) // '0+' needed for preprocessor
+    JLE(K_LE_80)
+        // main loop: 4 k-steps per pass, no prefetching of c
+        LOOP_ALIGN
+        LABEL(LOOP1)
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+
+        JNZ(LOOP1)
+
+    LABEL(K_LE_80)
+
+    ADD(RDI, R11) // add prefetch loop count
+    JLE(K_LE_26)
+        // same as LOOP1 plus prefetching of c; runs at most R11 passes
+        LOOP_ALIGN
+        LABEL(LOOP2)
+
+            PREFETCH(0, MEM(R9))
+            SUBITER(0)
+            PREFETCH(0, MEM(R9,R13, 1)) // prefetch R9+64 if row store,
+                                        // prefetch R9+0 otherwise
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            PREFETCH(0, MEM(R9,R13, 2)) // prefetch R9+128 if row store,
+                                        // prefetch R9+0 otherwise
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+            LEA(R9, MEM(R9,R8,1))       // r9 += rs_c if row store,
+                                        // r9 += cs_c otherwise
+
+        JNZ(LOOP2)
+
+    LABEL(K_LE_26)
+
+    ADD(RDI, IMM(0+TAIL_NITER))
+    JLE(TAIL)
+        // final unrolled passes (at most TAIL_NITER), no prefetching
+        LOOP_ALIGN
+        LABEL(LOOP3)
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+
+        JNZ(LOOP3)
+
+    LABEL(TAIL)
+
+    TEST(RSI, RSI)
+    JZ(POSTACCUM)
+        // remaining k % 4 steps, one at a time
+        LOOP_ALIGN
+        LABEL(TAIL_LOOP)
+
+            SUB(RSI, IMM(1))
+            SUBITER(0)
+
+            LEA(RAX, MEM(RAX,8*8))
+            LEA(RBX, MEM(RBX,24*8))
+
+        JNZ(TAIL_LOOP)
+
+    LABEL(POSTACCUM)
+    // zmm3 = alpha; rcx, rax, rdx = b11 + 0, + 64, + 128 bytes (the three 8-wide thirds of a row)
+    MOV(RBX, VAR(alpha))
+    VBROADCASTSD(ZMM(3), MEM(RBX))
+    MOV(RSI, IMM(1*8))
+
+    LEA(RAX, MEM(RCX, RSI, 8))
+    LEA(RDX, MEM(RAX, RSI, 8))
+
+    MOV(R13, RCX) // r13, r14, r15 keep the b11 start pointers
+    MOV(R14, RAX) // for the stores of the solved rows below
+    MOV(R15, RDX)
+
+// #region - TRSM
+    MOV(RDI, IMM(24 * 8))
+    // acc = alpha * b11 - acc, row by row (RDI = 24 doubles = byte stride between b11 rows)
+    VFMSUB231PD(ZMM(8 ), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(9 ), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(10), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(11), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(12), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(13), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(14), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(15), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(16), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(17), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(18), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(19), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(20), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(21), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(22), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(23), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(24), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(25), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(26), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(27), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(28), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(29), ZMM(3), MEM(RCX))
+    VFMSUB231PD(ZMM(30), ZMM(3), MEM(RAX))
+    VFMSUB231PD(ZMM(31), ZMM(3), MEM(RDX))
+    // forward substitution over the 8 rows; element (i,j) of a11 is read from a11[i + j*8]; the diagonal
+    // is multiplied when BLIS_ENABLE_TRSM_PREINVERSION is defined, divided otherwise; each row goes to b11
+    MOV(RAX, VAR(a11))
+    //iteration 0 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (0+0*8)*8))
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(8 ), ZMM(8 ), ZMM(0))
+        VMULPD(ZMM(9 ), ZMM(9 ), ZMM(0))
+        VMULPD(ZMM(10), ZMM(10), ZMM(0))
+    #else
+        VDIVPD(ZMM(8 ), ZMM(8 ), ZMM(0))
+        VDIVPD(ZMM(9 ), ZMM(9 ), ZMM(0))
+        VDIVPD(ZMM(10), ZMM(10), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(8 ))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(9 ))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(10))
+    ADD(R15, RDI)
+
+    //iteration 1 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (1+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (1+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VSUBPD(ZMM(11), ZMM(11), ZMM(2))
+    VSUBPD(ZMM(12), ZMM(12), ZMM(3))
+    VSUBPD(ZMM(13), ZMM(13), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(11), ZMM(11), ZMM(1))
+        VMULPD(ZMM(12), ZMM(12), ZMM(1))
+        VMULPD(ZMM(13), ZMM(13), ZMM(1))
+    #else
+        VDIVPD(ZMM(11), ZMM(11), ZMM(1))
+        VDIVPD(ZMM(12), ZMM(12), ZMM(1))
+        VDIVPD(ZMM(13), ZMM(13), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(11))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(12))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(13))
+    ADD(R15, RDI)
+
+    //iteration 2 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (2+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (2+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (2+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(1))
+
+    VSUBPD(ZMM(14), ZMM(14), ZMM(2))
+    VSUBPD(ZMM(15), ZMM(15), ZMM(3))
+    VSUBPD(ZMM(16), ZMM(16), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(14), ZMM(14), ZMM(0))
+        VMULPD(ZMM(15), ZMM(15), ZMM(0))
+        VMULPD(ZMM(16), ZMM(16), ZMM(0))
+    #else
+        VDIVPD(ZMM(14), ZMM(14), ZMM(0))
+        VDIVPD(ZMM(15), ZMM(15), ZMM(0))
+        VDIVPD(ZMM(16), ZMM(16), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(14))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(15))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(16))
+    ADD(R15, RDI)
+
+    //iteration 3 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (3+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (3+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (3+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (3+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(0))
+
+    VSUBPD(ZMM(17), ZMM(17), ZMM(2))
+    VSUBPD(ZMM(18), ZMM(18), ZMM(3))
+    VSUBPD(ZMM(19), ZMM(19), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(17), ZMM(17), ZMM(1))
+        VMULPD(ZMM(18), ZMM(18), ZMM(1))
+        VMULPD(ZMM(19), ZMM(19), ZMM(1))
+    #else
+        VDIVPD(ZMM(17), ZMM(17), ZMM(1))
+        VDIVPD(ZMM(18), ZMM(18), ZMM(1))
+        VDIVPD(ZMM(19), ZMM(19), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(17))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(18))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(19))
+    ADD(R15, RDI)
+
+    //iteration 4 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (4+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (4+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (4+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (4+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (4+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(1))
+
+    VSUBPD(ZMM(20), ZMM(20), ZMM(2))
+    VSUBPD(ZMM(21), ZMM(21), ZMM(3))
+    VSUBPD(ZMM(22), ZMM(22), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(20), ZMM(20), ZMM(0))
+        VMULPD(ZMM(21), ZMM(21), ZMM(0))
+        VMULPD(ZMM(22), ZMM(22), ZMM(0))
+    #else
+        VDIVPD(ZMM(20), ZMM(20), ZMM(0))
+        VDIVPD(ZMM(21), ZMM(21), ZMM(0))
+        VDIVPD(ZMM(22), ZMM(22), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(20))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(21))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(22))
+    ADD(R15, RDI)
+
+    //iteration 5 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (5+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (5+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (5+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (5+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (5+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (5+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(0))
+
+    VSUBPD(ZMM(23), ZMM(23), ZMM(2))
+    VSUBPD(ZMM(24), ZMM(24), ZMM(3))
+    VSUBPD(ZMM(25), ZMM(25), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(23), ZMM(23), ZMM(1))
+        VMULPD(ZMM(24), ZMM(24), ZMM(1))
+        VMULPD(ZMM(25), ZMM(25), ZMM(1))
+    #else
+        VDIVPD(ZMM(23), ZMM(23), ZMM(1))
+        VDIVPD(ZMM(24), ZMM(24), ZMM(1))
+        VDIVPD(ZMM(25), ZMM(25), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(23))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(24))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(25))
+    ADD(R15, RDI)
+
+    //iteration 6 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (6+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (6+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (6+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (6+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (6+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (6+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (6+6*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(1))
+
+    VSUBPD(ZMM(26), ZMM(26), ZMM(2))
+    VSUBPD(ZMM(27), ZMM(27), ZMM(3))
+    VSUBPD(ZMM(28), ZMM(28), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(26), ZMM(26), ZMM(0))
+        VMULPD(ZMM(27), ZMM(27), ZMM(0))
+        VMULPD(ZMM(28), ZMM(28), ZMM(0))
+    #else
+        VDIVPD(ZMM(26), ZMM(26), ZMM(0))
+        VDIVPD(ZMM(27), ZMM(27), ZMM(0))
+        VDIVPD(ZMM(28), ZMM(28), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(26))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(27))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(28))
+    ADD(R15, RDI)
+
+    //iteration 7 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (7+0*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (7+1*8)*8))
+
+    VMULPD(ZMM(2), ZMM(8 ), ZMM(0))
+    VMULPD(ZMM(3), ZMM(9 ), ZMM(0))
+    VMULPD(ZMM(4), ZMM(10), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (7+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (7+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (7+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (7+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (7+6*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (7+7*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(0))
+
+    VSUBPD(ZMM(29), ZMM(29), ZMM(2))
+    VSUBPD(ZMM(30), ZMM(30), ZMM(3))
+    VSUBPD(ZMM(31), ZMM(31), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(29), ZMM(29), ZMM(1))
+        VMULPD(ZMM(30), ZMM(30), ZMM(1))
+        VMULPD(ZMM(31), ZMM(31), ZMM(1))
+    #else
+        VDIVPD(ZMM(29), ZMM(29), ZMM(1))
+        VDIVPD(ZMM(30), ZMM(30), ZMM(1))
+        VDIVPD(ZMM(31), ZMM(31), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(29))
+    ADD(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(30))
+    ADD(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(31))
+    ADD(R15, RDI)
+
+// #endregion - trsm
+    // copy the solved tile to c11; the store path is chosen from the byte strides (8 = unit stride)
+    MOV(RAX, R12)
+    MOV(RBX, R10)
+    MOV(RCX, VAR(c11))
+
+    CMP(RAX, IMM(8)) // rs_c == 8 bytes: c is column stored
+    JE(COLUPDATE)
+
+    CMP(RBX, IMM(8)) // cs_c == 8 bytes: c is row stored
+    JE(ROWUPDATE)
+
+    LABEL(SCATTERUPDATE)
+        MOV(RDI, VAR(offsetPtr))
+        VPBROADCASTQ(ZMM(0), R10) // zmm0 = cs_c (bytes) in every lane
+        VPMULLQ(ZMM(2), ZMM(0), MEM(RDI)) // byte offsets of columns 0-7
+        VPMULLQ(ZMM(3), ZMM(0), MEM(RDI, 8*8)) // byte offsets of columns 8-15
+        VPMULLQ(ZMM(4), ZMM(0), MEM(RDI,16*8)) // byte offsets of columns 16-23
+        UPDATE_C_SCATTERED( 8,  9, 10)
+        UPDATE_C_SCATTERED(11, 12, 13)
+        UPDATE_C_SCATTERED(14, 15, 16)
+        UPDATE_C_SCATTERED(17, 18, 19)
+        UPDATE_C_SCATTERED(20, 21, 22)
+        UPDATE_C_SCATTERED(23, 24, 25)
+        UPDATE_C_SCATTERED(26, 27, 28)
+        UPDATE_C_SCATTERED(29, 30, 31)
+        JMP(END)
+    LABEL(ROWUPDATE)
+        LEA(R11, MEM(RAX, RAX, 2)) //R11 = rs_c * 3, R11 = rs_c + rs_c * 2
+        LEA(R12, MEM(RAX, RAX, 4)) //R12 = rs_c * 5, R12 = rs_c + rs_c * 4
+        LEA(R13, MEM(RAX, R11, 2)) //R13 = rs_c * 7, R13 = rs_c + R11 * 2
+
+        // ROW0
+        VMOVUPD(MEM(RCX     ), ZMM( 8))
+        VMOVUPD(MEM(RCX, 64 ), ZMM( 9))
+        VMOVUPD(MEM(RCX, 128), ZMM(10))
+
+        // ROW1
+        VMOVUPD(MEM(RCX, RAX, 1,    ), ZMM(11))
+        VMOVUPD(MEM(RCX, RAX, 1, 64 ), ZMM(12))
+        VMOVUPD(MEM(RCX, RAX, 1, 128), ZMM(13))
+
+        // ROW2
+        VMOVUPD(MEM(RCX, RAX, 2,    ), ZMM(14))
+        VMOVUPD(MEM(RCX, RAX, 2, 64 ), ZMM(15))
+        VMOVUPD(MEM(RCX, RAX, 2, 128), ZMM(16))
+
+        // ROW3
+        VMOVUPD(MEM(RCX, R11, 1,    ), ZMM(17))
+        VMOVUPD(MEM(RCX, R11, 1, 64 ), ZMM(18))
+        VMOVUPD(MEM(RCX, R11, 1, 128), ZMM(19))
+
+        // ROW4
+        VMOVUPD(MEM(RCX, RAX, 4,    ), ZMM(20))
+        VMOVUPD(MEM(RCX, RAX, 4, 64 ), ZMM(21))
+        VMOVUPD(MEM(RCX, RAX, 4, 128), ZMM(22))
+
+        // ROW5
+        VMOVUPD(MEM(RCX, R12, 1,    ), ZMM(23))
+        VMOVUPD(MEM(RCX, R12, 1, 64 ), ZMM(24))
+        VMOVUPD(MEM(RCX, R12, 1, 128), ZMM(25))
+
+        // ROW6
+        VMOVUPD(MEM(RCX, R11, 2,    ), ZMM(26))
+        VMOVUPD(MEM(RCX, R11, 2, 64 ), ZMM(27))
+        VMOVUPD(MEM(RCX, R11, 2, 128), ZMM(28))
+
+        // ROW7
+        VMOVUPD(MEM(RCX, R13, 1,    ), ZMM(29))
+        VMOVUPD(MEM(RCX, R13, 1, 64 ), ZMM(30))
+        VMOVUPD(MEM(RCX, R13, 1, 128), ZMM(31))
+        JMP(END)
+
+    LABEL(COLUPDATE)
+        LEA(R11, MEM(R10, R10, 2)) //R11 = cs_c * 3
+        LEA(R12, MEM(R10, R10, 4)) //R12 = cs_c * 5
+        LEA(R13, MEM(R10, R11, 2)) //R13 = cs_c * 7
+        TRANSPOSE_8X8( 8, 11, 14, 17, 20, 23, 26, 29)
+        TRANSPOSE_8X8( 9, 12, 15, 18, 21, 24, 27, 30)
+        TRANSPOSE_8X8(10, 13, 16, 19, 22, 25, 28, 31)
+        UPDATE_C_COL_STORE( 8, 11, 14, 17, 20, 23, 26, 29)
+        UPDATE_C_COL_STORE( 9, 12, 15, 18, 21, 24, 27, 30)
+        UPDATE_C_COL_STORE(10, 13, 16, 19, 22, 25, 28, 31)
+
+    LABEL(END)
+
+    VZEROUPPER()
+
+    END_ASM(
+        : // output operands (none)
+        : // input operands
+        [a10]       "m" (a10),
+        [k]         "m" (k),
+        [b01]       "m" (b01),
+        [a11]       "m" (a11),
+        [b11]       "m" (b11),
+        [c11]       "m" (c11),
+        [rs_c]      "m" (rs_c),
+        [cs_c]      "m" (cs_c),
+        [alpha]     "m" (alpha),
+        [offsetPtr] "m" (offsetPtr)
+        : // register clobber list (k1-k3 are written by UPDATE_C_SCATTERED)
+          "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
+          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
+          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31", "k1", "k2", "k3", "memory"
+    )
+}
diff --git a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c
index 3680c44b05..08edcb574f 100644
--- a/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c
+++ b/kernels/zen4/3/bli_gemmtrsm_l_zen_16x14.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -1659,11 +1659,15 @@ void bli_dgemmtrsm_l_zen_asm_16x14
         : // register clobber list
           "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
           "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "xmm0", "xmm1",
+          "ymm0", "ymm1", "ymm4", "ymm6", "ymm20", "ymm21", "ymm22", "ymm23",
+          "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
           "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
           "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
           "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
-          "zmm30", "zmm31", "memory"
+          "zmm30", "zmm31",
+          "k0", "k1", "k2", "k3", "k4", "memory"
         )
 
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9);
-}
\ No newline at end of file
+}
diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c b/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c
new file mode 100644
index 0000000000..d1ea0109d7
--- /dev/null
+++ b/kernels/zen4/3/bli_gemmtrsm_u_zen4_8x24.c
@@ -0,0 +1,826 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#include "bli_x86_asm_macros.h"
+// BLIS_ASM_SYNTAX_INTEL syntax is followed
+
+#define TAIL_NITER 5 // 4x-unrolled k iterations held back for LOOP3, which runs after the C prefetch loop
+
+#define LOOP_ALIGN ALIGN32 // emitted immediately before each loop label in the kernel below
+
+// Update C when C is general stored: scatter ZMM(R1..R3) via index vectors ZMM2..ZMM4, then RCX += R12
+#define UPDATE_C_SCATTERED(R1,R2,R3) \
+    /* XNOR of k0 with itself gives all-ones masks in k1..k3 (written here, so they are clobbered) */ \
+    KXNORW(K(1), K(0), K(0)) \
+    KXNORW(K(2), K(0), K(0)) \
+    KXNORW(K(3), K(0), K(0)) \
+    VSCATTERQPD(MEM(RCX,ZMM(2),1) MASK_K(1), ZMM(R1)) \
+    VSCATTERQPD(MEM(RCX,ZMM(3),1) MASK_K(2), ZMM(R2)) \
+    VSCATTERQPD(MEM(RCX,ZMM(4),1) MASK_K(3), ZMM(R3))  \
+    LEA(RCX, MEM(RCX,R12,1))
+
+// 8x8 in register transpose of ZMM(R0..R7), used for column stored C; overwrites ZMM0..ZMM7 as scratch
+#define TRANSPOSE_8X8(R0, R1, R2, R3, R4, R5, R6, R7) \
+    /* Stage1: interleave the low doubles of adjacent row pairs; R0 and R4 are saved in ZMM0/ZMM1 */ \
+    VUNPCKLPD(ZMM(6), ZMM(R0), ZMM(R1)) \
+    VUNPCKLPD(ZMM(7), ZMM(R2), ZMM(R3)) \
+    VUNPCKLPD(ZMM(2), ZMM(R4), ZMM(R5)) \
+    VUNPCKLPD(ZMM(3), ZMM(R6), ZMM(R7)) \
+    VMOVUPD(ZMM(0), ZMM(R0)) \
+    VMOVUPD(ZMM(1), ZMM(R4)) \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0x88)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0x88)) \
+    /*Stage3  1,5*/ \
+    VSHUFF64X2(ZMM(R0), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R4), ZMM(4), ZMM(5), IMM(0xDD)) \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0xDD)) \
+    /*Stage3  3,7*/ \
+    VUNPCKHPD(ZMM(6), ZMM(0 ), ZMM(R1)) \
+    VUNPCKHPD(ZMM(7), ZMM(R2), ZMM(R3)) \
+    VUNPCKHPD(ZMM(2), ZMM(1 ), ZMM(R5)) \
+    VUNPCKHPD(ZMM(3), ZMM(R6), ZMM(R7)) \
+    VSHUFF64X2(ZMM(R2), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R6), ZMM(4), ZMM(5), IMM(0xDD)) \
+    \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0x88)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0x88)) \
+    /*Stage3  2,6*/ \
+    VSHUFF64X2(ZMM(R1), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R5), ZMM(4), ZMM(5), IMM(0xDD)) \
+    /*Stage2*/ \
+    VSHUFF64X2(ZMM(4), ZMM(6), ZMM(7), IMM(0xDD)) \
+    VSHUFF64X2(ZMM(5), ZMM(2), ZMM(3), IMM(0xDD)) \
+    /*Stage3  4,8*/ \
+    VSHUFF64X2(ZMM(R3), ZMM(4), ZMM(5), IMM(0x88)) \
+    VSHUFF64X2(ZMM(R7), ZMM(4), ZMM(5), IMM(0xDD)) \
+
+// Update C when C is column stored: store 8 registers to 8 consecutive columns, then RCX += 8*R10
+#define UPDATE_C_COL_STORE(R0, R1, R2, R3, R4, R5, R6, R7) \
+    /* the caller (COLUPDATE) sets R10 = cs_c, R11 = 3*cs_c, R12 = 5*cs_c, R13 = 7*cs_c, all in bytes */ \
+    VMOVUPD(MEM(RCX), ZMM(R0)) \
+    VMOVUPD(MEM(RCX, R10, 1), ZMM(R1)) \
+    VMOVUPD(MEM(RCX, R10, 2), ZMM(R2)) \
+    VMOVUPD(MEM(RCX, R11, 1), ZMM(R3)) \
+    VMOVUPD(MEM(RCX, R10, 4), ZMM(R4)) \
+    VMOVUPD(MEM(RCX, R12, 1), ZMM(R5)) \
+    VMOVUPD(MEM(RCX, R11, 2), ZMM(R6)) \
+    VMOVUPD(MEM(RCX, R13, 1), ZMM(R7)) \
+    LEA(RCX, MEM(RCX,R10,8))
+
+#define SUBITER(n) \
+    /* k step n: acc(zmm8..31) += 8 broadcast values from RAX times the B row in zmm0..2, then reload zmm0..2 from RBX */ \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 0)*8)) \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 1)*8)) \
+    VFMADD231PD(ZMM( 8), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM( 9), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(10), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 2)*8)) \
+    VFMADD231PD(ZMM(11), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(12), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(13), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 3)*8)) \
+    VFMADD231PD(ZMM(14), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM(15), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(16), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 4)*8)) \
+    VFMADD231PD(ZMM(17), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(18), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(19), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 5)*8)) \
+    VFMADD231PD(ZMM(20), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM(21), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(22), ZMM(2), ZMM(6)) \
+    \
+    VBROADCASTSD(ZMM(6), MEM(RAX,(8*n+ 6)*8)) \
+    VFMADD231PD(ZMM(23), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(24), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(25), ZMM(2), ZMM(7)) \
+    \
+    VBROADCASTSD(ZMM(7), MEM(RAX,(8*n+ 7)*8)) \
+    VFMADD231PD(ZMM(26), ZMM(0), ZMM(6)) \
+    VFMADD231PD(ZMM(27), ZMM(1), ZMM(6)) \
+    VFMADD231PD(ZMM(28), ZMM(2), ZMM(6)) \
+    \
+    VFMADD231PD(ZMM(29), ZMM(0), ZMM(7)) \
+    VFMADD231PD(ZMM(30), ZMM(1), ZMM(7)) \
+    VFMADD231PD(ZMM(31), ZMM(2), ZMM(7)) \
+    \
+    VMOVAPD(ZMM(0), MEM(RBX,(24*n+0)*8)) \
+    VMOVAPD(ZMM(1), MEM(RBX,(24*n+8)*8)) \
+    VMOVAPD(ZMM(2), MEM(RBX,(24*n+16)*8)) \
+
+// Read-only element indices 0..23 of one C row; scaled by cs_c (bytes) to form the VSCATTERQPD index vectors.
+static const int64_t offsets[24] __attribute__((aligned(64))) =
+    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23};
+
+/* DGEMMTRSM (upper) 8x24 kernel: computes alpha*B11 - A10*B01, back-substitutes with A11, stores to B11 and C11.
+ * number of accumulation registers = 24/8 * 8 = 24     zmm8 to zmm31
+ * number of registers used for load B = 24/8 = 3       zmm0 to zmm2
+ * number of registers used for broadcast A = 2         zmm6 and zmm7
+ */
+void bli_dgemmtrsm_u_zen4_asm_8x24
+     (
+       dim_t               k_,
+       double*    restrict alpha,
+       double*    restrict a10,
+       double*    restrict a11,
+       double*    restrict b01,
+       double*    restrict b11,
+       double*    restrict c11, inc_t rs_c_, inc_t cs_c_,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    const int64_t* offsetPtr = &offsets[0];
+    const int64_t k = k_;
+    const int64_t rs_c = rs_c_*8; //convert strides to bytes
+    const int64_t cs_c = cs_c_*8; //convert strides to bytes
+
+    BEGIN_ASM()
+
+    VXORPD(YMM(8) , YMM(8), YMM(8))
+    VXORPD(YMM(9) , YMM(9), YMM(9))
+    VXORPD(YMM(10), YMM(10), YMM(10)) //clear out registers
+    VXORPD(YMM(11), YMM(11), YMM(11)) //clear out registers
+    VMOVAPD(YMM(12), YMM(8))
+    VMOVAPD(YMM(13), YMM(8))
+    VMOVAPD(YMM(14), YMM(8))
+    VMOVAPD(YMM(15), YMM(8))
+    VXORPD(YMM(16), YMM(16), YMM(16)) //clear out registers
+    VXORPD(YMM(17), YMM(17), YMM(17)) //clear out registers
+    VMOVAPD(YMM(18), YMM(8))
+    VMOVAPD(YMM(19), YMM(8))
+    VMOVAPD(YMM(20), YMM(8))
+    VMOVAPD(YMM(21), YMM(8))
+    VXORPD(YMM(22), YMM(22), YMM(22)) //clear out registers
+    VXORPD(YMM(23), YMM(23), YMM(23)) //clear out registers
+    VMOVAPD(YMM(24), YMM(8))
+    VMOVAPD(YMM(25), YMM(8))
+    VMOVAPD(YMM(26), YMM(8))
+    VMOVAPD(YMM(27), YMM(8))
+    VXORPD(YMM(28), YMM(28), YMM(28)) //clear out registers
+    VXORPD(YMM(29), YMM(29), YMM(29)) //clear out registers
+    VMOVAPD(YMM(30), YMM(8))
+    VMOVAPD(YMM(31), YMM(8))
+
+    MOV(RSI, VAR(k)) //loop index
+    MOV(RAX, VAR(a10)) //load address of a10
+    MOV(RBX, VAR(b01)) //load address of b01
+    MOV(RCX, VAR(b11)) //load address of b11
+    MOV(R15, VAR(c11))
+
+    LEA(R9, MEM(R15,63)) // c for prefetching
+
+    VMOVAPD(ZMM(0), MEM(RBX, 0*8)) //pre-load b
+    VMOVAPD(ZMM(1), MEM(RBX, 8*8)) //pre-load b
+    VMOVAPD(ZMM(2), MEM(RBX,16*8)) //pre-load b
+
+    LEA(RBX, MEM(RBX,24*8)) //adjust b for pre-load
+
+    MOV(R12, VAR(rs_c))
+    MOV(R10, VAR(cs_c))
+
+    MOV(R11, IMM(8))  // prefetch loop count
+                      // r11 = 8  when c is row stored (one row per iteration)
+                      // r11 = 24 otherwise (one column per iteration)
+    MOV(R8, R12)      // prefetch loop increment
+                      // r8 = rs_c when c is row stored
+                      // r8 = cs_c otherwise
+    MOV(R13, IMM(64)) // r13 = 64 when c is row stored
+                      // r13 = 0  otherwise
+    CMP(R10, IMM(8))  // keep the defaults above if c is row stored (cs_c == 8 bytes)
+    JZ(POST_STRIDE)
+        MOV(R8 , R10) // r8 = cs_c  -  prefetch loop increment
+        MOV(R11, IMM(24)) // r11 = 24  -  prefetch loop count
+        MOV(R13, IMM(0)) // r13 = 0
+
+    LABEL(POST_STRIDE)
+
+    MOV(RDI, RSI) // RDI = k
+    AND(RSI, IMM(3)) // RSI = k & 3, RSI = k % 4
+    SAR(RDI, IMM(2)) // RDI = k >> 2, RDI = k / 4
+
+    SUB(RDI, R11) // subtract prefetch loop count
+    SUB(RDI, IMM(0+TAIL_NITER)) // '0+' needed for preprocessor
+    JLE(K_LE_80)
+
+        LOOP_ALIGN
+        LABEL(LOOP1)
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+
+        JNZ(LOOP1)
+
+    LABEL(K_LE_80)
+
+    ADD(RDI, R11) // add prefetch loop count
+    JLE(K_LE_26)
+
+        LOOP_ALIGN
+        LABEL(LOOP2)
+
+            PREFETCH(0, MEM(R9))
+            SUBITER(0)
+            PREFETCH(0, MEM(R9,R13, 1)) // prefetch R9+64 if row store,
+                                        // prefetch R9+0 otherwise
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            PREFETCH(0, MEM(R9,R13, 2)) // prefetch R9+128 if row store,
+                                        // prefetch R9+0 otherwise
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+            LEA(R9, MEM(R9,R8,1))       // r9 += rs_c if row store,
+                                        // r9 += cs_c otherwise
+
+        JNZ(LOOP2)
+
+    LABEL(K_LE_26)
+
+    ADD(RDI, IMM(0+TAIL_NITER))
+    JLE(TAIL)
+
+        LOOP_ALIGN
+        LABEL(LOOP3)
+
+            SUBITER(0)
+            SUBITER(1)
+            SUB(RDI, IMM(1))
+            SUBITER(2)
+            SUBITER(3)
+
+            LEA(RAX, MEM(RAX,4*8*8))
+            LEA(RBX, MEM(RBX,4*24*8))
+
+        JNZ(LOOP3)
+
+    LABEL(TAIL)
+
+    TEST(RSI, RSI)
+    JZ(POSTACCUM)
+
+        LOOP_ALIGN
+        LABEL(TAIL_LOOP)
+
+            SUB(RSI, IMM(1))
+            SUBITER(0)
+
+            LEA(RAX, MEM(RAX,8*8))
+            LEA(RBX, MEM(RBX,24*8))
+
+        JNZ(TAIL_LOOP)
+
+    LABEL(POSTACCUM)
+
+    MOV(RBX, VAR(alpha))
+    VBROADCASTSD(ZMM(3), MEM(RBX))
+    MOV(RSI, IMM(1*8))
+
+    LEA(RAX, MEM(RCX, RSI, 8)) // RAX = b11 + 64 bytes  (columns  8..15)
+    LEA(RDX, MEM(RAX, RSI, 8)) // RDX = b11 + 128 bytes (columns 16..23)
+
+    MOV(R13, RCX)
+    MOV(R14, RAX)
+    MOV(R15, RDX)
+
+// #region - TRSM
+    MOV(RDI, IMM(24 * 8)) // RDI = byte stride between rows of b11
+
+    VFMSUB231PD(ZMM(8 ), ZMM(3), MEM(RCX)) // acc = alpha * b11 - acc
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(9 ), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(10), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(11), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(12), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(13), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(14), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(15), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(16), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(17), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(18), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(19), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(20), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(21), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(22), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(23), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(24), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(25), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(26), ZMM(3), MEM(RCX))
+    ADD(RCX, RDI)
+    VFMSUB231PD(ZMM(27), ZMM(3), MEM(RAX))
+    ADD(RAX, RDI)
+    VFMSUB231PD(ZMM(28), ZMM(3), MEM(RDX))
+    ADD(RDX, RDI)
+
+    VFMSUB231PD(ZMM(29), ZMM(3), MEM(RCX))
+    VFMSUB231PD(ZMM(30), ZMM(3), MEM(RAX))
+    VFMSUB231PD(ZMM(31), ZMM(3), MEM(RDX))
+
+    LEA(RAX, MEM(RDI, RDI, 2)) // RAX = 3 * RDI
+    LEA(RAX, MEM(RAX, RDI, 4)) // RAX = 7 * RDI, byte offset of row 7 of b11
+    ADD(R13, RAX)
+    ADD(R14, RAX)
+    ADD(R15, RAX)
+
+    MOV(RAX, VAR(a11))
+    //iteration 0 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (7+7*8)*8))
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(29), ZMM(29), ZMM(0))
+        VMULPD(ZMM(30), ZMM(30), ZMM(0))
+        VMULPD(ZMM(31), ZMM(31), ZMM(0))
+    #else
+        VDIVPD(ZMM(29), ZMM(29), ZMM(0))
+        VDIVPD(ZMM(30), ZMM(30), ZMM(0))
+        VDIVPD(ZMM(31), ZMM(31), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(29))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(30))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(31))
+    SUB(R15, RDI)
+
+    //iteration 1 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (6+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (6+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VSUBPD(ZMM(26), ZMM(26), ZMM(2))
+    VSUBPD(ZMM(27), ZMM(27), ZMM(3))
+    VSUBPD(ZMM(28), ZMM(28), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(26), ZMM(26), ZMM(1))
+        VMULPD(ZMM(27), ZMM(27), ZMM(1))
+        VMULPD(ZMM(28), ZMM(28), ZMM(1))
+    #else
+        VDIVPD(ZMM(26), ZMM(26), ZMM(1))
+        VDIVPD(ZMM(27), ZMM(27), ZMM(1))
+        VDIVPD(ZMM(28), ZMM(28), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(26))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(27))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(28))
+    SUB(R15, RDI)
+
+    //iteration 2 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (5+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (5+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (5+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(1))
+
+    VSUBPD(ZMM(23), ZMM(23), ZMM(2))
+    VSUBPD(ZMM(24), ZMM(24), ZMM(3))
+    VSUBPD(ZMM(25), ZMM(25), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(23), ZMM(23), ZMM(0))
+        VMULPD(ZMM(24), ZMM(24), ZMM(0))
+        VMULPD(ZMM(25), ZMM(25), ZMM(0))
+    #else
+        VDIVPD(ZMM(23), ZMM(23), ZMM(0))
+        VDIVPD(ZMM(24), ZMM(24), ZMM(0))
+        VDIVPD(ZMM(25), ZMM(25), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(23))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(24))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(25))
+    SUB(R15, RDI)
+
+    //iteration 3 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (4+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (4+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (4+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (4+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(0))
+
+    VSUBPD(ZMM(20), ZMM(20), ZMM(2))
+    VSUBPD(ZMM(21), ZMM(21), ZMM(3))
+    VSUBPD(ZMM(22), ZMM(22), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(20), ZMM(20), ZMM(1))
+        VMULPD(ZMM(21), ZMM(21), ZMM(1))
+        VMULPD(ZMM(22), ZMM(22), ZMM(1))
+    #else
+        VDIVPD(ZMM(20), ZMM(20), ZMM(1))
+        VDIVPD(ZMM(21), ZMM(21), ZMM(1))
+        VDIVPD(ZMM(22), ZMM(22), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(20))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(21))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(22))
+    SUB(R15, RDI)
+
+    //iteration 4 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (3+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (3+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (3+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (3+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (3+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(1))
+
+    VSUBPD(ZMM(17), ZMM(17), ZMM(2))
+    VSUBPD(ZMM(18), ZMM(18), ZMM(3))
+    VSUBPD(ZMM(19), ZMM(19), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(17), ZMM(17), ZMM(0))
+        VMULPD(ZMM(18), ZMM(18), ZMM(0))
+        VMULPD(ZMM(19), ZMM(19), ZMM(0))
+    #else
+        VDIVPD(ZMM(17), ZMM(17), ZMM(0))
+        VDIVPD(ZMM(18), ZMM(18), ZMM(0))
+        VDIVPD(ZMM(19), ZMM(19), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(17))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(18))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(19))
+    SUB(R15, RDI)
+
+    //iteration 5 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (2+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (2+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (2+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (2+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (2+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (2+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(0))
+
+    VSUBPD(ZMM(14), ZMM(14), ZMM(2))
+    VSUBPD(ZMM(15), ZMM(15), ZMM(3))
+    VSUBPD(ZMM(16), ZMM(16), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(14), ZMM(14), ZMM(1))
+        VMULPD(ZMM(15), ZMM(15), ZMM(1))
+        VMULPD(ZMM(16), ZMM(16), ZMM(1))
+    #else
+        VDIVPD(ZMM(14), ZMM(14), ZMM(1))
+        VDIVPD(ZMM(15), ZMM(15), ZMM(1))
+        VDIVPD(ZMM(16), ZMM(16), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(14))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(15))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(16))
+    SUB(R15, RDI)
+
+    //iteration 6 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (1+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (1+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (1+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (1+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (1+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (1+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (1+1*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(1))
+
+    VSUBPD(ZMM(11), ZMM(11), ZMM(2))
+    VSUBPD(ZMM(12), ZMM(12), ZMM(3))
+    VSUBPD(ZMM(13), ZMM(13), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM(11), ZMM(11), ZMM(0))
+        VMULPD(ZMM(12), ZMM(12), ZMM(0))
+        VMULPD(ZMM(13), ZMM(13), ZMM(0))
+    #else
+        VDIVPD(ZMM(11), ZMM(11), ZMM(0))
+        VDIVPD(ZMM(12), ZMM(12), ZMM(0))
+        VDIVPD(ZMM(13), ZMM(13), ZMM(0))
+    #endif
+    VMOVUPD(MEM(R13), ZMM(11))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM(12))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(13))
+    SUB(R15, RDI)
+
+    //iteration 7 --------------------------------------------
+    VBROADCASTSD(ZMM(0), MEM(RAX, (0+7*8)*8))
+    VBROADCASTSD(ZMM(1), MEM(RAX, (0+6*8)*8))
+
+    VMULPD(ZMM(2), ZMM(29), ZMM(0))
+    VMULPD(ZMM(3), ZMM(30), ZMM(0))
+    VMULPD(ZMM(4), ZMM(31), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (0+5*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(26), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(27), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(28), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (0+4*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(23), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(24), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(25), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (0+3*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(20), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(21), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(22), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (0+2*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(17), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(18), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(19), ZMM(0))
+
+    VBROADCASTSD(ZMM(0), MEM(RAX, (0+1*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(14), ZMM(1))
+    VFMADD231PD(ZMM(3), ZMM(15), ZMM(1))
+    VFMADD231PD(ZMM(4), ZMM(16), ZMM(1))
+
+    VBROADCASTSD(ZMM(1), MEM(RAX, (0+0*8)*8))
+    VFMADD231PD(ZMM(2), ZMM(11), ZMM(0))
+    VFMADD231PD(ZMM(3), ZMM(12), ZMM(0))
+    VFMADD231PD(ZMM(4), ZMM(13), ZMM(0))
+
+    VSUBPD(ZMM( 8), ZMM( 8), ZMM(2))
+    VSUBPD(ZMM( 9), ZMM( 9), ZMM(3))
+    VSUBPD(ZMM(10), ZMM(10), ZMM(4))
+
+    #ifdef BLIS_ENABLE_TRSM_PREINVERSION
+        VMULPD(ZMM( 8), ZMM( 8), ZMM(1))
+        VMULPD(ZMM( 9), ZMM( 9), ZMM(1))
+        VMULPD(ZMM(10), ZMM(10), ZMM(1))
+    #else
+        VDIVPD(ZMM( 8), ZMM( 8), ZMM(1))
+        VDIVPD(ZMM( 9), ZMM( 9), ZMM(1))
+        VDIVPD(ZMM(10), ZMM(10), ZMM(1))
+    #endif
+    VMOVUPD(MEM(R13), ZMM( 8))
+    SUB(R13, RDI)
+    VMOVUPD(MEM(R14), ZMM( 9))
+    SUB(R14, RDI)
+    VMOVUPD(MEM(R15), ZMM(10))
+    SUB(R15, RDI)
+
+// #endregion - trsm
+
+    MOV(RAX, R12) // RAX = rs_c (bytes)
+    MOV(RBX, R10) // RBX = cs_c (bytes)
+    MOV(RCX, VAR(c11))
+
+    CMP(RAX, IMM(8)) // rs_c == 8 bytes: c is column stored
+    JE(COLUPDATE)
+
+    CMP(RBX, IMM(8)) // cs_c == 8 bytes: c is row stored
+    JE(ROWUPDATE)
+
+    LABEL(SCATTERUPDATE)
+        MOV(RDI, VAR(offsetPtr))
+        VPBROADCASTQ(ZMM(0), R10)
+        VPMULLQ(ZMM(2), ZMM(0), MEM(RDI))
+        VPMULLQ(ZMM(3), ZMM(0), MEM(RDI, 8*8))
+        VPMULLQ(ZMM(4), ZMM(0), MEM(RDI,16*8))
+        UPDATE_C_SCATTERED( 8,  9, 10)
+        UPDATE_C_SCATTERED(11, 12, 13)
+        UPDATE_C_SCATTERED(14, 15, 16)
+        UPDATE_C_SCATTERED(17, 18, 19)
+        UPDATE_C_SCATTERED(20, 21, 22)
+        UPDATE_C_SCATTERED(23, 24, 25)
+        UPDATE_C_SCATTERED(26, 27, 28)
+        UPDATE_C_SCATTERED(29, 30, 31)
+        JMP(END)
+    LABEL(ROWUPDATE)
+        LEA(R11, MEM(RAX, RAX, 2)) //R11 = rs_c * 3, R11 = rs_c + rs_c * 2
+        LEA(R12, MEM(RAX, RAX, 4)) //R12 = rs_c * 5, R12 = rs_c + rs_c * 4
+        LEA(R13, MEM(RAX, R11, 2)) //R13 = rs_c * 7, R13 = rs_c + R11 * 2
+
+        // ROW0
+        VMOVUPD(MEM(RCX     ), ZMM( 8))
+        VMOVUPD(MEM(RCX, 64 ), ZMM( 9))
+        VMOVUPD(MEM(RCX, 128), ZMM(10))
+
+        // ROW1
+        VMOVUPD(MEM(RCX, RAX, 1,    ), ZMM(11))
+        VMOVUPD(MEM(RCX, RAX, 1, 64 ), ZMM(12))
+        VMOVUPD(MEM(RCX, RAX, 1, 128), ZMM(13))
+
+        // ROW2
+        VMOVUPD(MEM(RCX, RAX, 2,    ), ZMM(14))
+        VMOVUPD(MEM(RCX, RAX, 2, 64 ), ZMM(15))
+        VMOVUPD(MEM(RCX, RAX, 2, 128), ZMM(16))
+
+        // ROW3
+        VMOVUPD(MEM(RCX, R11, 1,    ), ZMM(17))
+        VMOVUPD(MEM(RCX, R11, 1, 64 ), ZMM(18))
+        VMOVUPD(MEM(RCX, R11, 1, 128), ZMM(19))
+
+        // ROW4
+        VMOVUPD(MEM(RCX, RAX, 4,    ), ZMM(20))
+        VMOVUPD(MEM(RCX, RAX, 4, 64 ), ZMM(21))
+        VMOVUPD(MEM(RCX, RAX, 4, 128), ZMM(22))
+
+        // ROW5
+        VMOVUPD(MEM(RCX, R12, 1,    ), ZMM(23))
+        VMOVUPD(MEM(RCX, R12, 1, 64 ), ZMM(24))
+        VMOVUPD(MEM(RCX, R12, 1, 128), ZMM(25))
+
+        // ROW6
+        VMOVUPD(MEM(RCX, R11, 2,    ), ZMM(26))
+        VMOVUPD(MEM(RCX, R11, 2, 64 ), ZMM(27))
+        VMOVUPD(MEM(RCX, R11, 2, 128), ZMM(28))
+
+        // ROW7
+        VMOVUPD(MEM(RCX, R13, 1,    ), ZMM(29))
+        VMOVUPD(MEM(RCX, R13, 1, 64 ), ZMM(30))
+        VMOVUPD(MEM(RCX, R13, 1, 128), ZMM(31))
+        JMP(END)
+
+    LABEL(COLUPDATE)
+        LEA(R11, MEM(R10, R10, 2)) //R11 = cs_c * 3
+        LEA(R12, MEM(R10, R10, 4)) //R12 = cs_c * 5
+        LEA(R13, MEM(R10, R11, 2)) //R13 = cs_c * 7
+        TRANSPOSE_8X8( 8, 11, 14, 17, 20, 23, 26, 29)
+        TRANSPOSE_8X8( 9, 12, 15, 18, 21, 24, 27, 30)
+        TRANSPOSE_8X8(10, 13, 16, 19, 22, 25, 28, 31)
+        UPDATE_C_COL_STORE( 8, 11, 14, 17, 20, 23, 26, 29)
+        UPDATE_C_COL_STORE( 9, 12, 15, 18, 21, 24, 27, 30)
+        UPDATE_C_COL_STORE(10, 13, 16, 19, 22, 25, 28, 31)
+
+    LABEL(END)
+
+    VZEROUPPER()
+
+    END_ASM(
+        : // output operands (none)
+        : // input operands
+        [a10]       "m" (a10),
+        [k]         "m" (k),
+        [b01]       "m" (b01),
+        [a11]       "m" (a11),
+        [b11]       "m" (b11),
+        [c11]       "m" (c11),
+        [rs_c]      "m" (rs_c),
+        [cs_c]      "m" (cs_c),
+        [alpha]     "m" (alpha),
+        [offsetPtr] "m" (offsetPtr)
+        : // register clobber list (k1-k3 are written by UPDATE_C_SCATTERED)
+          "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
+          "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
+          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
+          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
+          "zmm30", "zmm31", "k1", "k2", "k3", "memory"
+    )
+}
diff --git a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c
index 787f85155c..401c6e7d23 100644
--- a/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c
+++ b/kernels/zen4/3/bli_gemmtrsm_u_zen_16x14.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -1696,11 +1696,15 @@ void bli_dgemmtrsm_u_zen_asm_16x14
         : // register clobber list
           "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
           "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
+          "xmm0", "xmm1",
+          "ymm0", "ymm1", "ymm4", "ymm6", "ymm20", "ymm21", "ymm22", "ymm23",
+          "ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
           "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
           "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
           "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
-          "zmm30", "zmm31", "memory"
+          "zmm30", "zmm31",
+          "k0", "k1", "k2", "k3", "k4", "memory"
         )
 
         AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_9);
-}
\ No newline at end of file
+}
diff --git a/kernels/zen4/3/bli_trsm_small_AVX512.c b/kernels/zen4/3/bli_trsm_small_AVX512.c
new file mode 100644
index 0000000000..82431bd6a2
--- /dev/null
+++ b/kernels/zen4/3/bli_trsm_small_AVX512.c
@@ -0,0 +1,11014 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+  - Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+  - Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in the
+    documentation and/or other materials provided with the distribution.
+  - Neither the name(s) of the copyright holder(s) nor the names of its
+    contributors may be used to endorse or promote products derived
+    from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+#include "blis.h"
+#include "bli_trsm_small_ref.h"
+
+#ifdef BLIS_ENABLE_SMALL_MATRIX_TRSM
+#include "immintrin.h"
+#define BLIS_ENABLE_PREFETCH_IN_TRSM_SMALL
+
+
+
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION /* scale step is a division (the diagonal packer stores the diagonal as-is) */
+  #define DTRSM_SMALL_DIV_OR_SCALE _mm256_div_pd
+  #define DTRSM_SMALL_DIV_OR_SCALE_AVX512 _mm512_div_pd
+#endif
+
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION /* scale step is a multiply (the diagonal packer stores 1/diagonal) */
+  #define DTRSM_SMALL_DIV_OR_SCALE _mm256_mul_pd
+  #define DTRSM_SMALL_DIV_OR_SCALE_AVX512 _mm512_mul_pd
+#endif
+
+#define BLIS_SET_YMM_REG_ZEROS_AVX512 /* zeroes ymm0..ymm31; these must already be declared where the macro is expanded */ \
+  ymm0  = _mm256_setzero_pd(); \
+  ymm1  = _mm256_setzero_pd(); \
+  ymm2  = _mm256_setzero_pd(); \
+  ymm3  = _mm256_setzero_pd(); \
+  ymm4  = _mm256_setzero_pd(); \
+  ymm5  = _mm256_setzero_pd(); \
+  ymm6  = _mm256_setzero_pd(); \
+  ymm7  = _mm256_setzero_pd(); \
+  ymm8  = _mm256_setzero_pd(); \
+  ymm9  = _mm256_setzero_pd(); \
+  ymm10 = _mm256_setzero_pd(); \
+  ymm11 = _mm256_setzero_pd(); \
+  ymm12 = _mm256_setzero_pd(); \
+  ymm13 = _mm256_setzero_pd(); \
+  ymm14 = _mm256_setzero_pd(); \
+  ymm15 = _mm256_setzero_pd(); \
+  ymm16 = _mm256_setzero_pd(); \
+  ymm17 = _mm256_setzero_pd(); \
+  ymm18 = _mm256_setzero_pd(); \
+  ymm19 = _mm256_setzero_pd(); \
+  ymm20 = _mm256_setzero_pd(); \
+  ymm21 = _mm256_setzero_pd(); \
+  ymm22 = _mm256_setzero_pd(); \
+  ymm23 = _mm256_setzero_pd(); \
+  ymm24 = _mm256_setzero_pd(); \
+  ymm25 = _mm256_setzero_pd(); \
+  ymm26 = _mm256_setzero_pd(); \
+  ymm27 = _mm256_setzero_pd(); \
+  ymm28 = _mm256_setzero_pd(); \
+  ymm29 = _mm256_setzero_pd(); \
+  ymm30 = _mm256_setzero_pd(); \
+  ymm31 = _mm256_setzero_pd();
+
+#define BLIS_SET_YMM_REG_ZEROS_FOR_LEFT /* zeroes ymm8..ymm16; these must already be declared where the macro is expanded */ \
+  ymm8  = _mm256_setzero_pd(); \
+  ymm9  = _mm256_setzero_pd(); \
+  ymm10 = _mm256_setzero_pd(); \
+  ymm11 = _mm256_setzero_pd(); \
+  ymm12 = _mm256_setzero_pd(); \
+  ymm13 = _mm256_setzero_pd(); \
+  ymm14 = _mm256_setzero_pd(); \
+  ymm15 = _mm256_setzero_pd(); \
+  ymm16 = _mm256_setzero_pd();
+
+#define BLIS_SET_ZMM_REG_ZEROS /* zeroes zmm0..zmm31; these must already be declared where the macro is expanded */ \
+  zmm0 = _mm512_setzero_pd(); \
+  zmm1 = _mm512_setzero_pd(); \
+  zmm2 = _mm512_setzero_pd(); \
+  zmm3 = _mm512_setzero_pd(); \
+  zmm4 = _mm512_setzero_pd(); \
+  zmm5 = _mm512_setzero_pd(); \
+  zmm6 = _mm512_setzero_pd(); \
+  zmm7 = _mm512_setzero_pd(); \
+  zmm8 = _mm512_setzero_pd(); \
+  zmm9 = _mm512_setzero_pd(); \
+  zmm10 = _mm512_setzero_pd(); \
+  zmm11 = _mm512_setzero_pd(); \
+  zmm12 = _mm512_setzero_pd(); \
+  zmm13 = _mm512_setzero_pd(); \
+  zmm14 = _mm512_setzero_pd(); \
+  zmm15 = _mm512_setzero_pd(); \
+  zmm16 = _mm512_setzero_pd(); \
+  zmm17 = _mm512_setzero_pd(); \
+  zmm18 = _mm512_setzero_pd(); \
+  zmm19 = _mm512_setzero_pd(); \
+  zmm20 = _mm512_setzero_pd(); \
+  zmm21 = _mm512_setzero_pd(); \
+  zmm22 = _mm512_setzero_pd(); \
+  zmm23 = _mm512_setzero_pd(); \
+  zmm24 = _mm512_setzero_pd(); \
+  zmm25 = _mm512_setzero_pd(); \
+  zmm26 = _mm512_setzero_pd(); \
+  zmm27 = _mm512_setzero_pd(); \
+  zmm28 = _mm512_setzero_pd(); \
+  zmm29 = _mm512_setzero_pd(); \
+  zmm30 = _mm512_setzero_pd(); \
+  zmm31 = _mm512_setzero_pd();
+
+#define BLIS_SET_YMM_REG_ZEROS_FOR_N_REM /* zeroes ymm3..ymm10 and ymm15; these must already be declared where the macro is expanded */ \
+  ymm3 = _mm256_setzero_pd(); \
+  ymm4 = _mm256_setzero_pd(); \
+  ymm5 = _mm256_setzero_pd(); \
+  ymm6 = _mm256_setzero_pd(); \
+  ymm7 = _mm256_setzero_pd(); \
+  ymm8 = _mm256_setzero_pd(); \
+  ymm9 = _mm256_setzero_pd(); \
+  ymm10 = _mm256_setzero_pd(); \
+  ymm15 = _mm256_setzero_pd();
+
+/*
+   Function-pointer type of the trsm small kernels stored in ker_fps_AVX512
+*/
+typedef err_t (*trsmsmall_ker_ft)
+     (
+       obj_t*   AlphaObj, // alpha, as passed to bli_trsm_small_AVX512
+       obj_t*   a,        // matrix A object
+       obj_t*   b,        // matrix B object
+       cntx_t*  cntx,     // forwarded as-is; the multi-threaded driver passes NULL
+       cntl_t*  cntl      // forwarded as-is; the multi-threaded driver passes NULL
+     );
+
+
+/*
+  Pack 8-wide tiles from inbuf into pbuff, either copied as they are or
+  transposed 8x8 (done as four 4x4 transposes), depending on side and trans
+*/
+BLIS_INLINE void bli_dtrsm_small_pack_avx512
+     (
+       char     side,
+       dim_t    size,
+       bool     trans,
+       double*  inbuf,
+       dim_t    cs_a,
+       double*  pbuff,
+       dim_t    p_lda,
+       dim_t    mr
+     )
+{
+  // scratch registers (used by the non-transposed copy paths)
+  __m512d zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+  if (side == 'L' || side == 'l')
+  {
+    /*Left case is 8xk*/
+    if (trans)
+    {
+      __m256d ymm0, ymm1, ymm2, ymm3;
+      __m256d ymm4, ymm5, ymm6, ymm7;
+      __m256d ymm8, ymm9, ymm10, ymm11;
+      __m256d ymm12, ymm13;
+      for (dim_t x = 0; x < size; x += mr)
+      { // each pass transposes one 8x8 tile; inbuf advances by mr, pbuff by mr * mr
+        ymm0 = _mm256_loadu_pd((double const *)(inbuf));
+        ymm10 = _mm256_loadu_pd((double const *)(inbuf + 4));
+        ymm1 = _mm256_loadu_pd((double const *)(inbuf + cs_a));
+        ymm11 = _mm256_loadu_pd((double const *)(inbuf + 4 + cs_a));
+        ymm2 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 2));
+        ymm12 = _mm256_loadu_pd((double const *)(inbuf + 4 + cs_a * 2));
+        ymm3 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 3));
+        ymm13 = _mm256_loadu_pd((double const *)(inbuf + 4 + cs_a * 3));
+        // transpose the 4x4 block in ymm0..ymm3 (elements 0-3 of columns 0-3)
+        ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+        ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+        ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        _mm256_storeu_pd((double *)(pbuff), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + p_lda), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + p_lda * 2), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + p_lda * 3), ymm9);
+        // same for the 4x4 block in ymm10..ymm13 (elements 4-7 of columns 0-3)
+        ymm4 = _mm256_unpacklo_pd(ymm10, ymm11);
+        ymm5 = _mm256_unpacklo_pd(ymm12, ymm13);
+
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+        ymm0 = _mm256_unpackhi_pd(ymm10, ymm11);
+        ymm1 = _mm256_unpackhi_pd(ymm12, ymm13);
+
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        _mm256_storeu_pd((double *)(pbuff + p_lda * 4), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + p_lda * 5), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + p_lda * 6), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + p_lda * 7), ymm9);
+        // second half: columns 4-7, stored at offset 4 inside each packed column
+        ymm0 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 4));
+        ymm10 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 4 + 4));
+        ymm1 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 5));
+        ymm11 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 5 + 4));
+        ymm2 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 6));
+        ymm12 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 6 + 4));
+        ymm3 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 7));
+        ymm13 = _mm256_loadu_pd((double const *)(inbuf + cs_a * 7 + 4));
+
+        ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+        ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+        ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        _mm256_storeu_pd((double *)(pbuff + 4), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda * 2), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda * 3), ymm9);
+
+        ymm4 = _mm256_unpacklo_pd(ymm10, ymm11);
+        ymm5 = _mm256_unpacklo_pd(ymm12, ymm13);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm10, ymm11);
+        ymm1 = _mm256_unpackhi_pd(ymm12, ymm13);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda * 4), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda * 5), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda * 6), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + 4 + p_lda * 7), ymm9);
+
+        inbuf += mr;
+        pbuff += mr * mr;
+      }
+    }
+    else
+        for (dim_t x = 0; x < size; x++)
+        { // plain copy: 8 doubles per pass, stepping inbuf by cs_a and pbuff by p_lda
+          zmm0 = _mm512_loadu_pd((double const *)(inbuf));
+          _mm512_storeu_pd((double *)(pbuff), zmm0);
+          inbuf += cs_a;
+          pbuff += p_lda;
+    }
+  }
+  else if (side == 'R' || side == 'r')
+  {
+    if (trans)
+    {
+      /*
+          ----------------   -------------
+          |           |      |     |     |
+          |    4x8    |      |     |     |
+          -------------  ==> | 8x4 | 8x4 |
+          |    4x8    |      |     |     |
+          |           |      |     |     |
+          ----------------   -------------
+      */
+      __m256d ymm0, ymm1, ymm2, ymm3;
+      __m256d ymm4, ymm5, ymm6, ymm7;
+      __m256d ymm8, ymm9, ymm10, ymm11;
+      __m256d ymm12, ymm13;
+      for (dim_t x = 0; x < p_lda; x += mr)
+      {
+        // load columns 0-3, 8 elements each (two ymm per column)
+        ymm0 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 0)));
+        ymm1 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 1)));
+        ymm2 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 2)));
+        ymm3 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 3)));
+        ymm10 = _mm256_loadu_pd((double const *)(inbuf + 4 + (cs_a * 0)));
+        ymm11 = _mm256_loadu_pd((double const *)(inbuf + 4 + (cs_a * 1)));
+        ymm12 = _mm256_loadu_pd((double const *)(inbuf + 4 + (cs_a * 2)));
+        ymm13 = _mm256_loadu_pd((double const *)(inbuf + 4 + (cs_a * 3)));
+
+        // transpose 4x4
+        ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+        ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+        ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        // store 4x4
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 0)), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 1)), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 2)), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 3)), ymm9);
+
+        // transpose 4x4
+        ymm4 = _mm256_unpacklo_pd(ymm10, ymm11);
+        ymm5 = _mm256_unpacklo_pd(ymm12, ymm13);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm10, ymm11);
+        ymm1 = _mm256_unpackhi_pd(ymm12, ymm13);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        // store 4x4
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 4)), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 5)), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 6)), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + (p_lda * 7)), ymm9);
+
+        // load columns 4-7, 8 elements each (two ymm per column)
+        ymm0 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 4)));
+        ymm1 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 5)));
+        ymm2 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 6)));
+        ymm3 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 7)));
+        ymm10 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 4) + 4));
+        ymm11 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 5) + 4));
+        ymm12 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 6) + 4));
+        ymm13 = _mm256_loadu_pd((double const *)(inbuf + (cs_a * 7) + 4));
+
+        // transpose 4x4
+        ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+        ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+        ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        // store 4x4
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 0)), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 1)), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 2)), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 3)), ymm9);
+
+        // transpose 4x4
+        ymm4 = _mm256_unpacklo_pd(ymm10, ymm11);
+        ymm5 = _mm256_unpacklo_pd(ymm12, ymm13);
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+        ymm0 = _mm256_unpackhi_pd(ymm10, ymm11);
+        ymm1 = _mm256_unpackhi_pd(ymm12, ymm13);
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        // store 4x4
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 4)), ymm6);
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 5)), ymm7);
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 6)), ymm8);
+        _mm256_storeu_pd((double *)(pbuff + 4 + (p_lda * 7)), ymm9);
+
+        inbuf += mr * cs_a;
+        pbuff += mr;
+      }
+    }
+    else
+    {
+      dim_t size_div_8 = size >> 3;
+      for (int i = 0; i < size_div_8; i++)
+      { // copy 8 doubles from each of 8 columns, then advance both buffers by 8 elements
+        zmm0 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 0)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 0)), zmm0);
+        zmm1 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 1)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 1)), zmm1);
+        zmm2 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 2)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 2)), zmm2);
+        zmm3 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 3)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 3)), zmm3);
+        zmm4 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 4)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 4)), zmm4);
+        zmm5 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 5)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 5)), zmm5);
+        zmm6 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 6)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 6)), zmm6);
+        zmm7 = _mm512_loadu_pd((double const *)(inbuf + (cs_a * 7)));
+        _mm512_storeu_pd((double *)(pbuff + (p_lda * 7)), zmm7);
+        inbuf += 8;
+        pbuff += 8;
+      }
+    }
+  }
+}
+/*
+  Gather the 8 diagonal entries of an A11 block into d11_pack so that they
+  sit next to each other in memory for the TRSM kernels.
+
+  is_unitdiag - when true, a11 is not read and eight 1.0 values are stored
+  a11         - start of the block; diagonal entry i is a11[i*cs_a + i]
+  cs_a        - element stride applied per diagonal step (column stride, by its name)
+  d11_pack    - receives 8 doubles through an unaligned store
+  size        - not used; 8 entries are always packed
+
+  With BLIS_ENABLE_TRSM_PREINVERSION the reciprocals 1.0/diag are stored,
+  with BLIS_DISABLE_TRSM_PREINVERSION the diagonal values themselves; if
+  neither macro is defined, 1.0 values are stored.
+*/
+BLIS_INLINE void dtrsm_small_pack_diag_element_avx512
+     (
+       bool     is_unitdiag,
+       double*  a11,
+       dim_t    cs_a,
+       double*  d11_pack,
+       dim_t    size
+     )
+{
+  // Starts as all ones; this is also the final value for a unit diagonal,
+  // and it stays untouched when neither pre-inversion macro is defined.
+  __m512d packed = _mm512_set1_pd(1.0);
+
+  if (is_unitdiag)
+  {
+    // Unit diagonal: nothing is read from a11.
+    _mm512_storeu_pd((double *)(d11_pack), packed);
+    return;
+  }
+
+  // _mm512_set_pd takes the highest element first, so lane i holds A11(i, i).
+  const __m512d diag = _mm512_set_pd(
+      *(a11 + (cs_a * 7) + 7),
+      *(a11 + (cs_a * 6) + 6),
+      *(a11 + (cs_a * 5) + 5),
+      *(a11 + (cs_a * 4) + 4),
+      *(a11 + (cs_a * 3) + 3),
+      *(a11 + (cs_a * 2) + 2),
+      *(a11 + (cs_a * 1) + 1),
+      *(a11 + (cs_a * 0) + 0));
+
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+  // No pre-inversion: store the diagonal as it is.
+  packed = diag;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+  // Pre-inversion: divide the current contents (ones) by the diagonal.
+  packed = _mm512_div_pd(packed, diag);
+#endif
+
+  _mm512_storeu_pd((double *)(d11_pack), packed);
+}
+/*
+ * Kernels Table, looked up as ker_fps_AVX512[dt][keridx] with keridx = (side bit << 2) | (upper << 1) | transposed
+ */
+trsmsmall_ker_ft ker_fps_AVX512[4][8] =
+  {
+    {NULL, // row 0: no kernels registered
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL},
+    {NULL, // row 1: no kernels registered
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL},
+    {bli_dtrsm_small_AutXB_AlXB_AVX512, // row 2 (presumably BLIS_DOUBLE); keridx 0: side bit clear, lower, no transpose
+     bli_dtrsm_small_AltXB_AuXB_AVX512, // keridx 1: side bit clear, lower, transposed
+     bli_dtrsm_small_AltXB_AuXB_AVX512, // keridx 2: side bit clear, upper, no transpose
+     bli_dtrsm_small_AutXB_AlXB_AVX512, // keridx 3: side bit clear, upper, transposed
+     bli_dtrsm_small_XAutB_XAlB_AVX512, // keridx 4: side bit set, lower, no transpose
+     bli_dtrsm_small_XAltB_XAuB_AVX512, // keridx 5: side bit set, lower, transposed
+     bli_dtrsm_small_XAltB_XAuB_AVX512, // keridx 6: side bit set, upper, no transpose
+     bli_dtrsm_small_XAutB_XAlB_AVX512}, // keridx 7: side bit set, upper, transposed
+    {NULL, // row 3: no kernels registered
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL,
+     NULL},
+};
+/*
+* bli_trsm_small_AVX512 runs a version of TRSM where A is packed and reused,
+* by dispatching to the kernel registered in ker_fps_AVX512.
+*
+* Input:  A: MxM (triangular matrix)
+*         B: MxN matrix
+* Output: X: MxN matrix such that
+*         AX = alpha*B or XA = alpha*B or A'X = alpha*B or XA' = alpha*B
+* Here the output X is stored in B
+*
+* Returns the kernel's status, or an error code without calling a kernel when
+* the datatype is not double, alpha is zero, A or B has a non-unit row stride
+* (only column-major storage is handled), or A is not triangular.
+*/
+err_t bli_trsm_small_AVX512
+     (
+       side_t   side,
+       obj_t*   alpha,
+       obj_t*   a,
+       obj_t*   b,
+       cntx_t*  cntx,
+       cntl_t*  cntl,
+       bool     is_parallel
+     )
+{
+  // is_parallel is part of the shared signature but is not consulted here.
+  (void)is_parallel;
+
+  bool uplo = bli_obj_is_upper(a);
+  bool transa = bli_obj_has_trans(a);
+  num_t dt = bli_obj_dt(a);
+
+  // Only BLIS_DOUBLE is handled; every other datatype is rejected up front.
+  if (dt != BLIS_DOUBLE)
+  {
+    return BLIS_NOT_YET_IMPLEMENTED;
+  }
+
+  /* If alpha is zero the solution is the zero matrix. Scaling B by alpha is
+     not implemented in this path, so the request is reported as unsupported */
+  if (bli_obj_equals(alpha, &BLIS_ZERO))
+  {
+    return BLIS_NOT_YET_IMPLEMENTED;
+  }
+
+  // Return if inputs are row major as currently
+  // we are supporting col major only
+  if ((bli_obj_row_stride(a) != 1) ||
+    (bli_obj_row_stride(b) != 1))
+  {
+    return BLIS_INVALID_ROW_STRIDE;
+  }
+
+  // A is expected to be triangular in trsm
+  if (!bli_obj_is_upper_or_lower(a))
+  {
+    return BLIS_EXPECTED_TRIANGULAR_OBJECT;
+  }
+
+  /*
+   *  Compose kernel index based on inputs:
+   *  bit 2 = side, bit 1 = A is upper, bit 0 = A is transposed
+   */
+  dim_t keridx = (((side & 0x1) << 2) |
+          ((uplo & 0x1) << 1) |
+          (transa & 0x1));
+  trsmsmall_ker_ft ker_fp = ker_fps_AVX512[dt][keridx];
+
+  // An empty table slot means no kernel exists; never call through NULL.
+  if (ker_fp == NULL)
+  {
+    return BLIS_NOT_YET_IMPLEMENTED;
+  }
+
+  /*Call the kernel*/
+  return ker_fp(
+    alpha, a, b,
+    cntx, cntl);
+}
+
+#ifdef BLIS_ENABLE_OPENMP
+/*
+ * Parallelized dtrsm_small: B is split across OpenMP threads along m (right side) or n (left side); returns BLIS_SUCCESS or an error reported by a kernel call
+ */
+err_t bli_trsm_small_mt_AVX512
+     (
+       side_t   side,
+       obj_t*   alpha,
+       obj_t*   a,
+       obj_t*   b,
+       cntx_t*  cntx,
+       cntl_t*  cntl,
+       bool     is_parallel
+     )
+{
+  gint_t m = bli_obj_length(b); // number of rows of matrix b
+  gint_t n = bli_obj_width(b);  // number of columns of Matrix b
+  dim_t d_mr = 8,d_nr = 8;
+
+  num_t dt = bli_obj_dt(a);
+  switch (dt)
+  {
+    case BLIS_DOUBLE:
+    {
+      d_mr = 8, d_nr = 8;
+      break;
+    }
+    default:
+    {
+      return BLIS_NOT_YET_IMPLEMENTED;
+      break;
+    }
+  }
+
+  rntm_t rntm;
+  bli_rntm_init_from_global(&rntm);
+
+#ifdef AOCL_DYNAMIC
+  // If dynamic-threading is enabled, calculate optimum number
+  //  of threads.
+  //  rntm will be updated with optimum number of threads.
+  if (bli_obj_is_double(b))
+  {
+    bli_nthreads_optimum(a, b, b, BLIS_TRSM, &rntm);
+  }
+#endif
+
+  // Query the total number of threads from the rntm_t object.
+  dim_t n_threads = bli_rntm_num_threads(&rntm);
+
+  if (n_threads < 1) // the num_threads clause below needs a positive value
+    n_threads = 1;
+
+  err_t status = BLIS_SUCCESS;
+  _Pragma("omp parallel num_threads(n_threads)")
+  {
+    // Query the thread's id from OpenMP.
+    const dim_t tid = omp_get_thread_num();
+    const dim_t nt_real = omp_get_num_threads();
+
+    // if num threads requested and num thread available
+    // is not same then use single thread small and keep its status
+    if(nt_real != n_threads)
+    {
+      if(tid == 0)
+      {
+        status = bli_trsm_small_AVX512 // only thread 0 writes status in this branch
+            (
+              side,
+              alpha,
+              a,
+              b,
+              cntx,
+              cntl,
+              is_parallel
+            );
+      }
+    }
+    else
+    {
+      obj_t b_t;
+      dim_t start; // Each thread start Index
+      dim_t end;   // Each thread end Index
+      thrinfo_t thread;
+
+      thread.n_way = n_threads;
+      thread.work_id = tid;
+      thread.ocomm_id = tid;
+
+      // Compute start and end indexes of matrix partitioning for each thread
+      if (bli_is_right(side))
+      {
+        bli_thread_range_sub
+                (
+                  &thread,
+                  m,
+                  d_mr, // Need to decide based on type
+                  FALSE,
+                  &start,
+                  &end
+                );
+        // For each thread acquire matrix block on which they operate
+        // Data-based parallelism
+
+        bli_acquire_mpart_mdim(BLIS_FWD, BLIS_SUBPART1, start, end - start, b, &b_t);
+      }
+      else
+      {
+        bli_thread_range_sub
+                (
+                  &thread,
+                  n,
+                  d_nr,// Need to decide based on type
+                  FALSE,
+                  &start,
+                  &end
+                );
+        // For each thread acquire matrix block on which they operate
+        // Data-based parallelism
+
+        bli_acquire_mpart_ndim(BLIS_FWD, BLIS_SUBPART1, start, end - start, b, &b_t);
+      }
+
+      // Parallelism is only across m-dimension/n-dimension - therefore matrix a is common to
+      // all threads
+      err_t status_l = BLIS_SUCCESS;
+
+      status_l = bli_trsm_small_AVX512
+              (
+                side,
+                alpha,
+                a,
+                &b_t,
+                NULL,
+                NULL,
+                is_parallel
+              );
+      // To capture the error populated from any of the threads
+      if ( status_l != BLIS_SUCCESS )
+      {
+        _Pragma("omp critical")
+          status = (status != BLIS_NOT_YET_IMPLEMENTED) ? status_l : status;
+      }
+    }
+  }
+
+  return status;
+} // End of function
+#endif
+
+
+// region - GEMM DTRSM for right variants
+
+#define BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
+  /*K loop is broken into two separate loops
+    each loop computes k/2 iterations */ \
+  /*declares itr, itr2, a01_2 and b10_2 in the expanding scope; a01 and b10 are advanced*/ \
+  int itr = (k_iter / 2); /*itr count for first loop*/\
+  int itr2 = k_iter - itr; /*itr count for second loop*/\
+  double *a01_2 = a01 + itr; /*a01 for second loop*/\
+  double *b10_2 = b10 + (cs_b * itr); /*b10 for second loop*/\
+  for (; itr > 0; itr--) \
+  { \
+    zmm0 = _mm512_loadu_pd((double const *)b10); \
+    /*broadcast one element from each of the 8 p_lda-strided positions of a01*/ \
+    zmm1 = _mm512_set1_pd(*(a01 + (p_lda * 0))); \
+    zmm2 = _mm512_set1_pd(*(a01 + (p_lda * 1))); \
+    zmm3 = _mm512_set1_pd(*(a01 + (p_lda * 2))); \
+    zmm4 = _mm512_set1_pd(*(a01 + (p_lda * 3))); \
+    zmm5 = _mm512_set1_pd(*(a01 + (p_lda * 4))); \
+    zmm6 = _mm512_set1_pd(*(a01 + (p_lda * 5))); \
+    zmm7 = _mm512_set1_pd(*(a01 + (p_lda * 6))); \
+    zmm8 = _mm512_set1_pd(*(a01 + (p_lda * 7))); \
+    \
+    /*prefetch b10 4 iterations in advance*/ \
+    _mm_prefetch((b10 + 4 * cs_b), _MM_HINT_T0); \
+    zmm9  = _mm512_fmadd_pd(zmm1, zmm0, zmm9 ); \
+    zmm10 = _mm512_fmadd_pd(zmm2, zmm0, zmm10); \
+    zmm11 = _mm512_fmadd_pd(zmm3, zmm0, zmm11); \
+    zmm12 = _mm512_fmadd_pd(zmm4, zmm0, zmm12); \
+    zmm13 = _mm512_fmadd_pd(zmm5, zmm0, zmm13); \
+    zmm14 = _mm512_fmadd_pd(zmm6, zmm0, zmm14); \
+    zmm15 = _mm512_fmadd_pd(zmm7, zmm0, zmm15); \
+    zmm16 = _mm512_fmadd_pd(zmm8, zmm0, zmm16); \
+    \
+    a01 += 1; /*move to next row*/ \
+    b10 += cs_b; \
+  } \
+  for (; itr2 > 0; itr2--) \
+  { \
+    zmm23 = _mm512_loadu_pd((double const *)b10_2); \
+    \
+    zmm17 = _mm512_set1_pd(*(a01_2 + (p_lda * 0))); \
+    zmm18 = _mm512_set1_pd(*(a01_2 + (p_lda * 1))); \
+    zmm19 = _mm512_set1_pd(*(a01_2 + (p_lda * 2))); \
+    zmm20 = _mm512_set1_pd(*(a01_2 + (p_lda * 3))); \
+    zmm21 = _mm512_set1_pd(*(a01_2 + (p_lda * 4))); \
+    zmm22 = _mm512_set1_pd(*(a01_2 + (p_lda * 5))); \
+    /*zmm17 and zmm18 are reloaded with positions 6 and 7 right after their first FMA*/ \
+    _mm_prefetch((b10_2 + 4 * cs_b), _MM_HINT_T0); \
+    zmm24 = _mm512_fmadd_pd(zmm17, zmm23, zmm24); \
+    zmm17 = _mm512_set1_pd(*(a01_2 + (p_lda * 6))); \
+    zmm25 = _mm512_fmadd_pd(zmm18, zmm23, zmm25); \
+    zmm18 = _mm512_set1_pd(*(a01_2 + (p_lda * 7))); \
+    zmm26 = _mm512_fmadd_pd(zmm19, zmm23, zmm26); \
+    zmm27 = _mm512_fmadd_pd(zmm20, zmm23, zmm27); \
+    zmm28 = _mm512_fmadd_pd(zmm21, zmm23, zmm28); \
+    zmm29 = _mm512_fmadd_pd(zmm22, zmm23, zmm29); \
+    zmm30 = _mm512_fmadd_pd(zmm17, zmm23, zmm30); \
+    zmm31 = _mm512_fmadd_pd(zmm18, zmm23, zmm31); \
+    \
+    a01_2 += 1; \
+    b10_2 += cs_b; \
+  } \
+  /*first loop accumulated into zmm9..zmm16, second loop into zmm24..zmm31*/ \
+  /*prefetch 8 columns of b11*/ \
+  _mm_prefetch((b11 + (0) * cs_b), _MM_HINT_T0); \
+  /*combine the results of both loops*/ \
+  zmm9 = _mm512_add_pd(zmm9, zmm24); \
+  _mm_prefetch((b11 + (1) * cs_b), _MM_HINT_T0); \
+  zmm10 = _mm512_add_pd(zmm10, zmm25); \
+  _mm_prefetch((b11 + (2) * cs_b), _MM_HINT_T0); \
+  zmm11 = _mm512_add_pd(zmm11, zmm26); \
+  _mm_prefetch((b11 + (3) * cs_b), _MM_HINT_T0); \
+  zmm12 = _mm512_add_pd(zmm12, zmm27); \
+  _mm_prefetch((b11 + (4) * cs_b), _MM_HINT_T0); \
+  zmm13 = _mm512_add_pd(zmm13, zmm28); \
+  _mm_prefetch((b11 + (5) * cs_b), _MM_HINT_T0); \
+  zmm14 = _mm512_add_pd(zmm14, zmm29); \
+  _mm_prefetch((b11 + (6) * cs_b), _MM_HINT_T0); \
+  zmm15 = _mm512_add_pd(zmm15, zmm30); \
+  _mm_prefetch((b11 + (7) * cs_b), _MM_HINT_T0); \
+  zmm16 = _mm512_add_pd(zmm16, zmm31);
+/*
+// alternative way to prefetch b11
+//  itr2 = itr2 + itr + 8; \
+//  for(;itr2>0;itr2--) \
+//   {\
+//   zmm23 = _mm512_loadu_pd((double const *)b10_2); \
+//   \
+//   zmm17 = _mm512_set1_pd(*(a01_2 + p_lda * 0)); \
+//   zmm18 = _mm512_set1_pd(*(a01_2 + p_lda * 1)); \
+//   zmm19 = _mm512_set1_pd(*(a01_2 + p_lda * 2)); \
+//   zmm20 = _mm512_set1_pd(*(a01_2 + p_lda * 3)); \
+//   zmm21 = _mm512_set1_pd(*(a01_2 + p_lda * 4)); \
+//   zmm22 = _mm512_set1_pd(*(a01_2 + p_lda * 5)); \
+//   \
+//   _mm_prefetch((b10_2 + 4*cs_b), _MM_HINT_T0); \
+//   _mm_prefetch((b11 + (itr2-1)*cs_b), _MM_HINT_T0); \
+//   zmm24 = _mm512_fmadd_pd(zmm17, zmm23, zmm24); \
+//   zmm17 = _mm512_set1_pd(*(a01_2 + p_lda * 6)); \
+//   zmm25 = _mm512_fmadd_pd(zmm18, zmm23, zmm25); \
+//   zmm18 = _mm512_set1_pd(*(a01_2 + p_lda * 7)); \
+//   zmm26 = _mm512_fmadd_pd(zmm19, zmm23, zmm26); \
+//   zmm27 = _mm512_fmadd_pd(zmm20, zmm23, zmm27); \
+//   zmm28 = _mm512_fmadd_pd(zmm21, zmm23, zmm28); \
+//   zmm29 = _mm512_fmadd_pd(zmm22, zmm23, zmm29); \
+//   zmm30 = _mm512_fmadd_pd(zmm17, zmm23, zmm30); \
+//   zmm31 = _mm512_fmadd_pd(zmm18, zmm23, zmm31); \
+//   \
+//   a01_2 += 1;\
+//   b10_2 += cs_b; \
+//   }\
+*/
+/*
+// alternative version of main loop
+#define BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
+  int itr = k_iter - 8; \
+  for(;itr>0;itr--) \
+  {\
+  zmm0 = _mm512_loadu_pd((double const *)b10); \
+  \
+  zmm1 = _mm512_set1_pd(*(a01 + p_lda * 0)); \
+  zmm2 = _mm512_set1_pd(*(a01 + p_lda * 1)); \
+  zmm3 = _mm512_set1_pd(*(a01 + p_lda * 2)); \
+  zmm4 = _mm512_set1_pd(*(a01 + p_lda * 3)); \
+  zmm5 = _mm512_set1_pd(*(a01 + p_lda * 4)); \
+  zmm6 = _mm512_set1_pd(*(a01 + p_lda * 5)); \
+  zmm7 = _mm512_set1_pd(*(a01 + p_lda * 6)); \
+  zmm8 = _mm512_set1_pd(*(a01 + p_lda * 7)); \
+  \
+  _mm_prefetch((b10 + 4*cs_b), _MM_HINT_T0); \
+  zmm9  = _mm512_fmadd_pd(zmm1, zmm0, zmm9 ); \
+  zmm10 = _mm512_fmadd_pd(zmm2, zmm0, zmm10); \
+  zmm11 = _mm512_fmadd_pd(zmm3, zmm0, zmm11); \
+  zmm12 = _mm512_fmadd_pd(zmm4, zmm0, zmm12); \
+  zmm13 = _mm512_fmadd_pd(zmm5, zmm0, zmm13); \
+  zmm14 = _mm512_fmadd_pd(zmm6, zmm0, zmm14); \
+  zmm15 = _mm512_fmadd_pd(zmm7, zmm0, zmm15); \
+  zmm16 = _mm512_fmadd_pd(zmm8, zmm0, zmm16); \
+  \
+  a01 += 1;\
+  b10 += cs_b; \
+  }\
+  itr += 8; \
+  for(;itr>0;itr--) \
+  {\
+  zmm0 = _mm512_loadu_pd((double const *)b10); \
+  \
+  zmm1 = _mm512_set1_pd(*(a01 + p_lda * 0)); \
+  zmm2 = _mm512_set1_pd(*(a01 + p_lda * 1)); \
+  zmm3 = _mm512_set1_pd(*(a01 + p_lda * 2)); \
+  zmm4 = _mm512_set1_pd(*(a01 + p_lda * 3)); \
+  zmm5 = _mm512_set1_pd(*(a01 + p_lda * 4)); \
+  zmm6 = _mm512_set1_pd(*(a01 + p_lda * 5)); \
+  zmm7 = _mm512_set1_pd(*(a01 + p_lda * 6)); \
+  zmm8 = _mm512_set1_pd(*(a01 + p_lda * 7)); \
+  \
+  _mm_prefetch((b10 + 4*cs_b), _MM_HINT_T0); \
+  _mm_prefetch((b11 + (itr-1)*cs_b), _MM_HINT_T0); \
+  zmm9  = _mm512_fmadd_pd(zmm1, zmm0, zmm9 ); \
+  zmm10 = _mm512_fmadd_pd(zmm2, zmm0, zmm10); \
+  zmm11 = _mm512_fmadd_pd(zmm3, zmm0, zmm11); \
+  zmm12 = _mm512_fmadd_pd(zmm4, zmm0, zmm12); \
+  zmm13 = _mm512_fmadd_pd(zmm5, zmm0, zmm13); \
+  zmm14 = _mm512_fmadd_pd(zmm6, zmm0, zmm14); \
+  zmm15 = _mm512_fmadd_pd(zmm7, zmm0, zmm15); \
+  zmm16 = _mm512_fmadd_pd(zmm8, zmm0, zmm16); \
+  \
+  a01 += 1;\
+  b10 += cs_b; \
+  }\
+*/
+
+#define BLIS_DTRSM_SMALL_GEMM_8nx4m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
+  /*GEMM update using 8 a01 values and 4 b10 doubles per step. The K loop is broken into two separate
+    loops; each computes about k/2 iterations into its own accumulators, which are summed at the end*/ \
+  \
+  int itr = (k_iter / 2); /*itr count for first loop (declared in the expanding scope)*/\
+  int itr2 = k_iter - itr; /*itr count for second loop; gets the extra step when k_iter is odd*/\
+  double *a01_2 = a01 + itr; /*a01 for second loop*/\
+  double *b10_2 = b10 + (cs_b * itr); /*b10 for second loop*/\
+  for (; itr > 0; itr--) /*first half: accumulates into ymm9-ymm16, which are not reset here*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(b10)); /*4 contiguous doubles from b10*/ \
+    \
+    ymm1 = _mm256_broadcast_sd((a01 + (p_lda * 0))); /*8 a01 values, p_lda apart, one per accumulator*/ \
+    ymm2 = _mm256_broadcast_sd((a01 + (p_lda * 1))); \
+    ymm3 = _mm256_broadcast_sd((a01 + (p_lda * 2))); \
+    ymm4 = _mm256_broadcast_sd((a01 + (p_lda * 3))); \
+    ymm5 = _mm256_broadcast_sd((a01 + (p_lda * 4))); \
+    ymm6 = _mm256_broadcast_sd((a01 + (p_lda * 5))); \
+    ymm7 = _mm256_broadcast_sd((a01 + (p_lda * 6))); \
+    ymm8 = _mm256_broadcast_sd((a01 + (p_lda * 7))); \
+    \
+    _mm_prefetch((b10 + 4 * cs_b), _MM_HINT_T0); /*prefetch the b10 data used 4 steps later*/ \
+    ymm9  = _mm256_fmadd_pd(ymm1, ymm0, ymm9 ); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm11 = _mm256_fmadd_pd(ymm3, ymm0, ymm11); \
+    ymm12 = _mm256_fmadd_pd(ymm4, ymm0, ymm12); \
+    ymm13 = _mm256_fmadd_pd(ymm5, ymm0, ymm13); \
+    ymm14 = _mm256_fmadd_pd(ymm6, ymm0, ymm14); \
+    ymm15 = _mm256_fmadd_pd(ymm7, ymm0, ymm15); \
+    ymm16 = _mm256_fmadd_pd(ymm8, ymm0, ymm16); \
+    \
+    a01 += 1; /*modifies the caller's a01; it ends k_iter/2 elements ahead, not k_iter*/ \
+    b10 += cs_b; /*modifies the caller's b10 in the same way*/ \
+  } \
+  for (; itr2 > 0; itr2--) /*second half: accumulates into ymm24-ymm31, which are not reset here*/ \
+  { \
+    ymm23 = _mm256_loadu_pd((double const *)(b10_2)); \
+    \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 0))); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 1))); \
+    ymm19 = _mm256_broadcast_sd((a01_2 + (p_lda * 2))); \
+    ymm20 = _mm256_broadcast_sd((a01_2 + (p_lda * 3))); \
+    ymm21 = _mm256_broadcast_sd((a01_2 + (p_lda * 4))); \
+    ymm22 = _mm256_broadcast_sd((a01_2 + (p_lda * 5))); \
+    \
+    _mm_prefetch((b10_2 + 4 * cs_b), _MM_HINT_T0); \
+    ymm24 = _mm256_fmadd_pd(ymm17, ymm23, ymm24); \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 6))); /*ymm17 is reused once its first value is consumed*/ \
+    ymm25 = _mm256_fmadd_pd(ymm18, ymm23, ymm25); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 7))); /*ymm18 is reused once its first value is consumed*/ \
+    ymm26 = _mm256_fmadd_pd(ymm19, ymm23, ymm26); \
+    ymm27 = _mm256_fmadd_pd(ymm20, ymm23, ymm27); \
+    ymm28 = _mm256_fmadd_pd(ymm21, ymm23, ymm28); \
+    ymm29 = _mm256_fmadd_pd(ymm22, ymm23, ymm29); \
+    ymm30 = _mm256_fmadd_pd(ymm17, ymm23, ymm30); \
+    ymm31 = _mm256_fmadd_pd(ymm18, ymm23, ymm31); \
+    \
+    a01_2 += 1; \
+    b10_2 += cs_b; \
+  } \
+  /*combine the results of both loops into ymm9-ymm16; b11 + i*cs_b (i = 0..7) is prefetched meanwhile*/ \
+  _mm_prefetch((b11 + (0) * cs_b), _MM_HINT_T0); \
+  ymm9  = _mm256_add_pd(ymm9, ymm24); \
+  _mm_prefetch((b11 + (1) * cs_b), _MM_HINT_T0); \
+  ymm10 = _mm256_add_pd(ymm10, ymm25); \
+  _mm_prefetch((b11 + (2) * cs_b), _MM_HINT_T0); \
+  ymm11 = _mm256_add_pd(ymm11, ymm26); \
+  _mm_prefetch((b11 + (3) * cs_b), _MM_HINT_T0); \
+  ymm12 = _mm256_add_pd(ymm12, ymm27); \
+  _mm_prefetch((b11 + (4) * cs_b), _MM_HINT_T0); \
+  ymm13 = _mm256_add_pd(ymm13, ymm28); \
+  _mm_prefetch((b11 + (5) * cs_b), _MM_HINT_T0); \
+  ymm14 = _mm256_add_pd(ymm14, ymm29); \
+  _mm_prefetch((b11 + (6) * cs_b), _MM_HINT_T0); \
+  ymm15 = _mm256_add_pd(ymm15, ymm30); \
+  _mm_prefetch((b11 + (7) * cs_b), _MM_HINT_T0); \
+  ymm16 = _mm256_add_pd(ymm16, ymm31);
+
+
+#define BLIS_DTRSM_SMALL_GEMM_8nx3m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
+  /*GEMM update using 8 a01 values and 3 b10 doubles per step. The K loop is broken into two separate
+    loops; each computes about k/2 iterations into its own accumulators, which are summed at the end*/ \
+  \
+  int itr = (k_iter / 2); /*itr count for first loop (declared in the expanding scope)*/\
+  int itr2 = k_iter - itr; /*itr count for second loop; gets the extra step when k_iter is odd*/\
+  double *a01_2 = a01 + itr; /*a01 for second loop*/\
+  double *b10_2 = b10 + (cs_b * itr); /*b10 for second loop*/\
+  for (; itr > 0; itr--) /*first half: accumulates into ymm9-ymm16, which are not reset here*/ \
+  { \
+    xmm5 = _mm_loadu_pd((b10)); /*load b10[0] and b10[1] into xmm5*/\
+    ymm0 = _mm256_broadcast_sd((b10 + 2)); /*broadcast b10[2] into ymm0*/\
+    ymm0 = _mm256_insertf64x2(ymm0, xmm5, 0); \
+    /*ymm0 = {b10[0], b10[1], b10[2], b10[2]}; only 3 doubles are read from b10*/\
+    \
+    ymm1 = _mm256_broadcast_sd((a01 + (p_lda * 0))); /*8 a01 values, p_lda apart, one per accumulator*/ \
+    ymm2 = _mm256_broadcast_sd((a01 + (p_lda * 1))); \
+    ymm3 = _mm256_broadcast_sd((a01 + (p_lda * 2))); \
+    ymm4 = _mm256_broadcast_sd((a01 + (p_lda * 3))); \
+    ymm5 = _mm256_broadcast_sd((a01 + (p_lda * 4))); \
+    ymm6 = _mm256_broadcast_sd((a01 + (p_lda * 5))); \
+    ymm7 = _mm256_broadcast_sd((a01 + (p_lda * 6))); \
+    ymm8 = _mm256_broadcast_sd((a01 + (p_lda * 7))); \
+    \
+    _mm_prefetch((b10 + 4 * cs_b), _MM_HINT_T0); /*prefetch the b10 data used 4 steps later*/ \
+    ymm9  = _mm256_fmadd_pd(ymm1, ymm0, ymm9 ); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm11 = _mm256_fmadd_pd(ymm3, ymm0, ymm11); \
+    ymm12 = _mm256_fmadd_pd(ymm4, ymm0, ymm12); \
+    ymm13 = _mm256_fmadd_pd(ymm5, ymm0, ymm13); \
+    ymm14 = _mm256_fmadd_pd(ymm6, ymm0, ymm14); \
+    ymm15 = _mm256_fmadd_pd(ymm7, ymm0, ymm15); \
+    ymm16 = _mm256_fmadd_pd(ymm8, ymm0, ymm16); \
+    \
+    a01 += 1; /*modifies the caller's a01; it ends k_iter/2 elements ahead, not k_iter*/ \
+    b10 += cs_b; /*modifies the caller's b10 in the same way*/ \
+  } \
+  for (; itr2 > 0; itr2--) /*second half: accumulates into ymm24-ymm31, which are not reset here*/ \
+  { \
+    xmm0 = _mm_loadu_pd((b10_2)); /*b10_2[0] and b10_2[1]*/ \
+    ymm23 = _mm256_broadcast_sd((b10_2 + 2)); /*b10_2[2] in every lane*/ \
+    ymm23 = _mm256_insertf64x2(ymm23, xmm0, 0); /*ymm23 = {b10_2[0], b10_2[1], b10_2[2], b10_2[2]}*/ \
+    \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 0))); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 1))); \
+    ymm19 = _mm256_broadcast_sd((a01_2 + (p_lda * 2))); \
+    ymm20 = _mm256_broadcast_sd((a01_2 + (p_lda * 3))); \
+    ymm21 = _mm256_broadcast_sd((a01_2 + (p_lda * 4))); \
+    ymm22 = _mm256_broadcast_sd((a01_2 + (p_lda * 5))); \
+    \
+    _mm_prefetch((b10_2 + 4 * cs_b), _MM_HINT_T0); \
+    ymm24 = _mm256_fmadd_pd(ymm17, ymm23, ymm24); \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 6))); /*ymm17 is reused once its first value is consumed*/ \
+    ymm25 = _mm256_fmadd_pd(ymm18, ymm23, ymm25); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 7))); /*ymm18 is reused once its first value is consumed*/ \
+    ymm26 = _mm256_fmadd_pd(ymm19, ymm23, ymm26); \
+    ymm27 = _mm256_fmadd_pd(ymm20, ymm23, ymm27); \
+    ymm28 = _mm256_fmadd_pd(ymm21, ymm23, ymm28); \
+    ymm29 = _mm256_fmadd_pd(ymm22, ymm23, ymm29); \
+    ymm30 = _mm256_fmadd_pd(ymm17, ymm23, ymm30); \
+    ymm31 = _mm256_fmadd_pd(ymm18, ymm23, ymm31); \
+    \
+    a01_2 += 1; \
+    b10_2 += cs_b; \
+  } \
+  /*combine the results of both loops into ymm9-ymm16; b11 + i*cs_b (i = 0..7) is prefetched meanwhile*/ \
+  _mm_prefetch((b11 + (0) * cs_b), _MM_HINT_T0); \
+  ymm9  = _mm256_add_pd(ymm9, ymm24); \
+  _mm_prefetch((b11 + (1) * cs_b), _MM_HINT_T0); \
+  ymm10 = _mm256_add_pd(ymm10, ymm25); \
+  _mm_prefetch((b11 + (2) * cs_b), _MM_HINT_T0); \
+  ymm11 = _mm256_add_pd(ymm11, ymm26); \
+  _mm_prefetch((b11 + (3) * cs_b), _MM_HINT_T0); \
+  ymm12 = _mm256_add_pd(ymm12, ymm27); \
+  _mm_prefetch((b11 + (4) * cs_b), _MM_HINT_T0); \
+  ymm13 = _mm256_add_pd(ymm13, ymm28); \
+  _mm_prefetch((b11 + (5) * cs_b), _MM_HINT_T0); \
+  ymm14 = _mm256_add_pd(ymm14, ymm29); \
+  _mm_prefetch((b11 + (6) * cs_b), _MM_HINT_T0); \
+  ymm15 = _mm256_add_pd(ymm15, ymm30); \
+  _mm_prefetch((b11 + (7) * cs_b), _MM_HINT_T0); \
+  ymm16 = _mm256_add_pd(ymm16, ymm31);
+
+  #define BLIS_DTRSM_SMALL_GEMM_8nx2m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
+  /*GEMM update using 8 a01 values and 2 b10 doubles per step. The K loop is broken into two separate
+    loops; each computes about k/2 iterations into its own accumulators, which are summed at the end*/ \
+  \
+  int itr = (k_iter / 2); /*itr count for first loop (declared in the expanding scope)*/\
+  int itr2 = k_iter - itr; /*itr count for second loop; gets the extra step when k_iter is odd*/\
+  double *a01_2 = a01 + itr; /*a01 for second loop*/\
+  double *b10_2 = b10 + (cs_b * itr); /*b10 for second loop*/\
+  for (; itr > 0; itr--) /*first half: accumulates into ymm9-ymm16, which are not reset here*/ \
+  { \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); /*b10[0] and b10[1]*/ \
+    ymm0 = _mm256_insertf64x2(ymm0, xmm5, 0); /*only lanes 0-1 are replaced; lanes 2-3 keep ymm0's old contents*/ \
+    \
+    ymm1 = _mm256_broadcast_sd((a01 + (p_lda * 0))); /*8 a01 values, p_lda apart, one per accumulator*/ \
+    ymm2 = _mm256_broadcast_sd((a01 + (p_lda * 1))); \
+    ymm3 = _mm256_broadcast_sd((a01 + (p_lda * 2))); \
+    ymm4 = _mm256_broadcast_sd((a01 + (p_lda * 3))); \
+    ymm5 = _mm256_broadcast_sd((a01 + (p_lda * 4))); \
+    ymm6 = _mm256_broadcast_sd((a01 + (p_lda * 5))); \
+    ymm7 = _mm256_broadcast_sd((a01 + (p_lda * 6))); \
+    ymm8 = _mm256_broadcast_sd((a01 + (p_lda * 7))); \
+    \
+    _mm_prefetch((b10 + 4 * cs_b), _MM_HINT_T0); /*prefetch the b10 data used 4 steps later*/ \
+    ymm9  = _mm256_fmadd_pd(ymm1, ymm0, ymm9 ); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm11 = _mm256_fmadd_pd(ymm3, ymm0, ymm11); \
+    ymm12 = _mm256_fmadd_pd(ymm4, ymm0, ymm12); \
+    ymm13 = _mm256_fmadd_pd(ymm5, ymm0, ymm13); \
+    ymm14 = _mm256_fmadd_pd(ymm6, ymm0, ymm14); \
+    ymm15 = _mm256_fmadd_pd(ymm7, ymm0, ymm15); \
+    ymm16 = _mm256_fmadd_pd(ymm8, ymm0, ymm16); \
+    \
+    a01 += 1; /*modifies the caller's a01; it ends k_iter/2 elements ahead, not k_iter*/ \
+    b10 += cs_b; /*modifies the caller's b10 in the same way*/ \
+  } \
+  for (; itr2 > 0; itr2--) /*second half: accumulates into ymm24-ymm31, which are not reset here*/ \
+  { \
+    xmm0 = _mm_loadu_pd((double const *)(b10_2)); /*b10_2[0] and b10_2[1]*/ \
+    ymm23 = _mm256_insertf64x2(ymm23, xmm0, 0); /*only lanes 0-1 are replaced; lanes 2-3 keep ymm23's old contents*/ \
+    \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 0))); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 1))); \
+    ymm19 = _mm256_broadcast_sd((a01_2 + (p_lda * 2))); \
+    ymm20 = _mm256_broadcast_sd((a01_2 + (p_lda * 3))); \
+    ymm21 = _mm256_broadcast_sd((a01_2 + (p_lda * 4))); \
+    ymm22 = _mm256_broadcast_sd((a01_2 + (p_lda * 5))); \
+    \
+    _mm_prefetch((b10_2 + 4 * cs_b), _MM_HINT_T0); \
+    ymm24 = _mm256_fmadd_pd(ymm17, ymm23, ymm24); \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 6))); /*ymm17 is reused once its first value is consumed*/ \
+    ymm25 = _mm256_fmadd_pd(ymm18, ymm23, ymm25); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 7))); /*ymm18 is reused once its first value is consumed*/ \
+    ymm26 = _mm256_fmadd_pd(ymm19, ymm23, ymm26); \
+    ymm27 = _mm256_fmadd_pd(ymm20, ymm23, ymm27); \
+    ymm28 = _mm256_fmadd_pd(ymm21, ymm23, ymm28); \
+    ymm29 = _mm256_fmadd_pd(ymm22, ymm23, ymm29); \
+    ymm30 = _mm256_fmadd_pd(ymm17, ymm23, ymm30); \
+    ymm31 = _mm256_fmadd_pd(ymm18, ymm23, ymm31); \
+    \
+    a01_2 += 1; \
+    b10_2 += cs_b; \
+  } \
+  /*combine the results of both loops into ymm9-ymm16; b11 + i*cs_b (i = 0..7) is prefetched meanwhile*/ \
+  _mm_prefetch((b11 + (0) * cs_b), _MM_HINT_T0); \
+  ymm9 = _mm256_add_pd(ymm9, ymm24); \
+  _mm_prefetch((b11 + (1) * cs_b), _MM_HINT_T0); \
+  ymm10 = _mm256_add_pd(ymm10, ymm25); \
+  _mm_prefetch((b11 + (2) * cs_b), _MM_HINT_T0); \
+  ymm11 = _mm256_add_pd(ymm11, ymm26); \
+  _mm_prefetch((b11 + (3) * cs_b), _MM_HINT_T0); \
+  ymm12 = _mm256_add_pd(ymm12, ymm27); \
+  _mm_prefetch((b11 + (4) * cs_b), _MM_HINT_T0); \
+  ymm13 = _mm256_add_pd(ymm13, ymm28); \
+  _mm_prefetch((b11 + (5) * cs_b), _MM_HINT_T0); \
+  ymm14 = _mm256_add_pd(ymm14, ymm29); \
+  _mm_prefetch((b11 + (6) * cs_b), _MM_HINT_T0); \
+  ymm15 = _mm256_add_pd(ymm15, ymm30); \
+  _mm_prefetch((b11 + (7) * cs_b), _MM_HINT_T0); \
+  ymm16 = _mm256_add_pd(ymm16, ymm31);
+
+#define BLIS_DTRSM_SMALL_GEMM_8nx1m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11) \
+  /*GEMM update using 8 a01 values and 1 b10 double per step. The K loop is broken into two separate
+    loops; each computes about k/2 iterations into its own accumulators, which are summed at the end*/ \
+  \
+  int itr = (k_iter / 2); /*itr count for first loop (declared in the expanding scope)*/\
+  int itr2 = k_iter - itr; /*itr count for second loop; gets the extra step when k_iter is odd*/\
+  double *a01_2 = a01 + itr; /*a01 for second loop*/\
+  double *b10_2 = b10 + (cs_b * itr); /*b10 for second loop*/\
+  for (; itr > 0; itr--) /*first half: accumulates into ymm9-ymm16, which are not reset here*/ \
+  { \
+    ymm0 = _mm256_broadcast_sd(b10); /*single b10 value replicated to every lane*/ \
+    \
+    ymm1 = _mm256_broadcast_sd((a01 + (p_lda * 0))); /*8 a01 values, p_lda apart, one per accumulator*/ \
+    ymm2 = _mm256_broadcast_sd((a01 + (p_lda * 1))); \
+    ymm3 = _mm256_broadcast_sd((a01 + (p_lda * 2))); \
+    ymm4 = _mm256_broadcast_sd((a01 + (p_lda * 3))); \
+    ymm5 = _mm256_broadcast_sd((a01 + (p_lda * 4))); \
+    ymm6 = _mm256_broadcast_sd((a01 + (p_lda * 5))); \
+    ymm7 = _mm256_broadcast_sd((a01 + (p_lda * 6))); \
+    ymm8 = _mm256_broadcast_sd((a01 + (p_lda * 7))); \
+    \
+    _mm_prefetch((b10 + 4 * cs_b), _MM_HINT_T0); /*prefetch the b10 data used 4 steps later*/ \
+    ymm9  = _mm256_fmadd_pd(ymm1, ymm0, ymm9 ); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm11 = _mm256_fmadd_pd(ymm3, ymm0, ymm11); \
+    ymm12 = _mm256_fmadd_pd(ymm4, ymm0, ymm12); \
+    ymm13 = _mm256_fmadd_pd(ymm5, ymm0, ymm13); \
+    ymm14 = _mm256_fmadd_pd(ymm6, ymm0, ymm14); \
+    ymm15 = _mm256_fmadd_pd(ymm7, ymm0, ymm15); \
+    ymm16 = _mm256_fmadd_pd(ymm8, ymm0, ymm16); \
+    \
+    a01 += 1; /*modifies the caller's a01; it ends k_iter/2 elements ahead, not k_iter*/ \
+    b10 += cs_b; /*modifies the caller's b10 in the same way*/ \
+  } \
+  for (; itr2 > 0; itr2--) /*second half: accumulates into ymm24-ymm31, which are not reset here*/ \
+  { \
+    ymm23 = _mm256_broadcast_sd(b10_2); /*single b10_2 value replicated to every lane*/ \
+    \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 0))); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 1))); \
+    ymm19 = _mm256_broadcast_sd((a01_2 + (p_lda * 2))); \
+    ymm20 = _mm256_broadcast_sd((a01_2 + (p_lda * 3))); \
+    ymm21 = _mm256_broadcast_sd((a01_2 + (p_lda * 4))); \
+    ymm22 = _mm256_broadcast_sd((a01_2 + (p_lda * 5))); \
+    \
+    _mm_prefetch((b10_2 + 4 * cs_b), _MM_HINT_T0); \
+    ymm24 = _mm256_fmadd_pd(ymm17, ymm23, ymm24); \
+    ymm17 = _mm256_broadcast_sd((a01_2 + (p_lda * 6))); /*ymm17 is reused once its first value is consumed*/ \
+    ymm25 = _mm256_fmadd_pd(ymm18, ymm23, ymm25); \
+    ymm18 = _mm256_broadcast_sd((a01_2 + (p_lda * 7))); /*ymm18 is reused once its first value is consumed*/ \
+    ymm26 = _mm256_fmadd_pd(ymm19, ymm23, ymm26); \
+    ymm27 = _mm256_fmadd_pd(ymm20, ymm23, ymm27); \
+    ymm28 = _mm256_fmadd_pd(ymm21, ymm23, ymm28); \
+    ymm29 = _mm256_fmadd_pd(ymm22, ymm23, ymm29); \
+    ymm30 = _mm256_fmadd_pd(ymm17, ymm23, ymm30); \
+    ymm31 = _mm256_fmadd_pd(ymm18, ymm23, ymm31); \
+    \
+    a01_2 += 1; \
+    b10_2 += cs_b; \
+  } \
+  /*combine the results of both loops into ymm9-ymm16; b11 + i*cs_b (i = 0..7) is prefetched meanwhile*/ \
+  _mm_prefetch((b11 + (0) * cs_b), _MM_HINT_T0); \
+  ymm9 = _mm256_add_pd(ymm9, ymm24); \
+  _mm_prefetch((b11 + (1) * cs_b), _MM_HINT_T0); \
+  ymm10 = _mm256_add_pd(ymm10, ymm25); \
+  _mm_prefetch((b11 + (2) * cs_b), _MM_HINT_T0); \
+  ymm11 = _mm256_add_pd(ymm11, ymm26); \
+  _mm_prefetch((b11 + (3) * cs_b), _MM_HINT_T0); \
+  ymm12 = _mm256_add_pd(ymm12, ymm27); \
+  _mm_prefetch((b11 + (4) * cs_b), _MM_HINT_T0); \
+  ymm13 = _mm256_add_pd(ymm13, ymm28); \
+  _mm_prefetch((b11 + (5) * cs_b), _MM_HINT_T0); \
+  ymm14 = _mm256_add_pd(ymm14, ymm29); \
+  _mm_prefetch((b11 + (6) * cs_b), _MM_HINT_T0); \
+  ymm15 = _mm256_add_pd(ymm15, ymm30); \
+  _mm_prefetch((b11 + (7) * cs_b), _MM_HINT_T0); \
+  ymm16 = _mm256_add_pd(ymm16, ymm31);
+
+
+
+#define BLIS_DTRSM_SMALL_GEMM_4nx8m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 8x1 block of B10: 8 contiguous doubles split over ymm0 and ymm1*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10); \
+    ymm1 = _mm256_loadu_pd((double const *)(b10 + 4)); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3-ymm10 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 3)); /*A01[0][3]*/ \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm1, ymm10); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4nx4m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 4x1 block of B10*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10); /*B10[0][0] B10[1][0] B10[2][0] B10[3][0]*/ \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7/9 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 3)); /*A01[0][3]*/ \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4nx3m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 3x1 block of B10 without reading a 4th element: ymm0 = {b10[0], b10[1], b10[2], b10[2]}*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_broadcast_sd((double const *)(b10 + 2)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7/9 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 3)); /*A01[0][3]*/ \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4nx2m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 2x1 block of B10 into lanes 0-1 of ymm0; lanes 2-3 keep whatever ymm0 held before*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7/9 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 3)); /*A01[0][3]*/ \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4nx1m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 1x1 block of B10: the single value is replicated to every lane of ymm0*/ \
+    ymm0 = _mm256_broadcast_sd((double const *)b10); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7/9 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 3)); /*A01[0][3]*/ \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_3nx8m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 8x1 block of B10: 8 contiguous doubles split over ymm0 and ymm1*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10); \
+    ymm1 = _mm256_loadu_pd((double const *)(b10 + 4)); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3-ymm8 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm1, ymm8); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_3nx4m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 4x1 block of B10*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10); /*B10[0][0] B10[1][0] B10[2][0] B10[3][0]*/ \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_3nx3m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 3x1 block of B10 without reading a 4th element: ymm0 = {b10[0], b10[1], b10[2], b10[2]}*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_broadcast_sd((double const *)(b10 + 2)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_3nx2m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 2x1 block of B10 into lanes 0-1 of ymm0; lanes 2-3 keep whatever ymm0 held before*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_3nx1m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 1x1 block of B10: the single value is replicated to every lane of ymm0*/ \
+    ymm0 = _mm256_broadcast_sd((double const *)b10); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3/5/7 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 2)); /*A01[0][2]*/ \
+    ymm7 = _mm256_fmadd_pd(ymm2, ymm0, ymm7); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_2nx8m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 8x1 block of B10: 8 contiguous doubles split over ymm0 and ymm1*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10);     /*B10[0][0] B10[1][0] B10[2][0] B10[3][0]*/ \
+    ymm1 = _mm256_loadu_pd((double const *)(b10 + 4)); /*B10[4][0] B10[5][0] B10[6][0] B10[7][0]*/ \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3-ymm6 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    ymm6 = _mm256_fmadd_pd(ymm2, ymm1, ymm6); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_2nx4m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 4x1 block of B10*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10); /*B10[0][0] B10[1][0] B10[2][0] B10[3][0]*/ \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3 and ymm5 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_2nx3m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 3x1 block of B10 without reading a 4th element: ymm0 = {b10[0], b10[1], b10[2], b10[2]}*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_broadcast_sd((double const *)(b10 + 2)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3 and ymm5 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_2nx2m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 2x1 block of B10 into lanes 0-1 of ymm0; lanes 2-3 keep whatever ymm0 held before*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+   \
+    /*broadcast one A01 value per p_lda stride; ymm3 and ymm5 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+   \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+   \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_2nx1m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 1x1 block of B10: the single value is replicated to every lane of ymm0*/ \
+    ymm0 = _mm256_broadcast_sd((double const *)b10); \
+    \
+    /*broadcast one A01 value per p_lda stride; ymm3 and ymm5 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 1)); /*A01[0][1]*/ \
+    ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_1nx8m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 8x1 block of B10: 8 contiguous doubles split over ymm0 and ymm1*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10);     /*B10[0][0] B10[1][0] B10[2][0] B10[3][0]*/ \
+    ymm1 = _mm256_loadu_pd((double const *)(b10 + 4)); /*B10[4][0] B10[5][0] B10[6][0] B10[7][0]*/ \
+    \
+    /*broadcast the A01 value at a01; ymm3 and ymm4 accumulate in place and are not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    ymm4 = _mm256_fmadd_pd(ymm2, ymm1, ymm4); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_1nx4m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 4x1 block of B10*/ \
+    ymm0 = _mm256_loadu_pd((double const *)b10); /*B10[0][0] B10[1][0] B10[2][0] B10[3][0]*/ \
+    \
+    /*broadcast the A01 value at a01; ymm3 accumulates in place and is not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_1nx3m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 3x1 block of B10 without reading a 4th element: ymm0 = {b10[0], b10[1], b10[2], b10[2]}*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_broadcast_sd((double const *)(b10 + 2)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast the A01 value at a01; ymm3 accumulates in place and is not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_1nx2m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm/xmm variables come from the expanding scope*/ \
+  { \
+    /*load 2x1 block of B10 into lanes 0-1 of ymm0; lanes 2-3 keep whatever ymm0 held before*/ \
+    xmm5 = _mm_loadu_pd((double const *)(b10)); \
+    ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+    \
+    /*broadcast the A01 value at a01; ymm3 accumulates in place and is not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_1nx1m(a01, b10, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*k_iter accumulation steps; k and the ymm variables come from the expanding scope*/ \
+  { \
+    /*load 1x1 block of B10: the single value is replicated to every lane of ymm0*/ \
+    ymm0 = _mm256_broadcast_sd((double const *)b10); \
+    \
+    /*broadcast the A01 value at a01; ymm3 accumulates in place and is not reset here*/ \
+    ymm2 = _mm256_broadcast_sd((double const *)(a01 + p_lda * 0)); /*A01[0][0]*/ \
+    ymm3 = _mm256_fmadd_pd(ymm2, ymm0, ymm3); \
+    \
+    a01 += 1; /*move to next row (modifies the caller's a01)*/ \
+    b10 += cs_b; /*step the caller's b10 by cs_b elements*/ \
+  }
+
+// endregion - GEMM DTRSM for right variants
+
+// region - pre/post DTRSM macros for right variants
+
+#define BLIS_PRE_DTRSM_SMALL_8x8(AlphaVal, b11, cs_b) \
+  /*gemm_output = (B11 * alpha) - gemm_output, for zmm9-zmm16; overwrites zmm0-zmm7 and zmm31*/ \
+  zmm31 = _mm512_set1_pd(AlphaVal); /*alpha replicated to all 8 lanes*/ \
+  \
+  zmm0  = _mm512_loadu_pd((double const *)(b11 + (0*cs_b))); /*8 doubles at b11 + i*cs_b, i = 0..7 below*/ \
+  zmm9  = _mm512_fmsub_pd(zmm0, zmm31, zmm9 ); /*zmm9  = (zmm0 * zmm31) - zmm9*/\
+  \
+  zmm1  = _mm512_loadu_pd((double const *)(b11 + (1*cs_b))); \
+  zmm10 = _mm512_fmsub_pd(zmm1, zmm31, zmm10); /*zmm10 = (zmm1 * zmm31) - zmm10*/\
+  \
+  zmm2  = _mm512_loadu_pd((double const *)(b11 + (2*cs_b))); \
+  zmm11 = _mm512_fmsub_pd(zmm2, zmm31, zmm11); /*zmm11 = (zmm2 * zmm31) - zmm11*/\
+  \
+  zmm3  = _mm512_loadu_pd((double const *)(b11 + (3*cs_b))); \
+  zmm12 = _mm512_fmsub_pd(zmm3, zmm31, zmm12); /*zmm12 = (zmm3 * zmm31) - zmm12*/\
+  \
+  zmm4  = _mm512_loadu_pd((double const *)(b11 + (4*cs_b))); \
+  zmm13 = _mm512_fmsub_pd(zmm4, zmm31, zmm13); /*zmm13 = (zmm4 * zmm31) - zmm13*/\
+  \
+  zmm5  = _mm512_loadu_pd((double const *)(b11 + (5*cs_b))); \
+  zmm14 = _mm512_fmsub_pd(zmm5, zmm31, zmm14); /*zmm14 = (zmm5 * zmm31) - zmm14*/\
+  \
+  zmm6  = _mm512_loadu_pd((double const *)(b11 + (6*cs_b))); \
+  zmm15 = _mm512_fmsub_pd(zmm6, zmm31, zmm15); /*zmm15 = (zmm6 * zmm31) - zmm15*/\
+  \
+  zmm7  = _mm512_loadu_pd((double const *)(b11 + (7*cs_b))); \
+  zmm16 = _mm512_fmsub_pd(zmm7, zmm31, zmm16); /*zmm16 = (zmm7 * zmm31) - zmm16*/
+
+#define BLIS_PRE_DTRSM_SMALL_8x4(AlphaVal, b11, cs_b) \
+  /*gemm_output = (B11 * alpha) - gemm_output, for ymm9-ymm16; overwrites ymm0-ymm7 and ymm31*/ \
+  ymm31 = _mm256_broadcast_sd(&AlphaVal); /*takes AlphaVal's address, so it must be an addressable double*/ \
+  \
+  ymm0  = _mm256_loadu_pd((double const *)(b11 + 0 * cs_b)); /*4 doubles at b11 + i*cs_b, i = 0..7 below*/ \
+  ymm9  = _mm256_fmsub_pd(ymm0, ymm31, ymm9); /*ymm9 = (ymm0 * ymm31) - ymm9; same pattern below*/ \
+  \
+  ymm1  = _mm256_loadu_pd((double const *)(b11 + 1 * cs_b)); \
+  ymm10 = _mm256_fmsub_pd(ymm1, ymm31, ymm10); \
+  \
+  ymm2  = _mm256_loadu_pd((double const *)(b11 + 2 * cs_b)); \
+  ymm11 = _mm256_fmsub_pd(ymm2, ymm31, ymm11); \
+  \
+  ymm3  = _mm256_loadu_pd((double const *)(b11 + 3 * cs_b)); \
+  ymm12 = _mm256_fmsub_pd(ymm3, ymm31, ymm12); \
+  \
+  ymm4  = _mm256_loadu_pd((double const *)(b11 + 4 * cs_b)); \
+  ymm13 = _mm256_fmsub_pd(ymm4, ymm31, ymm13); \
+  \
+  ymm5  = _mm256_loadu_pd((double const *)(b11 + 5 * cs_b)); \
+  ymm14 = _mm256_fmsub_pd(ymm5, ymm31, ymm14); \
+  \
+  ymm6  = _mm256_loadu_pd((double const *)(b11 + 6 * cs_b)); \
+  ymm15 = _mm256_fmsub_pd(ymm6, ymm31, ymm15); \
+  \
+  ymm7  = _mm256_loadu_pd((double const *)(b11 + 7 * cs_b)); \
+  ymm16 = _mm256_fmsub_pd(ymm7, ymm31, ymm16);
+
+#define BLIS_PRE_DTRSM_SMALL_8x3(AlphaVal, b11, cs_b) \
+  /*gemm_output = (B11 * alpha) - gemm_output, for ymm9-ymm16; reads 3 doubles at each b11 + i*cs_b*/ \
+  ymm31 = _mm256_broadcast_sd(&AlphaVal); /*takes AlphaVal's address, so it must be an addressable double*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 0 * cs_b)); /*xmm5 = {b11[0], b11[1]}*/\
+  ymm0 = _mm256_broadcast_sd((b11 + 2 + 0 * cs_b)); /*ymm0 = {b11[2], b11[2], b11[2], b11[2]}*/\
+  ymm0 = _mm256_insertf64x2(ymm0, xmm5, 0); /*ymm0 = {b11[0], b11[1], b11[2], b11[2]}*/\
+  ymm9 = _mm256_fmsub_pd(ymm0, ymm31, ymm9); /*ymm9 = (ymm0 * ymm31) - ymm9*/\
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 1 * cs_b)); /*same load/insert/fmsub pattern at offsets 1..7 * cs_b*/ \
+  ymm1 = _mm256_broadcast_sd((b11 + 2 + 1 * cs_b)); \
+  ymm1 = _mm256_insertf64x2(ymm1, xmm5, 0); \
+  ymm10 = _mm256_fmsub_pd(ymm1, ymm31, ymm10); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 2 * cs_b)); \
+  ymm2 = _mm256_broadcast_sd((b11 + 2 + 2 * cs_b)); \
+  ymm2 = _mm256_insertf64x2(ymm2, xmm5, 0); \
+  ymm11 = _mm256_fmsub_pd(ymm2, ymm31, ymm11); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 3 * cs_b)); \
+  ymm3 = _mm256_broadcast_sd((b11 + 2 + 3 * cs_b)); \
+  ymm3 = _mm256_insertf64x2(ymm3, xmm5, 0); \
+  ymm12 = _mm256_fmsub_pd(ymm3, ymm31, ymm12); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 4 * cs_b)); \
+  ymm4 = _mm256_broadcast_sd((b11 + 2 + 4 * cs_b)); \
+  ymm4 = _mm256_insertf64x2(ymm4, xmm5, 0); \
+  ymm13 = _mm256_fmsub_pd(ymm4, ymm31, ymm13); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 5 * cs_b)); \
+  ymm5 = _mm256_broadcast_sd((b11 + 2 + 5 * cs_b)); \
+  ymm5 = _mm256_insertf64x2(ymm5, xmm5, 0); \
+  ymm14 = _mm256_fmsub_pd(ymm5, ymm31, ymm14); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 6 * cs_b)); \
+  ymm6 = _mm256_broadcast_sd((b11 + 2 + 6 * cs_b)); \
+  ymm6 = _mm256_insertf64x2(ymm6, xmm5, 0); \
+  ymm15 = _mm256_fmsub_pd(ymm6, ymm31, ymm15); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 7 * cs_b)); \
+  ymm7 = _mm256_broadcast_sd((b11 + 2 + 7 * cs_b)); \
+  ymm7 = _mm256_insertf64x2(ymm7, xmm5, 0); \
+  ymm16 = _mm256_fmsub_pd(ymm7, ymm31, ymm16);
+
+/*
+  Pre-TRSM step for a tile of 8 columns x 2 rows of B11.
+  For each column k (0..7) two doubles are read from b11 + k * cs_b into xmm5
+  and widened to ymm(k) = {r0, r1, 0, 0}. The accumulators are then
+  overwritten as ymm(9+k) = (ymm(k) * alpha) - ymm(9+k), alpha held in ymm31.
+
+  The 128-bit value is inserted into a zeroed vector rather than into the
+  previous contents of ymm(k), so lanes 2-3 never carry leftover (possibly
+  uninitialized) register contents into the FMA.
+*/
+#define BLIS_PRE_DTRSM_SMALL_8x2(AlphaVal, b11, cs_b) \
+  /*gemm_output = (B11 * alpha) - gemm_output*/ \
+  ymm31 = _mm256_broadcast_sd(&AlphaVal); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 0 * cs_b)); /*xmm5 = {b11[0], b11[1]}*/ \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); /*ymm0 = {b11[0], b11[1], 0, 0}*/ \
+  ymm9 = _mm256_fmsub_pd(ymm0, ymm31, ymm9); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 1 * cs_b)); \
+  ymm1 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm10 = _mm256_fmsub_pd(ymm1, ymm31, ymm10); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 2 * cs_b)); \
+  ymm2 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm11 = _mm256_fmsub_pd(ymm2, ymm31, ymm11); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 3 * cs_b)); \
+  ymm3 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm12 = _mm256_fmsub_pd(ymm3, ymm31, ymm12); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 4 * cs_b)); \
+  ymm4 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm13 = _mm256_fmsub_pd(ymm4, ymm31, ymm13); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 5 * cs_b)); \
+  ymm5 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm14 = _mm256_fmsub_pd(ymm5, ymm31, ymm14); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 6 * cs_b)); \
+  ymm6 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm15 = _mm256_fmsub_pd(ymm6, ymm31, ymm15); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + 7 * cs_b)); \
+  ymm7 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm16 = _mm256_fmsub_pd(ymm7, ymm31, ymm16);
+
+/*
+  Pre-TRSM step for a tile of 8 columns x 1 row of B11.
+  For each column k (0..7) a single double is read from b11 + k * cs_b into
+  the low lane of xmm5 and widened to ymm(k) = {r0, 0, 0, 0}. The accumulators
+  are then overwritten as ymm(9+k) = (ymm(k) * alpha) - ymm(9+k), alpha held
+  in ymm31.
+
+  _mm_load_sd zeroes the upper lane of xmm5 and the result is inserted into a
+  zeroed vector, so lanes 1-3 never carry leftover (possibly uninitialized)
+  register contents into the FMA.
+*/
+#define BLIS_PRE_DTRSM_SMALL_8x1(AlphaVal, b11, cs_b) \
+  /*gemm_output = (B11 * alpha) - gemm_output*/ \
+  ymm31 = _mm256_broadcast_sd(&AlphaVal); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 0 * cs_b)); /*xmm5 = {b11[0], 0}*/ \
+  ymm0 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); /*ymm0 = {b11[0], 0, 0, 0}*/ \
+  ymm9 = _mm256_fmsub_pd(ymm0, ymm31, ymm9); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 1 * cs_b)); \
+  ymm1 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm10 = _mm256_fmsub_pd(ymm1, ymm31, ymm10); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 2 * cs_b)); \
+  ymm2 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm11 = _mm256_fmsub_pd(ymm2, ymm31, ymm11); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 3 * cs_b)); \
+  ymm3 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm12 = _mm256_fmsub_pd(ymm3, ymm31, ymm12); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 4 * cs_b)); \
+  ymm4 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm13 = _mm256_fmsub_pd(ymm4, ymm31, ymm13); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 5 * cs_b)); \
+  ymm5 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm14 = _mm256_fmsub_pd(ymm5, ymm31, ymm14); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 6 * cs_b)); \
+  ymm6 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm15 = _mm256_fmsub_pd(ymm6, ymm31, ymm15); \
+  \
+  xmm5 = _mm_load_sd((double const *)(b11 + 7 * cs_b)); \
+  ymm7 = _mm256_insertf64x2(_mm256_setzero_pd(), xmm5, 0); \
+  ymm16 = _mm256_fmsub_pd(ymm7, ymm31, ymm16);
+
+/*
+  Pre-TRSM step for a tile of 4 columns x 8 rows of B11.
+  Each column is read as two 4-double halves (rows 0-3 via ymm0, rows 4-7 via
+  ymm1) and folded into its accumulator pair as acc = (B11 * alpha) - acc:
+      column   :     0          1          2          3
+      registers: ymm3,ymm4  ymm5,ymm6  ymm7,ymm8  ymm9,ymm10
+  Alpha is broadcast into ymm15.
+*/
+#define BLIS_PRE_DTRSM_SMALL_4x8(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*alpha in every lane*/ \
+  \
+  /*column 0*/ \
+  ymm0 = _mm256_loadu_pd((double const *)b11); \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); \
+  ymm1 = _mm256_loadu_pd((double const *)(b11 + 4)); \
+  ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4); \
+  \
+  /*column 1*/ \
+  ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b)); \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5); \
+  ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b + 4)); \
+  ymm6 = _mm256_fmsub_pd(ymm1, ymm15, ymm6); \
+  \
+  /*column 2*/ \
+  ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); \
+  ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7); \
+  ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2 + 4)); \
+  ymm8 = _mm256_fmsub_pd(ymm1, ymm15, ymm8); \
+  \
+  /*column 3*/ \
+  ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3)); \
+  ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9); \
+  ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3 + 4)); \
+  ymm10 = _mm256_fmsub_pd(ymm1, ymm15, ymm10);
+
+/*
+  Pre-TRSM step for 3 columns x 3 rows of B11.
+  Per column, three doubles are read (rows 0-1 through xmm5, row 2 through a
+  broadcast) to form ymm0 = {r0, r1, r2, r2}; the column accumulator
+  (ymm3, ymm5, ymm7 for columns 0, 1, 2) becomes (ymm0 * alpha) - accumulator.
+  Alpha is broadcast into ymm15.
+*/
+#define BLIS_PRE_DTRSM_SMALL_3N_3M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  /*column 0*/ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + 2)); /*row 2 in every lane*/ \
+  xmm5 = _mm_loadu_pd((double const *)(b11)); /*rows 0 and 1*/ \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); /*{r0, r1, r2, r2}*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); \
+  \
+  /*column 1*/ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b + 2)); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b)); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5); \
+  \
+  /*column 2*/ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2 + 2)); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2)); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+
+/*
+  Post-TRSM store for 3 columns x 3 rows of B11.
+  Writes lanes 0-2 of ymm3, ymm5 and ymm7 to columns 0, 1 and 2 of b11
+  (column k starts at b11 + cs_b * k). Lane 3 is never written to memory.
+  xmm5 is used as scratch and ends up holding the low half of ymm7.
+*/
+#define BLIS_POST_DTRSM_SMALL_3N_3M(b11, cs_b) \
+  /*column 0: row 2, then rows 0-1*/ \
+  _mm_storel_pd((b11 + 2), _mm256_extractf128_pd(ymm3, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm3); \
+  _mm_storeu_pd((double *)(b11), xmm5); \
+  /*column 1*/ \
+  _mm_storel_pd((b11 + cs_b + 2), _mm256_extractf128_pd(ymm5, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm5); \
+  _mm_storeu_pd((double *)(b11 + cs_b), xmm5); \
+  /*column 2*/ \
+  _mm_storel_pd((b11 + cs_b * 2 + 2), _mm256_extractf128_pd(ymm7, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm7); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 2), xmm5);
+
+/*
+  Pre-TRSM step for 3 columns x 2 rows of B11.
+  Per column, two doubles are read into xmm5 and widened to
+  ymm0 = {r0, r1, 0, 0}; the column accumulator (ymm3, ymm5, ymm7 for
+  columns 0, 1, 2) becomes (ymm0 * alpha) - accumulator. Alpha is broadcast
+  into ymm15.
+
+  The 128-bit value is inserted into a zeroed vector rather than into the
+  previous contents of ymm0, so lanes 2-3 never carry leftover register
+  contents into the FMA.
+*/
+#define BLIS_PRE_DTRSM_SMALL_3N_2M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11)); /*rows 0 and 1 of column 0*/ \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); /*{r0, r1, 0, 0}*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b)); \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2)); \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+
+/*
+  Post-TRSM store for 3 columns x 2 rows of B11.
+  Writes lanes 0-1 of ymm3, ymm5 and ymm7 to b11, b11 + cs_b and
+  b11 + cs_b * 2 respectively. Lanes 2-3 are never written to memory.
+  xmm5 is used as scratch and ends up holding the low half of ymm7.
+*/
+#define BLIS_POST_DTRSM_SMALL_3N_2M(b11, cs_b) \
+ \
+  xmm5 = _mm256_castpd256_pd128(ymm3); /*low 128 bits of column 0*/ \
+  _mm_storeu_pd((double *)(b11), xmm5); \
+  xmm5 = _mm256_castpd256_pd128(ymm5); /*low 128 bits of column 1*/ \
+  _mm_storeu_pd((double *)(b11 + cs_b), xmm5); \
+  xmm5 = _mm256_castpd256_pd128(ymm7); /*low 128 bits of column 2*/ \
+  _mm_storeu_pd((double *)(b11 + cs_b * 2), xmm5);
+
+/*
+  Pre-TRSM step for 3 columns x 1 row of B11.
+  Per column, one double is read and broadcast to every lane of ymm0; the
+  column accumulator (ymm3, ymm5, ymm7 for b11, b11 + cs_b, b11 + cs_b * 2)
+  becomes (ymm0 * alpha) - accumulator. Alpha is broadcast into ymm15.
+*/
+#define BLIS_PRE_DTRSM_SMALL_3N_1M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)b11); /*column 0, same value in all lanes*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); /*ymm3 = (ymm0 * ymm15) - ymm3*/ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b)); /*column 1*/ \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5); \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2)); /*column 2*/ \
+  ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+
+/*
+  Post-TRSM store for 3 columns x 1 row of B11.
+  Writes only lane 0 of ymm3, ymm5 and ymm7 to b11 + cs_b * 0,
+  b11 + cs_b * 1 and b11 + cs_b * 2 respectively.
+*/
+#define BLIS_POST_DTRSM_SMALL_3N_1M(b11, cs_b) \
+ \
+  _mm_storel_pd((b11 + cs_b * 0), _mm256_castpd256_pd128(ymm3)); \
+  _mm_storel_pd((b11 + cs_b * 1), _mm256_castpd256_pd128(ymm5)); \
+  _mm_storel_pd((b11 + cs_b * 2), _mm256_castpd256_pd128(ymm7));
+
+/*
+  Pre-TRSM step for 2 columns x 3 rows of B11.
+  Per column, three doubles are read (rows 0-1 through xmm5, row 2 through a
+  broadcast) to form ymm0 = {r0, r1, r2, r2}; the column accumulator
+  (ymm3 for column 0, ymm5 for column 1) becomes (ymm0 * alpha) - accumulator.
+  Alpha is broadcast into ymm15.
+*/
+#define BLIS_PRE_DTRSM_SMALL_2N_3M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  /*column 0*/ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + 2)); /*row 2 in every lane*/ \
+  xmm5 = _mm_loadu_pd((double const *)(b11)); /*rows 0 and 1*/ \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); /*{r0, r1, r2, r2}*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); \
+  \
+  /*column 1*/ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1 + 2)); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1)); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+
+/*
+  Post-TRSM store for 2 columns x 3 rows of B11.
+  Writes lanes 0-2 of ymm3 and ymm5 to columns 0 and 1 of b11
+  (column k starts at b11 + cs_b * k). Lane 3 is never written to memory.
+  xmm5 is used as scratch and ends up holding the low half of ymm5.
+*/
+#define BLIS_POST_DTRSM_SMALL_2N_3M(b11, cs_b) \
+  /*column 0: row 2, then rows 0-1*/ \
+  _mm_storel_pd((b11 + 2), _mm256_extractf128_pd(ymm3, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm3); \
+  _mm_storeu_pd((double *)(b11), xmm5); \
+  /*column 1*/ \
+  _mm_storel_pd((b11 + cs_b * 1 + 2), _mm256_extractf128_pd(ymm5, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm5); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 1), xmm5);
+
+/*
+  Pre-TRSM step for 2 columns x 2 rows of B11.
+  Per column, two doubles are read into xmm5 and widened to
+  ymm0 = {r0, r1, 0, 0}; the column accumulator (ymm3 for column 0, ymm5 for
+  column 1) becomes (ymm0 * alpha) - accumulator. Alpha is broadcast into
+  ymm15.
+
+  The 128-bit value is inserted into a zeroed vector rather than into the
+  previous contents of ymm0, so lanes 2-3 never carry leftover register
+  contents into the FMA.
+*/
+#define BLIS_PRE_DTRSM_SMALL_2N_2M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11)); /*rows 0 and 1 of column 0*/ \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); /*{r0, r1, 0, 0}*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1)); \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+
+/*
+  Post-TRSM store for 2 columns x 2 rows of B11.
+  Writes lanes 0-1 of ymm3 and ymm5 to b11 and b11 + cs_b * 1 respectively.
+  Lanes 2-3 are never written to memory.
+  xmm5 is used as scratch and ends up holding the low half of ymm5.
+*/
+#define BLIS_POST_DTRSM_SMALL_2N_2M(b11, cs_b) \
+ \
+  xmm5 = _mm256_castpd256_pd128(ymm3); /*low 128 bits of column 0*/ \
+  _mm_storeu_pd((double *)(b11), xmm5); \
+  xmm5 = _mm256_castpd256_pd128(ymm5); /*low 128 bits of column 1*/ \
+  _mm_storeu_pd((double *)(b11 + cs_b * 1), xmm5);
+
+/*
+  Pre-TRSM step for 2 columns x 1 row of B11.
+  Per column, one double is read and broadcast to every lane of ymm0; the
+  column accumulator (ymm3 for b11, ymm5 for b11 + cs_b) becomes
+  (ymm0 * alpha) - accumulator. Alpha is broadcast into ymm15.
+*/
+#define BLIS_PRE_DTRSM_SMALL_2N_1M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)b11); /*column 0, same value in all lanes*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); /*ymm3 = (ymm0 * ymm15) - ymm3*/ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b)); /*column 1*/ \
+  ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+
+/*
+  Post-TRSM store for 2 columns x 1 row of B11.
+  Writes only lane 0 of ymm3 to b11 and lane 0 of ymm5 to b11 + cs_b * 1.
+*/
+#define BLIS_POST_DTRSM_SMALL_2N_1M(b11, cs_b) \
+ \
+  _mm_storel_pd(b11, _mm256_castpd256_pd128(ymm3)); \
+  _mm_storel_pd((b11 + cs_b * 1), _mm256_castpd256_pd128(ymm5));
+
+/*
+  Pre-TRSM step for 1 column x 3 rows of B11.
+  Three doubles are read from b11 (rows 0-1 through xmm5, row 2 through a
+  broadcast) to form ymm0 = {r0, r1, r2, r2}; the accumulator ymm3 becomes
+  (ymm0 * alpha) - ymm3. Alpha is broadcast into ymm15. cs_b is not used.
+*/
+#define BLIS_PRE_DTRSM_SMALL_1N_3M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + 2)); /*row 2 in every lane*/ \
+  xmm5 = _mm_loadu_pd((double const *)(b11)); /*rows 0 and 1*/ \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); /*{r0, r1, r2, r2}*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+
+/*
+  Post-TRSM store for 1 column x 3 rows of B11.
+  Writes lanes 0-2 of ymm3 to b11[0..2]; lane 3 is never written to memory.
+  xmm5 is used as scratch and ends up holding the low half of ymm3.
+  cs_b is not used.
+*/
+#define BLIS_POST_DTRSM_SMALL_1N_3M(b11, cs_b) \
+  /*row 2, then rows 0-1*/ \
+  _mm_storel_pd((b11 + 2), _mm256_extractf128_pd(ymm3, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm3); \
+  _mm_storeu_pd((double *)(b11), xmm5);
+
+/*
+  Pre-TRSM step for 1 column x 2 rows of B11.
+  Two doubles are read from b11 into xmm5 and widened to
+  ymm0 = {r0, r1, 0, 0}; the accumulator ymm3 becomes (ymm0 * alpha) - ymm3.
+  Alpha is broadcast into ymm15. cs_b is not used.
+
+  The 128-bit value is inserted into a zeroed vector rather than into the
+  previous contents of ymm0, so lanes 2-3 never carry leftover register
+  contents into the FMA.
+*/
+#define BLIS_PRE_DTRSM_SMALL_1N_2M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11)); /*rows 0 and 1*/ \
+  ymm0 = _mm256_insertf128_pd(_mm256_setzero_pd(), xmm5, 0); /*{r0, r1, 0, 0}*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+
+/*
+  Post-TRSM store for 1 column x 2 rows of B11.
+  Writes lanes 0-1 of ymm3 to b11[0..1]; lanes 2-3 are never written to
+  memory. xmm5 is used as scratch. cs_b is not used.
+*/
+#define BLIS_POST_DTRSM_SMALL_1N_2M(b11, cs_b) \
+ \
+  xmm5 = _mm256_castpd256_pd128(ymm3); /*low 128 bits of the column*/ \
+  _mm_storeu_pd((double *)(b11), xmm5);
+
+/*
+  Pre-TRSM step for a single element of B11.
+  The double at b11 is broadcast to every lane of ymm0 and the accumulator
+  ymm3 becomes (ymm0 * alpha) - ymm3. Alpha is broadcast into ymm15.
+  cs_b is not used.
+*/
+#define BLIS_PRE_DTRSM_SMALL_1N_1M(AlphaVal, b11, cs_b) \
+  ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); /*register to hold alpha*/ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)b11); /*same value in all lanes*/ \
+  ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3); /*ymm3 = (ymm0 * ymm15) - ymm3*/
+
+/*
+  Post-TRSM store for a single element of B11.
+  Writes only lane 0 of ymm3 to b11. cs_b is not used.
+*/
+#define BLIS_POST_DTRSM_SMALL_1N_1M(b11, cs_b) \
+ \
+  _mm_storel_pd(b11, _mm256_castpd256_pd128(ymm3));
+
+// endregion - pre/post DTRSM macros for right variants
+
+// RUNN - RLTN
+BLIS_INLINE err_t bli_dtrsm_small_XAltB_XAuB_AVX512
+     (
+       obj_t*   AlphaObj,
+       obj_t*   a,
+       obj_t*   b,
+       cntx_t*  cntx,
+       cntl_t*  cntl
+     )
+{
+  dim_t m = bli_obj_length(b); //number of rows
+  dim_t n = bli_obj_width(b); // number of columns
+  dim_t d_mr = 8, d_nr = 8;
+
+  bool transa = bli_obj_has_trans(a);
+  dim_t cs_a, rs_a;
+  double ones = 1.0;
+
+  // Swap rs_a & cs_a in case of non-transpose.
+  if (transa)
+  {
+    cs_a = bli_obj_col_stride(a); // column stride of A
+    rs_a = bli_obj_row_stride(a); // row stride of A
+  }
+  else
+  {
+    cs_a = bli_obj_row_stride(a); // row stride of A
+    rs_a = bli_obj_col_stride(a); // column stride of A
+  }
+
+  dim_t cs_b = bli_obj_col_stride(b); // column stride of B
+
+  dim_t i, j, k;
+  dim_t k_iter;
+
+  bool is_unitdiag = bli_obj_has_unit_diag(a);
+
+  double AlphaVal = *(double *)AlphaObj->buffer;
+  double *restrict L = a->buffer; // pointer to matrix A
+  double *B = bli_obj_buffer_at_off(b); // pointer to matrix B
+
+  double *a01, *a11, *b10, *b11; // pointers for GEMM and TRSM blocks
+
+  gint_t required_packing_A = 1;
+  mem_t local_mem_buf_A_s = {0};
+  double *D_A_pack = NULL; // pointer to A01 pack buffer
+  double d11_pack[d_mr] __attribute__((aligned(64))); // buffer for diagonal A pack
+  rntm_t rntm;
+
+  bli_rntm_init_from_global(&rntm);
+  bli_rntm_set_num_threads_only(1, &rntm);
+  bli_membrk_rntm_set_membrk(&rntm);
+
+  siz_t buffer_size = bli_pool_block_size(
+    bli_membrk_pool(
+      bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK),
+      bli_rntm_membrk(&rntm)));
+
+  if ((d_nr * n * sizeof(double)) > buffer_size)
+    return BLIS_NOT_YET_IMPLEMENTED;
+
+  if (required_packing_A == 1)
+  {
+    // Get the buffer from the pool.
+    bli_membrk_acquire_m(&rntm,
+               buffer_size,
+               BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+               &local_mem_buf_A_s); // acquire memory for A01 pack
+    if (FALSE == bli_mem_is_alloc(&local_mem_buf_A_s))
+      return BLIS_NULL_POINTER;
+    D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+    if (NULL == D_A_pack)
+      return BLIS_NULL_POINTER;
+  }
+
+  __m512d zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11;
+  __m512d zmm12, zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, zmm21;
+  __m512d zmm22, zmm23, zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31;
+  __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11;
+  __m256d ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18, ymm19, ymm20, ymm21;
+  __m256d ymm22, ymm23, ymm24, ymm25, ymm26, ymm27, ymm28, ymm29, ymm30, ymm31;
+  __m128d xmm5, xmm0;
+
+  // gcc12 emits an uninitialized-variable warning for these registers;
+  // to avoid it, they are set to zero here.
+  ymm0 = _mm256_setzero_pd();
+  xmm5 = _mm_setzero_pd();
+  /*
+    Performs solving TRSM for 8 rows at a time from  0 to n/8 in steps of d_nr
+    a. Load and pack A (a01 block), the size of packing 8x8 to 8x(n-8)
+        First there will be no GEMM and no packing of a01 because it is only TRSM
+    b. Using packed a01 block and b10 block perform GEMM operation
+    c. Use GEMM outputs, perform TRSM operation using a11, b11 and update B
+    d. Repeat b for m cols of B in steps of d_mr
+  */
+  for (j = 0; (j + d_nr - 1) < n; j += d_nr) //loop along 'N' direction
+  {
+    a01 = L + j * rs_a;            //pointer to block of A to be used in GEMM
+    a11 = L + j * cs_a + j * rs_a; //pointer to block of A to be used for TRSM
+
+    dim_t p_lda = j;               //packed leading dimension
+
+    // perform copy of A to packed buffer D_A_pack
+    if (transa)
+    {
+      /*
+      Pack current A block (a01) into packed buffer memory D_A_pack
+        a. This a10 block is used in GEMM portion only and this
+            a01 block size will be increasing by d_nr for every next iteration
+            until it reaches 8x(n-8) which is the maximum GEMM alone block size in A
+        b. This packed buffer is reused to calculate all m cols of B matrix
+      */
+      bli_dtrsm_small_pack_avx512('R', j, 1, a01, cs_a, D_A_pack, p_lda, d_nr);
+      /*
+        Pack 8 diagonal elements of A block into an array
+        a. This helps to utilize cache line efficiently in TRSM operation
+        b. store ones when input is unit diagonal
+      */
+      dtrsm_small_pack_diag_element_avx512(is_unitdiag, a11, cs_a, d11_pack, d_nr);
+    }
+    else
+    {
+      bli_dtrsm_small_pack_avx512('R', j, 0, a01, rs_a, D_A_pack, p_lda, d_nr);
+      dtrsm_small_pack_diag_element_avx512(is_unitdiag, a11, rs_a, d11_pack, d_nr);
+    }
+
+    /*
+      a. Perform GEMM using a01, b10.
+      b. Perform TRSM on a11, b11
+      c. This GEMM+TRSM loop operates with 8x8 block size
+          along m dimension for every d_mr columns of B10 where
+          packed A buffer is reused in computing all m cols of B.
+      d. Same approach is used in remaining fringe cases.
+    */
+    for (i = 0; (i + d_mr - 1) < m; i += d_mr) //loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a;   //pointer to block of A to be used for TRSM
+      b10 = B + i;                     //pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;          //pointer to block of B to be used for TRSM
+
+      k_iter = j;
+      BLIS_SET_ZMM_REG_ZEROS
+      /*
+        Perform GEMM between a01 and b10 blocks
+        For first iteration there will be no GEMM operation
+        where k_iter are zero
+      */
+      BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11);
+      /*
+        Load b11 of size 8x8 and multiply with alpha
+        Add the GEMM output to b11
+        and perform TRSM operation.
+      */
+      BLIS_PRE_DTRSM_SMALL_8x8(AlphaVal, b11, cs_b)
+
+
+
+      /*
+        Compute 8x8 TRSM block by using GEMM block output in register
+        a. The 8x8 input (gemm outputs) are stored in combinations of zmm registers
+            row      :   0     1    2      3     4     5     6     7
+            register : zmm9  zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16
+        b. Towards the end TRSM output will be stored back into b11
+      */
+
+      /*
+      *                                        to n-1
+      *  B11[Nth column] = GEMM(Nth column) -     Σ  {  B11[i] * A11[i][N]  } /A11[N][N]
+      *                                       from i=0
+      *
+      *  For example 5th column (B11[5]) -= ((B11[0] * A11[0][5]) + (B11[1] * A11[1][5]) +
+      *                                      (B11[2] * A11[2][5]) + (B11[3] * A11[3][5]) +
+      *                                      (B11[4] * A11[4][5])) / A11[5][5]
+      *                          zmm14   -= ((zmm9   * A11[0][5]) + (zmm10  * A11[1][5]) +
+      *                                      (zmm11  * A11[2][5]) + (zmm12  * A11[3][5]) +
+      *                                      (zmm13  * A11[4][5])) / A11[5][5]
+      */
+
+
+      // extract a00
+      zmm0 = _mm512_set1_pd(*(d11_pack + 0));
+      zmm9 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm9, zmm0); // zmm9 /= zmm0
+      _mm512_storeu_pd((double *)(b11 + (0 * cs_b)), zmm9);
+
+      // extract a11
+      zmm1 = _mm512_set1_pd(*(d11_pack + 1));
+      zmm2 = _mm512_set1_pd(*(a11 + (1 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm2, zmm9, zmm10);
+      zmm3 = _mm512_set1_pd(*(a11 + (2 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm3, zmm9, zmm11);
+      zmm4 = _mm512_set1_pd(*(a11 + (3 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm4, zmm9, zmm12);
+      zmm5 = _mm512_set1_pd(*(a11 + (4 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm5, zmm9, zmm13);
+      zmm6 = _mm512_set1_pd(*(a11 + (5 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm6, zmm9, zmm14); // zmm14 -= A11[0][5] * zmm9
+      zmm7 = _mm512_set1_pd(*(a11 + (6 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm7, zmm9, zmm15);
+      zmm8 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm8, zmm9, zmm16);
+      zmm10 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm10, zmm1);
+      a11 += cs_a;
+      _mm512_storeu_pd((double *)(b11 + (1 * cs_b)), zmm10);
+
+      // extract a22
+      zmm0 = _mm512_set1_pd(*(d11_pack + 2));
+      zmm2 = _mm512_set1_pd(*(a11 + (2 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm2, zmm10, zmm11);
+      zmm3 = _mm512_set1_pd(*(a11 + (3 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm3, zmm10, zmm12);
+      zmm4 = _mm512_set1_pd(*(a11 + (4 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm4, zmm10, zmm13);
+      zmm5 = _mm512_set1_pd(*(a11 + (5 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm5, zmm10, zmm14); // zmm14 -= A11[1][5] * zmm10
+      zmm6 = _mm512_set1_pd(*(a11 + (6 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm6, zmm10, zmm15);
+      zmm7 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm7, zmm10, zmm16);
+      zmm11 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm11, zmm0);
+      a11 += cs_a;
+      _mm512_storeu_pd((double *)(b11 + (2 * cs_b)), zmm11);
+
+      // extract a33
+      zmm1 = _mm512_set1_pd(*(d11_pack + 3));
+      zmm2 = _mm512_set1_pd(*(a11 + (3 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm2, zmm11, zmm12);
+      zmm3 = _mm512_set1_pd(*(a11 + (4 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm3, zmm11, zmm13);
+      zmm4 = _mm512_set1_pd(*(a11 + (5 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm4, zmm11, zmm14); // zmm14 -= A11[2][5] * zmm11
+      zmm5 = _mm512_set1_pd(*(a11 + (6 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm5, zmm11, zmm15);
+      zmm6 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm6, zmm11, zmm16);
+      zmm12 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm12, zmm1);
+      a11 += cs_a;
+      _mm512_storeu_pd((double *)(b11 + (3 * cs_b)), zmm12);
+
+      // extract a44
+      zmm0 = _mm512_set1_pd(*(d11_pack + 4));
+      zmm2 = _mm512_set1_pd(*(a11 + (4 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm2, zmm12, zmm13);
+      zmm3 = _mm512_set1_pd(*(a11 + (5 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm3, zmm12, zmm14); // zmm14 -= A11[3][5] * zmm12
+      zmm4 = _mm512_set1_pd(*(a11 + (6 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm4, zmm12, zmm15);
+      zmm5 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm5, zmm12, zmm16);
+      zmm13 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm13, zmm0);
+      a11 += cs_a;
+      _mm512_storeu_pd((double *)(b11 + (4 * cs_b)), zmm13);
+
+      // extract a55
+      zmm1 = _mm512_set1_pd(*(d11_pack + 5));
+      zmm2 = _mm512_set1_pd(*(a11 + (5 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm2, zmm13, zmm14); // zmm14 -= A11[4][5] * zmm13
+      zmm3 = _mm512_set1_pd(*(a11 + (6 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm3, zmm13, zmm15);
+      zmm4 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm4, zmm13, zmm16);
+      zmm14 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm14, zmm1); // zmm14 /= A11[5][5]
+      a11 += cs_a;
+      _mm512_storeu_pd((double *)(b11 + (5 * cs_b)), zmm14);
+
+      // extract a66
+      zmm0 = _mm512_set1_pd(*(d11_pack + 6));
+      zmm2 = _mm512_set1_pd(*(a11 + (6 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm2, zmm14, zmm15);
+      zmm3 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm3, zmm14, zmm16);
+      zmm15 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm15, zmm0);
+      a11 += cs_a;
+      _mm512_storeu_pd((double *)(b11 + (6 * cs_b)), zmm15);
+
+      // extract a77
+      zmm1 = _mm512_set1_pd(*(d11_pack + 7));
+      zmm2 = _mm512_set1_pd(*(a11 + (7 * rs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm2, zmm15, zmm16);
+      zmm16 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm16, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (7 * cs_b)), zmm16);
+    }
+    dim_t m_remainder = m - i;
+    if(m_remainder)
+    {
+      if (m_remainder >= 4) //loop along 'M' direction
+      {
+        a01 = D_A_pack;
+        a11 = L + j * cs_a + j * rs_a;   //pointer to block of A to be used for TRSM
+        b10 = B + i;                     //pointer to block of B to be used in GEMM
+        b11 = B + i + j * cs_b;          //pointer to block of B to be used for TRSM
+
+        k_iter = j;
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        /*
+          Perform GEMM between a01 and b10 blocks
+          For first iteration there will be no GEMM operation
+          where k_iter are zero
+        */
+        BLIS_DTRSM_SMALL_GEMM_8nx4m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+        /*
+          Load b11 of size 8x4 and multiply with alpha
+          Add the GEMM output to b11
+          and perform TRSM operation.
+        */
+        BLIS_PRE_DTRSM_SMALL_8x4(AlphaVal, b11, cs_b)
+
+        /*
+          Compute 8x4 TRSM block by using GEMM block output in register
+          a. The 8x4 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                        to n-1
+        *  B11[Nth column] = GEMM(Nth column) -     Σ  {  B11[i] * A11[i][N]  } /A11[N][N]
+        *                                       from i=0
+        *
+        *  For example 5th column (B11[5]) -= ((B11[0] * A11[0][5]) + (B11[1] * A11[1][5]) +
+        *                                      (B11[2] * A11[2][5]) + (B11[3] * A11[3][5]) +
+        *                                      (B11[4] * A11[4][5])) / A11[5][5]
+        */
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm256_storeu_pd((double *)(b11 + (0 * cs_b)),  ymm9);
+
+        // extract a11
+        ymm1 = _mm256_broadcast_sd((d11_pack + 1));
+        ymm2 = _mm256_broadcast_sd((a11 + (1 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm2, ymm9, ymm10);
+        ymm3 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm3, ymm9, ymm11);
+        ymm4 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm4, ymm9, ymm12);
+        ymm5 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm5, ymm9, ymm13);
+        ymm6 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm6, ymm9, ymm14);
+        ymm7 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm7, ymm9, ymm15);
+        ymm8 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm8, ymm9, ymm16);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        a11 += cs_a;
+        _mm256_storeu_pd((double *)(b11 + (1 * cs_b)), ymm10);
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((d11_pack + 2));
+        ymm2 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm2, ymm10, ymm11);
+        ymm3 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm3, ymm10, ymm12);
+        ymm4 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm4, ymm10, ymm13);
+        ymm5 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm5, ymm10, ymm14);
+        ymm6 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm6, ymm10, ymm15);
+        ymm7 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm7, ymm10, ymm16);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm0);
+        a11 += cs_a;
+        _mm256_storeu_pd((double *)(b11 + (2 * cs_b)), ymm11);
+
+        // extract a33
+        ymm1 = _mm256_broadcast_sd((d11_pack + 3));
+        ymm2 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm2, ymm11, ymm12);
+        ymm3 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm3, ymm11, ymm13);
+        ymm4 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm4, ymm11, ymm14);
+        ymm5 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm5, ymm11, ymm15);
+        ymm6 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm6, ymm11, ymm16);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        a11 += cs_a;
+        _mm256_storeu_pd((double *)(b11 + (3 * cs_b)), ymm12);
+
+        // extract a44
+        ymm0 = _mm256_broadcast_sd((d11_pack + 4));
+        ymm2 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13);
+        ymm3 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14);
+        ymm4 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15);
+        ymm5 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm5, ymm12, ymm16);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm0);
+        a11 += cs_a;
+        _mm256_storeu_pd((double *)(b11 + (4 * cs_b)), ymm13);
+
+        // extract a55
+        ymm1 = _mm256_broadcast_sd((d11_pack + 5));
+        ymm2 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm2, ymm13, ymm14);
+        ymm3 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm3, ymm13, ymm15);
+        ymm4 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm4, ymm13, ymm16);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        a11 += cs_a;
+        _mm256_storeu_pd((double *)(b11 + (5 * cs_b)), ymm14);
+
+        // extract a66
+        ymm0 = _mm256_broadcast_sd((d11_pack + 6));
+        ymm2 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm2, ymm14, ymm15);
+        ymm3 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm3, ymm14, ymm16);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm0);
+        a11 += cs_a;
+        _mm256_storeu_pd((double *)(b11 + (6 * cs_b)), ymm15);
+
+        // extract a77
+        ymm1 = _mm256_broadcast_sd((d11_pack + 7));
+        ymm2 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm2, ymm15, ymm16);
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (7 * cs_b)), ymm16);
+        m_remainder -= 4;
+        i += 4;
+      }
+      if (m_remainder == 3) //loop along 'M' direction
+      {
+        a01 = D_A_pack;
+        a11 = L + j * cs_a + j * rs_a;   //pointer to block of A to be used for TRSM
+        b10 = B + i;                     //pointer to block of B to be used in GEMM
+        b11 = B + i + j * cs_b;          //pointer to block of B to be used for TRSM
+
+        k_iter = j;
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        /*
+          Perform GEMM between a01 and b10 blocks
+          For first iteration there will be no GEMM operation
+          where k_iter are zero
+        */
+        BLIS_DTRSM_SMALL_GEMM_8nx3m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+        /*
+          Load b11 of size 8x3 and multiply with alpha
+          Add the GEMM output to b11
+          and perform TRSM operation.
+        */
+        BLIS_PRE_DTRSM_SMALL_8x3(AlphaVal, b11, cs_b)
+        /*
+          Compute 8x3 TRSM block by using GEMM block output in register
+          a. The 8x3 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                          to N-1
+        *  B11[Nth column] = ( GEMM(Nth column) -     Σ  {  B11[i] * A11[i][N]  } ) / A11[N][N]
+        *                                         from i=0
+        *
+        *  For example, the 5th column:
+        *    B11[5] = ( GEMM(5) - ((B11[0] * A11[0][5]) + (B11[1] * A11[1][5]) +
+        *                          (B11[2] * A11[2][5]) + (B11[3] * A11[3][5]) +
+        *                          (B11[4] * A11[4][5])) ) / A11[5][5]
+        */
+
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm_storeu_pd((double *)(b11 + (0 * cs_b)), _mm256_castpd256_pd128(ymm9));
+        _mm_storel_pd((double *)(b11 + (0 * cs_b) + 2), _mm256_extractf64x2_pd(ymm9, 1));
+
+        // extract a11
+        ymm1 = _mm256_broadcast_sd((d11_pack + 1));
+        ymm2 = _mm256_broadcast_sd((a11 + (1 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm2, ymm9, ymm10);
+        ymm3 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm3, ymm9, ymm11);
+        ymm4 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm4, ymm9, ymm12);
+        ymm5 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm5, ymm9, ymm13);
+        ymm6 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm6, ymm9, ymm14);
+        ymm7 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm7, ymm9, ymm15);
+        ymm8 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm8, ymm9, ymm16);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (1 * cs_b)), _mm256_castpd256_pd128(ymm10));
+        _mm_storel_pd((double *)(b11 + (1 * cs_b) + 2), _mm256_extractf64x2_pd(ymm10, 1));
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((d11_pack + 2));
+        ymm2 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm2, ymm10, ymm11);
+        ymm3 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm3, ymm10, ymm12);
+        ymm4 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm4, ymm10, ymm13);
+        ymm5 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm5, ymm10, ymm14);
+        ymm6 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm6, ymm10, ymm15);
+        ymm7 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm7, ymm10, ymm16);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm0);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (2 * cs_b)), _mm256_castpd256_pd128(ymm11));
+        _mm_storel_pd((double *)(b11 + (2 * cs_b) + 2), _mm256_extractf64x2_pd(ymm11, 1));
+
+        // extract a33
+        ymm1 = _mm256_broadcast_sd((d11_pack + 3));
+        ymm2 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm2, ymm11, ymm12);
+        ymm3 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm3, ymm11, ymm13);
+        ymm4 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm4, ymm11, ymm14);
+        ymm5 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm5, ymm11, ymm15);
+        ymm6 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm6, ymm11, ymm16);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (3 * cs_b)), _mm256_castpd256_pd128(ymm12));
+        _mm_storel_pd((double *)(b11 + (3 * cs_b) + 2), _mm256_extractf64x2_pd(ymm12, 1));
+
+        // extract a44
+        ymm0 = _mm256_broadcast_sd((d11_pack + 4));
+        ymm2 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13);
+        ymm3 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14);
+        ymm4 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15);
+        ymm5 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm5, ymm12, ymm16);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm0);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (4 * cs_b)), _mm256_castpd256_pd128(ymm13));
+        _mm_storel_pd((double *)(b11 + (4 * cs_b) + 2), _mm256_extractf64x2_pd(ymm13, 1));
+
+        // extract a55
+        ymm1 = _mm256_broadcast_sd((d11_pack + 5));
+        ymm2 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm2, ymm13, ymm14);
+        ymm3 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm3, ymm13, ymm15);
+        ymm4 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm4, ymm13, ymm16);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (5 * cs_b)), _mm256_castpd256_pd128(ymm14));
+        _mm_storel_pd((double *)(b11 + (5 * cs_b) + 2), _mm256_extractf64x2_pd(ymm14, 1));
+
+        // extract a66
+        ymm0 = _mm256_broadcast_sd((d11_pack + 6));
+        ymm2 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm2, ymm14, ymm15);
+        ymm3 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm3, ymm14, ymm16);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm0);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (6 * cs_b)), _mm256_castpd256_pd128(ymm15));
+        _mm_storel_pd((double *)(b11 + (6 * cs_b) + 2), _mm256_extractf64x2_pd(ymm15, 1));
+
+        // extract a77
+        ymm1 = _mm256_broadcast_sd((d11_pack + 7));
+        ymm2 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm2, ymm15, ymm16);
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm1);
+        _mm_storeu_pd((double *)(b11 + (7 * cs_b)), _mm256_castpd256_pd128(ymm16));
+        _mm_storel_pd((double *)(b11 + (7 * cs_b) + 2), _mm256_extractf64x2_pd(ymm16, 1));
+        m_remainder -= 3;
+        i += 3;
+      }
+      else if (m_remainder == 2)
+      {
+        a01 = D_A_pack;
+        a11 = L + j * cs_a + j * rs_a;
+        b10 = B + i;
+        b11 = B + i + j * cs_b;
+
+        k_iter = j;
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        BLIS_DTRSM_SMALL_GEMM_8nx2m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+        BLIS_PRE_DTRSM_SMALL_8x2(AlphaVal, b11, cs_b)
+        /*
+          Compute 8x2 TRSM block by using GEMM block output in register
+          a. The 8x2 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                          to N-1
+        *  B11[Nth column] = ( GEMM(Nth column) -     Σ  {  B11[i] * A11[i][N]  } ) / A11[N][N]
+        *                                         from i=0
+        *
+        *  For example, the 5th column:
+        *    B11[5] = ( GEMM(5) - ((B11[0] * A11[0][5]) + (B11[1] * A11[1][5]) +
+        *                          (B11[2] * A11[2][5]) + (B11[3] * A11[3][5]) +
+        *                          (B11[4] * A11[4][5])) ) / A11[5][5]
+        */
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm_storeu_pd((double *)(b11 + (0 * cs_b)), _mm256_castpd256_pd128(ymm9));
+
+        // extract a11
+        ymm1 = _mm256_broadcast_sd((d11_pack + 1));
+        ymm2 = _mm256_broadcast_sd((a11 + (1 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm2, ymm9, ymm10);
+        ymm3 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm3, ymm9, ymm11);
+        ymm4 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm4, ymm9, ymm12);
+        ymm5 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm5, ymm9, ymm13);
+        ymm6 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm6, ymm9, ymm14);
+        ymm7 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm7, ymm9, ymm15);
+        ymm8 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm8, ymm9, ymm16);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (1 * cs_b)), _mm256_castpd256_pd128(ymm10));
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((d11_pack + 2));
+        ymm2 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm2, ymm10, ymm11);
+        ymm3 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm3, ymm10, ymm12);
+        ymm4 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm4, ymm10, ymm13);
+        ymm5 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm5, ymm10, ymm14);
+        ymm6 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm6, ymm10, ymm15);
+        ymm7 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm7, ymm10, ymm16);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm0);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (2 * cs_b)), _mm256_castpd256_pd128(ymm11));
+
+        // extract a33
+        ymm1 = _mm256_broadcast_sd((d11_pack + 3));
+        ymm2 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm2, ymm11, ymm12);
+        ymm3 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm3, ymm11, ymm13);
+        ymm4 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm4, ymm11, ymm14);
+        ymm5 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm5, ymm11, ymm15);
+        ymm6 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm6, ymm11, ymm16);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (3 * cs_b)), _mm256_castpd256_pd128(ymm12));
+
+        // extract a44
+        ymm0 = _mm256_broadcast_sd((d11_pack + 4));
+        ymm2 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13);
+        ymm3 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14);
+        ymm4 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15);
+        ymm5 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm5, ymm12, ymm16);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm0);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (4 * cs_b)), _mm256_castpd256_pd128(ymm13));
+
+        // extract a55
+        ymm1 = _mm256_broadcast_sd((d11_pack + 5));
+        ymm2 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm2, ymm13, ymm14);
+        ymm3 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm3, ymm13, ymm15);
+        ymm4 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm4, ymm13, ymm16);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (5 * cs_b)), _mm256_castpd256_pd128(ymm14));
+
+        // extract a66
+        ymm0 = _mm256_broadcast_sd((d11_pack + 6));
+        ymm2 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm2, ymm14, ymm15);
+        ymm3 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm3, ymm14, ymm16);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm0);
+        a11 += cs_a;
+        _mm_storeu_pd((double *)(b11 + (6 * cs_b)), _mm256_castpd256_pd128(ymm15));
+
+        // extract a77
+        ymm1 = _mm256_broadcast_sd((d11_pack + 7));
+        ymm2 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+        ymm16 = _mm256_fnmadd_pd(ymm2, ymm15, ymm16);
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm1);
+        _mm_storeu_pd((double *)(b11 + (7 * cs_b)), _mm256_castpd256_pd128(ymm16));
+        m_remainder -= 2;
+        i += 2;
+      }
+      else if (m_remainder == 1)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a;
+      b10 = B + i;
+      b11 = B + i + j * cs_b;
+
+      k_iter = j;
+      BLIS_SET_YMM_REG_ZEROS_AVX512
+      BLIS_DTRSM_SMALL_GEMM_8nx1m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+      BLIS_PRE_DTRSM_SMALL_8x1(AlphaVal, b11, cs_b)
+      /*
+        Compute 8x1 TRSM block by using GEMM block output in register
+        a. The 8x1 input (gemm outputs) are stored in combinations of ymm registers
+            row      :   0     1    2      3     4     5     6     7
+            register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+        b. Towards the end TRSM output will be stored back into b11
+      */
+
+      /*
+      *                                          to N-1
+      *  B11[Nth column] = ( GEMM(Nth column) -     Σ  {  B11[i] * A11[i][N]  } ) / A11[N][N]
+      *                                         from i=0
+      *
+      *  For example, the 5th column:
+      *    B11[5] = ( GEMM(5) - ((B11[0] * A11[0][5]) + (B11[1] * A11[1][5]) +
+      *                          (B11[2] * A11[2][5]) + (B11[3] * A11[3][5]) +
+      *                          (B11[4] * A11[4][5])) ) / A11[5][5]
+      */
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+      _mm_storel_pd((double *)(b11 + (0 * cs_b)), _mm256_castpd256_pd128(ymm9));
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((d11_pack + 1));
+      ymm2 = _mm256_broadcast_sd((a11 + (1 * rs_a)));
+      ymm10 = _mm256_fnmadd_pd(ymm2, ymm9, ymm10);
+      ymm3 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+      ymm11 = _mm256_fnmadd_pd(ymm3, ymm9, ymm11);
+      ymm4 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm9, ymm12);
+      ymm5 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+      ymm13 = _mm256_fnmadd_pd(ymm5, ymm9, ymm13);
+      ymm6 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+      ymm14 = _mm256_fnmadd_pd(ymm6, ymm9, ymm14);
+      ymm7 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+      ymm15 = _mm256_fnmadd_pd(ymm7, ymm9, ymm15);
+      ymm8 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm8, ymm9, ymm16);
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+      a11 += cs_a;
+      _mm_storel_pd((double *)(b11 + (1 * cs_b)), _mm256_castpd256_pd128(ymm10));
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((d11_pack + 2));
+      ymm2 = _mm256_broadcast_sd((a11 + (2 * rs_a)));
+      ymm11 = _mm256_fnmadd_pd(ymm2, ymm10, ymm11);
+      ymm3 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+      ymm12 = _mm256_fnmadd_pd(ymm3, ymm10, ymm12);
+      ymm4 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+      ymm13 = _mm256_fnmadd_pd(ymm4, ymm10, ymm13);
+      ymm5 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+      ymm14 = _mm256_fnmadd_pd(ymm5, ymm10, ymm14);
+      ymm6 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+      ymm15 = _mm256_fnmadd_pd(ymm6, ymm10, ymm15);
+      ymm7 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm7, ymm10, ymm16);
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm0);
+      a11 += cs_a;
+      _mm_storel_pd((double *)(b11 + (2 * cs_b)), _mm256_castpd256_pd128(ymm11));
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((d11_pack + 3));
+      ymm2 = _mm256_broadcast_sd((a11 + (3 * rs_a)));
+      ymm12 = _mm256_fnmadd_pd(ymm2, ymm11, ymm12);
+      ymm3 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+      ymm13 = _mm256_fnmadd_pd(ymm3, ymm11, ymm13);
+      ymm4 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+      ymm14 = _mm256_fnmadd_pd(ymm4, ymm11, ymm14);
+      ymm5 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+      ymm15 = _mm256_fnmadd_pd(ymm5, ymm11, ymm15);
+      ymm6 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm6, ymm11, ymm16);
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+      a11 += cs_a;
+      _mm_storel_pd((double *)(b11 + (3 * cs_b)), _mm256_castpd256_pd128(ymm12));
+
+      // extract a44
+      ymm0 = _mm256_broadcast_sd((d11_pack + 4));
+      ymm2 = _mm256_broadcast_sd((a11 + (4 * rs_a)));
+      ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13);
+      ymm3 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+      ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14);
+      ymm4 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+      ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15);
+      ymm5 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm5, ymm12, ymm16);
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm0);
+      a11 += cs_a;
+      _mm_storel_pd((double *)(b11 + (4 * cs_b)), _mm256_castpd256_pd128(ymm13));
+
+      // extract a55
+      ymm1 = _mm256_broadcast_sd((d11_pack + 5));
+      ymm2 = _mm256_broadcast_sd((a11 + (5 * rs_a)));
+      ymm14 = _mm256_fnmadd_pd(ymm2, ymm13, ymm14);
+      ymm3 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+      ymm15 = _mm256_fnmadd_pd(ymm3, ymm13, ymm15);
+      ymm4 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm4, ymm13, ymm16);
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+      a11 += cs_a;
+      _mm_storel_pd((double *)(b11 + (5 * cs_b)), _mm256_castpd256_pd128(ymm14));
+
+      // extract a66
+      ymm0 = _mm256_broadcast_sd((d11_pack + 6));
+      ymm2 = _mm256_broadcast_sd((a11 + (6 * rs_a)));
+      ymm15 = _mm256_fnmadd_pd(ymm2, ymm14, ymm15);
+      ymm3 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm3, ymm14, ymm16);
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm0);
+      a11 += cs_a;
+      _mm_storel_pd((double *)(b11 + (6 * cs_b)), _mm256_castpd256_pd128(ymm15));
+
+      // extract a77
+      ymm1 = _mm256_broadcast_sd((d11_pack + 7));
+      ymm2 = _mm256_broadcast_sd((a11 + (7 * rs_a)));
+      ymm16 = _mm256_fnmadd_pd(ymm2, ymm15, ymm16);
+      ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm1);
+      _mm_storel_pd((double *)(b11 + (7 * cs_b)), _mm256_castpd256_pd128(ymm16));
+      m_remainder -= 1;
+      i += 1;
+    }
+    }
+  }
+
+  dim_t n_remainder = n - j;
+
+  if (n_remainder >= 4)
+  {
+    a01 = L + j * rs_a;      // pointer to block of A to be used in GEMM
+    a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = j; // packed leading dimension
+    // perform copy of A to packed buffer D_A_pack
+
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1)
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + (p_lda * 0)));
+        bli_dcopys(*(a01 + rs_a * 1), *(ptr_a10_dup + (p_lda * 1)));
+        bli_dcopys(*(a01 + rs_a * 2), *(ptr_a10_dup + (p_lda * 2)));
+        bli_dcopys(*(a01 + rs_a * 3), *(ptr_a10_dup + (p_lda * 3)));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = p_lda / 4;
+
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 1) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 2) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 3) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 3) + (x * 4)), ymm15);
+      }
+
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 1) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 2) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 3) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 3) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2) + 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3) + 3));
+      }
+      else
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 2) + 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 3) + 3));
+      }
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (i = 0; (i + d_mr - 1) < m; i += d_mr) // loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_4nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_4x8(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+      ymm6 = _mm256_fnmadd_pd(ymm1, ymm4, ymm6);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+      ymm8 = _mm256_fnmadd_pd(ymm1, ymm4, ymm8);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm3, ymm9);
+      ymm10 = _mm256_fnmadd_pd(ymm1, ymm4, ymm10);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+      ymm6 = DTRSM_SMALL_DIV_OR_SCALE(ymm6, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+      ymm8 = _mm256_fnmadd_pd(ymm1, ymm6, ymm8);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm5, ymm9);
+      ymm10 = _mm256_fnmadd_pd(ymm1, ymm6, ymm10);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm0);
+
+      a11 += cs_a;
+
+      // extract a33
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(Row 3): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm7, ymm9);
+      ymm10 = _mm256_fnmadd_pd(ymm1, ymm8, ymm10);
+
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + (cs_b + 4)), ymm6);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2) + 4), ymm8);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 3)), ymm9);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 3) + 4), ymm10);
+    }
+
+    dim_t m_remainder = m - i;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_4nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = (B11[0-3][0] * alpha) - ymm3
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = (B11[0-3][1] * alpha) - ymm5
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // ymm7 = (B11[0-3][2] * alpha) - ymm7
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 3)));
+      // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+      ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+      // ymm9 = (B11[0-3][3] * alpha) - ymm9
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm3, ymm9);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm5, ymm9);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      a11 += cs_a;
+
+      // extract a33
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(Row 3): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm7, ymm9);
+
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 3)), ymm9);
+
+      m_remainder -= 4;
+      i += 4;
+    }
+
+    if (m_remainder == 3)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_4nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = (B11[0-3][0] * alpha) - ymm3
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = (B11[0-3][1] * alpha) - ymm5
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // ymm7 = (B11[0-3][2] * alpha) - ymm7
+
+      xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 3)));
+      ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 3) + 2));
+      ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+      ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+      // ymm9 = (B11[0-2][3] * alpha) - ymm9; lane 3 of ymm0 holds a broadcast copy of row 2
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm3, ymm9);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm5, ymm9);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      a11 += cs_a;
+
+      // extract a33
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(Row 3): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm7, ymm9);
+
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+      _mm_storeu_pd((double *)b11, _mm256_castpd256_pd128(ymm3));
+      _mm_storeu_pd((double *)(b11 + cs_b), _mm256_castpd256_pd128(ymm5));
+      _mm_storeu_pd((double *)(b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm7));
+      _mm_storeu_pd((double *)(b11 + (cs_b * 3)), _mm256_castpd256_pd128(ymm9));
+
+      _mm_storel_pd((double *)b11 + 2, _mm256_extractf128_pd(ymm3, 1));
+      _mm_storel_pd((double *)(b11 + cs_b + 2), _mm256_extractf128_pd(ymm5, 1));
+      _mm_storel_pd((double *)(b11 + (cs_b * 2) + 2), _mm256_extractf128_pd(ymm7, 1));
+      _mm_storel_pd((double *)(b11 + (cs_b * 3) + 2), _mm256_extractf128_pd(ymm9, 1));
+
+      m_remainder -= 3;
+      i += 3;
+    }
+    else if (m_remainder == 2) // tile of 2 leftover rows of B by 4 columns (b11 .. b11 + 3 * cs_b)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // GEMM iteration count passed to the macro below (= current column offset j)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM step; macro is defined outside this view, presumably accumulating into ymm3/5/7/9 -- confirm ///
+      BLIS_DTRSM_SMALL_GEMM_4nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // 4 consecutive doubles of column 0 are loaded; only lanes 0-1 (the 2 live rows) are stored back
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = B11[:,0] * alpha - ymm3
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // 4 consecutive doubles of column 1; lanes 2-3 are computed but never stored
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = B11[:,1] * alpha - ymm5
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // 4 consecutive doubles of column 2; lanes 2-3 are computed but never stored
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // ymm7 = B11[:,2] * alpha - ymm7
+
+      xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 3))); // column 3: 128-bit load, 2 doubles only
+      ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); // lanes 0-1 <- column 3; lanes 2-3 keep the column-2 load
+      ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+      // ymm9 = ymm0 * alpha - ymm9; lanes 2-3 are leftovers and are not stored (128-bit stores below)
+
+      /// implement TRSM///
+
+      // packed diagonal entry 0 (d11_pack[0]); DTRSM_SMALL_DIV_OR_SCALE is defined outside this view
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // packed diagonal entry 1 (d11_pack[1]), applied to ymm5 after the eliminations below
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      // remove the solved column 0 (ymm3) from columns 1..3: ymmK = ymmK - A-entry * ymm3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm3, ymm9);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a; // step a11 by cs_a so the rs_a offsets below address the next set of A entries
+
+      // packed diagonal entry 2 (d11_pack[2])
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      // remove the solved column 1 (ymm5) from columns 2..3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm5, ymm9);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      a11 += cs_a;
+
+      // packed diagonal entry 3 (d11_pack[3])
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      // remove the solved column 2 (ymm7) from column 3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm7, ymm9);
+
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+      _mm_storeu_pd((double *)b11, _mm256_castpd256_pd128(ymm3)); // write back low 128 bits (2 rows) per column
+      _mm_storeu_pd((double *)(b11 + cs_b), _mm256_castpd256_pd128(ymm5));
+      _mm_storeu_pd((double *)(b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm7));
+      _mm_storeu_pd((double *)(b11 + (cs_b * 3)), _mm256_castpd256_pd128(ymm9));
+
+      m_remainder -= 2;
+      i += 2;
+    }
+    else if (m_remainder == 1) // tile of 1 leftover row of B by 4 columns (b11 .. b11 + 3 * cs_b)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // GEMM iteration count passed to the macro below (= current column offset j)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM step; macro is defined outside this view, presumably accumulating into ymm3/5/7/9 -- confirm ///
+      BLIS_DTRSM_SMALL_GEMM_4nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+      ymm0 = _mm256_broadcast_sd((double const *)b11);
+      // the single element at b11 (column 0) replicated into all four lanes
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = B11[0][0] * alpha - ymm3
+
+      ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b));
+      // the single element at b11 + cs_b (column 1) replicated into all four lanes
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = B11[0][1] * alpha - ymm5
+
+      ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 2)));
+      // the single element at b11 + 2 * cs_b (column 2) replicated into all four lanes
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // ymm7 = B11[0][2] * alpha - ymm7
+
+      ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 3)));
+      // the single element at b11 + 3 * cs_b (column 3) replicated into all four lanes
+      ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+      // ymm9 = B11[0][3] * alpha - ymm9
+
+      /// implement TRSM///
+
+      // packed diagonal entry 0 (d11_pack[0]); DTRSM_SMALL_DIV_OR_SCALE is defined outside this view
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // packed diagonal entry 1 (d11_pack[1]), applied to ymm5 after the eliminations below
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      // remove the solved column 0 (ymm3) from columns 1..3: ymmK = ymmK - A-entry * ymm3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm3, ymm9);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a; // step a11 by cs_a so the rs_a offsets below address the next set of A entries
+
+      // packed diagonal entry 2 (d11_pack[2])
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      // remove the solved column 1 (ymm5) from columns 2..3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm5, ymm9);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      a11 += cs_a;
+
+      // packed diagonal entry 3 (d11_pack[3])
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      // remove the solved column 2 (ymm7) from column 3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * rs_a)));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm7, ymm9);
+
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+      _mm_storel_pd((b11 + (cs_b * 0)), _mm256_castpd256_pd128(ymm3)); // write back lane 0 only (1 row) per column
+      _mm_storel_pd((b11 + (cs_b * 1)), _mm256_castpd256_pd128(ymm5));
+      _mm_storel_pd((b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm7));
+      _mm_storel_pd((b11 + (cs_b * 3)), _mm256_castpd256_pd128(ymm9));
+
+      m_remainder -= 1;
+      i += 1;
+    }
+    j += 4;
+    n_remainder -= 4;
+  }
+
+  if (n_remainder == 3)
+  {
+    a01 = L + j * rs_a;      // pointer to block of A to be used in GEMM
+    a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = j; // packed leading dimension
+    // Pack three rows of A (a01, a01 + rs_a, a01 + 2 * rs_a) into D_A_pack as three
+    // back-to-back runs of p_lda doubles; run r starts at D_A_pack + r * p_lda.
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1) // one element per row each pass; source advances by cs_a
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + p_lda * 0)); // presumably (src, dst); macro defined elsewhere
+        bli_dcopys(*(a01 + rs_a * 1), *(ptr_a10_dup + p_lda * 1));
+        bli_dcopys(*(a01 + rs_a * 2), *(ptr_a10_dup + p_lda * 2));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = p_lda / 4;
+      // Each row is read as consecutive doubles here, so copy it 4 doubles (256 bits) at a time.
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 1) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 2) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (x * 4)), ymm15);
+      }
+      // Tail: when p_lda % 4 != 0, exactly one 128-bit copy (2 doubles) is made per row.
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+      // NOTE(review): that is exact only for p_lda % 4 == 2 -- confirm j is always even at this point.
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 1) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 2) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones); // unit-diagonal default: all four lanes = ones
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast A11 entries at offsets k * cs_a + k (k = 0..2); NOTE(review): diagonal only if rs_a == 1 -- confirm
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2) + 2));
+      }
+      else
+      {
+        // broadcast A11 entries at offsets k * rs_a + k (k = 0..2); NOTE(review): diagonal only if cs_a == 1 -- confirm
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 2) + 2));
+      }
+      ymm3 = _mm256_broadcast_sd((double const *)&ones); // filler for the unused fourth lane
+      // Interleave per 128-bit half: ymm0 becomes {d0, d1, d0, d1}, ymm1 becomes {d2, ones, d2, ones}.
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+      // Mask 0x0C takes lanes 2-3 from the second operand: result is {d0, d1, d2, ones}.
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1; // keep the raw diagonal values
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1); // keep ones / diagonal (element-wise)
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4); // d11_pack[0..3] is read back one entry at a time by the solve steps
+
+    for (i = 0; (i + d_mr - 1) < m; i += d_mr) // loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + 4));
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // B11[0-3][0] * alpha -= ymm0
+      ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4);
+      // B11[4-7][0] * alpha-= ymm1
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b + 4));
+      // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // B11[0-3][1] * alpha-= ymm2
+      ymm6 = _mm256_fmsub_pd(ymm1, ymm15, ymm6);
+      // B11[4-7][1] * alpha -= ymm3
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2) + 4));
+      // B11[4][2] B11[5][2] B11[6][2] B11[7][2]
+
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // B11[0-3][2] * alpha -= ymm4
+      ymm8 = _mm256_fmsub_pd(ymm1, ymm15, ymm8);
+      // B11[4-7][2] * alpha -= ymm5
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+      ymm6 = _mm256_fnmadd_pd(ymm1, ymm4, ymm6);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+      ymm8 = _mm256_fnmadd_pd(ymm1, ymm4, ymm8);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+      ymm6 = DTRSM_SMALL_DIV_OR_SCALE(ymm6, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+      ymm8 = _mm256_fnmadd_pd(ymm1, ymm6, ymm8);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + cs_b + 4), ymm6);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2) + 4), ymm8);
+    }
+
+    dim_t m_remainder = m - i;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // B11[0-3][0] * alpha -= ymm0
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // B11[0-3][1] * alpha-= ymm2
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // B11[0-3][2] * alpha -= ymm4
+
+      /// implement TRSM///
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+
+      m_remainder -= 4;
+      i += 4;
+    }
+
+    if (m_remainder == 3)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_3N_3M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_3N_3M(b11, cs_b)
+
+      m_remainder -= 3;
+      i += 3;
+    }
+    else if (m_remainder == 2)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_3N_2M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_3N_2M(b11, cs_b)
+
+      m_remainder -= 2;
+      i += 2;
+    }
+    else if (m_remainder == 1)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_3N_1M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm3, ymm7);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      a11 += cs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm5, ymm7);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_3N_1M(b11, cs_b)
+
+      m_remainder -= 1;
+      i += 1;
+    }
+    j += 3;
+    n_remainder -= 3;
+  }
+  else if (n_remainder == 2)
+  {
+    a01 = L + j * rs_a;      // pointer to block of A to be used in GEMM
+    a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = j; // packed leading dimension
+    // Pack two rows of A (a01 and a01 + rs_a) into D_A_pack as two back-to-back
+    // runs of p_lda doubles; run r starts at D_A_pack + r * p_lda.
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1) // one element per row each pass; source advances by cs_a
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + (p_lda * 0))); // presumably (src, dst); macro defined elsewhere
+        bli_dcopys(*(a01 + rs_a * 1), *(ptr_a10_dup + (p_lda * 1)));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = p_lda / 4;
+      // Each row is read as consecutive doubles here, so copy it 4 doubles (256 bits) at a time.
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 1) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (x * 4)), ymm15);
+      }
+      // Tail: when p_lda % 4 != 0, exactly one 128-bit copy (2 doubles) is made per row.
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+      // NOTE(review): that is exact only for p_lda % 4 == 2 -- confirm j is always even at this point.
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 1) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones); // unit-diagonal default: all four lanes = ones
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast A11 entries at offsets 0 and cs_a + 1; NOTE(review): diagonal only if rs_a == 1 -- confirm
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1) + 1));
+      }
+      else
+      {
+        // broadcast A11 entries at offsets 0 and rs_a + 1; NOTE(review): diagonal only if cs_a == 1 -- confirm
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 1) + 1));
+      }
+      ymm2 = _mm256_broadcast_sd((double const *)&ones); // fillers for the two unused lanes
+      ymm3 = _mm256_broadcast_sd((double const *)&ones);
+      // Interleave per 128-bit half: ymm0 becomes {d0, d1, d0, d1}, ymm1 stays all ones.
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+      // Mask 0x0C takes lanes 2-3 from the second operand: result is {d0, d1, ones, ones}.
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1; // keep the raw diagonal values
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1); // keep ones / diagonal (element-wise)
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4); // d11_pack[0..3] is read back one entry at a time by the solve steps
+
+    for (i = 0; (i + d_mr - 1) < m; i += d_mr) // loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_2nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + 4));
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // B11[0-3][0] * alpha -= ymm0
+      ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4);
+      // B11[4-7][0] * alpha-= ymm1
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b + 4));
+      // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // B11[0-3][1] * alpha-= ymm2
+      ymm6 = _mm256_fmsub_pd(ymm1, ymm15, ymm6);
+      // B11[4-7][1] * alpha -= ymm3
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+      ymm6 = _mm256_fnmadd_pd(ymm1, ymm4, ymm6);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+      ymm6 = DTRSM_SMALL_DIV_OR_SCALE(ymm6, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + cs_b + 4), ymm6);
+    }
+
+    dim_t m_remainder = m - i;
+    if (m_remainder >= 4) // tile of 4 rows of B by 2 columns (b11 and b11 + cs_b)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // GEMM iteration count passed to the macro below (= current column offset j)
+
+      ymm3 = _mm256_setzero_pd(); // only ymm3 and ymm5 are cleared here, one per column of the tile
+      ymm5 = _mm256_setzero_pd();
+
+      /// GEMM step; macro is defined outside this view, presumably accumulating into ymm3/ymm5 -- confirm ///
+      BLIS_DTRSM_SMALL_GEMM_2nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+      // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = B11[0-3][0] * alpha - ymm3
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = B11[0-3][1] * alpha - ymm5
+
+      /// implement TRSM///
+
+      // packed diagonal entry 0 (d11_pack[0]); DTRSM_SMALL_DIV_OR_SCALE is defined outside this view
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // packed diagonal entry 1 (d11_pack[1]), applied to ymm5 after the elimination below
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      // remove the solved column 0 (ymm3) from column 1: ymm5 = ymm5 - A-entry * ymm3
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3); // full 256-bit write-back: all 4 rows of each column
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+
+      m_remainder -= 4;
+      i += 4;
+    }
+
+    if (m_remainder == 3)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+      ymm5 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_2nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_2N_3M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_2N_3M(b11, cs_b)
+
+      m_remainder -= 3;
+      i += 3;
+    }
+    else if (m_remainder == 2)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+      ymm5 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_2nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_2N_2M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_2N_2M(b11, cs_b)
+
+      m_remainder -= 2;
+      i += 2;
+    }
+    else if (m_remainder == 1)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+      ymm5 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_2nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_2N_1M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 1):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm3, ymm5);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_2N_1M(b11, cs_b)
+
+      m_remainder -= 1;
+      i += 1;
+    }
+    j += 2;
+    n_remainder -= 2;
+  }
+  else if (n_remainder == 1)
+  {
+    a01 = L + j * rs_a;      // pointer to block of A to be used in GEMM
+    a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = j; // packed leading dimension
+    // Pack one row of A (starting at a01) into D_A_pack as a single contiguous
+    // run of p_lda doubles.
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1) // one element each pass; source advances by cs_a
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + p_lda * 0)); // presumably (src, dst); macro defined elsewhere
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = p_lda / 4;
+      // The row is read as consecutive doubles here, so copy it 4 doubles (256 bits) at a time.
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+      }
+      // Tail: when p_lda % 4 != 0, exactly one 128-bit copy (2 doubles) is made.
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+      // NOTE(review): that is exact only for p_lda % 4 == 2 -- confirm j is always even at this point.
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      // broadcast diagonal elements of A11
+      ymm0 = _mm256_broadcast_sd((double const *)(a11));
+      ymm1 = _mm256_broadcast_sd((double const *)&ones);
+      ymm2 = _mm256_broadcast_sd((double const *)&ones);
+      ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (i = 0; (i + d_mr - 1) < m; i += d_mr) // loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+      ymm4 = _mm256_setzero_pd();
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + 4));
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = (B11[0-3][0] * alpha) - ymm3
+      ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4);
+      // ymm4 = (B11[4-7][0] * alpha) - ymm4
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+    }
+
+    dim_t m_remainder = m - i;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);   // B11[0-3][0] * alpha -= ymm0
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+
+      m_remainder -= 4;
+      i += 4;
+    }
+
+    if (m_remainder == 3)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_1N_3M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_1N_3M(b11, cs_b)
+
+      m_remainder -= 3;
+      i += 3;
+    }
+    else if (m_remainder == 2)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_1N_2M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_1N_2M(b11, cs_b)
+
+      m_remainder -= 2;
+      i += 2;
+    }
+    else if (m_remainder == 1)
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i;           // pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = j; // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_1N_1M(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      BLIS_POST_DTRSM_SMALL_1N_1M(b11, cs_b)
+
+      m_remainder -= 1;
+      i += 1;
+    }
+    j += 1;
+    n_remainder -= 1;
+  }
+
+  if ((required_packing_A == 1) && bli_mem_is_alloc(&local_mem_buf_A_s))
+  {
+    bli_membrk_release(&rntm,
+               &local_mem_buf_A_s);
+  }
+  return BLIS_SUCCESS;
+}
+
+
+// RLNN - RUTN
+BLIS_INLINE err_t bli_dtrsm_small_XAutB_XAlB_AVX512
+     (
+       obj_t*   AlphaObj,
+       obj_t*   a,
+       obj_t*   b,
+       cntx_t*  cntx,
+       cntl_t*  cntl
+     )
+{
+  dim_t m = bli_obj_length(b); // number of rows
+  dim_t n = bli_obj_width(b);  // number of columns
+  dim_t d_mr = 8, d_nr = 8;
+
+  bool transa = bli_obj_has_trans(a);
+  dim_t cs_a, rs_a;
+  double ones = 1.0;
+
+  // Swap rs_a & cs_a in case of non-transpose.
+  if (transa)
+  {
+    cs_a = bli_obj_col_stride(a); // column stride of A
+    rs_a = bli_obj_row_stride(a); // row stride of A
+  }
+  else
+  {
+    cs_a = bli_obj_row_stride(a); // row stride of A
+    rs_a = bli_obj_col_stride(a); // column stride of A
+  }
+
+  dim_t cs_b = bli_obj_col_stride(b); // column stride of B
+
+  dim_t i, j, k;
+  dim_t k_iter;
+
+  bool is_unitdiag = bli_obj_has_unit_diag(a);
+
+  double AlphaVal = *(double *)AlphaObj->buffer;
+  double *restrict L = bli_obj_buffer_at_off(a); // pointer to matrix A
+  double *B = bli_obj_buffer_at_off(b); // pointer to matrix B
+
+  double *a01, *a11, *b10, *b11; // pointers for GEMM and TRSM blocks
+
+  bool required_packing_A = true;
+  mem_t local_mem_buf_A_s = {0};
+  double *D_A_pack = NULL; // pointer to A01 pack buffer
+  double d11_pack[d_mr] __attribute__((aligned(64))); // buffer for diagonal A pack
+  rntm_t rntm;
+
+  bli_rntm_init_from_global(&rntm);
+  bli_rntm_set_num_threads_only(1, &rntm);
+  bli_membrk_rntm_set_membrk(&rntm);
+
+  siz_t buffer_size = bli_pool_block_size(
+    bli_membrk_pool(
+      bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK),
+      bli_rntm_membrk(&rntm)));
+
+  if ((d_nr * n * sizeof(double)) > buffer_size)
+    return BLIS_NOT_YET_IMPLEMENTED;
+
+  if (required_packing_A)
+  {
+    // Get the buffer from the pool.
+    bli_membrk_acquire_m(&rntm,
+               buffer_size,
+               BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+               &local_mem_buf_A_s); // acquire memory for A01 pack
+    if (FALSE == bli_mem_is_alloc(&local_mem_buf_A_s))
+      return BLIS_NULL_POINTER;
+    D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+    if (NULL == D_A_pack)
+      return BLIS_NULL_POINTER;
+  }
+  __m512d zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11;
+  __m512d zmm12, zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, zmm21;
+  __m512d zmm22, zmm23, zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31;
+  __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11;
+  __m256d ymm12, ymm13, ymm14, ymm15, ymm16, ymm17, ymm18, ymm19, ymm20, ymm21;
+  __m256d ymm22, ymm23, ymm24, ymm25, ymm26, ymm27, ymm28, ymm29, ymm30, ymm31;
+  __m128d xmm5, xmm0;
+
+  // gcc12 emits an uninitialized-variable warning for these registers;
+  // to avoid it, they are explicitly set to zero here.
+  xmm5 = _mm_setzero_pd();
+  ymm0 = _mm256_setzero_pd();
+  ymm6 = _mm256_setzero_pd();
+
+
+  /*
+    Solves TRSM for d_nr (8) columns of B at a time, with j starting at (n - d_nr)
+    and moving down in steps of d_nr while j >= 0
+    a. Load and pack A (a01 block), the size of packing 8x8 to 8x(n-8)
+        In the first iteration (p_lda == 0) there will be no GEMM and no packing of a01
+        because it is only TRSM
+    b. Using packed a01 block and b10 block perform GEMM operation
+    c. Use GEMM outputs, perform TRSM operation using a11, b11 and update B
+    d. Repeat b and c for all m rows of B in steps of d_mr
+  */
+  for (j = (n - d_nr); j > -1; j -= d_nr) //loop along 'N' direction
+  {
+    a01 = L + (j * rs_a) + (j + d_nr) * cs_a;  //pointer to block of A to be used in GEMM
+    a11 = L + (j * cs_a) + (j * rs_a);         //pointer to block of A to be used for TRSM
+
+    dim_t p_lda = (n - j - d_nr);              //packed leading dimension
+
+    // perform copy of A to packed buffer D_A_pack
+    if (transa)
+    {
+      /*
+      Pack current A block (a01) into packed buffer memory D_A_pack
+        a. This a01 block is used in the GEMM portion only, and this
+            a01 block size will be increasing by d_nr for every next iteration
+            until it reaches 8x(n-8) which is the maximum GEMM alone block size in A
+        b. This packed buffer is reused to calculate all m cols of B matrix
+      */
+      bli_dtrsm_small_pack_avx512
+      (
+        'R',
+        p_lda,
+        1,
+        a01,
+        cs_a,
+        D_A_pack,
+        p_lda,
+        d_nr
+      );
+      /*
+        Pack 8 diagonal elements of A block into an array
+        a. This helps to utilize cache line efficiently in TRSM operation
+        b. store ones when input is unit diagonal
+      */
+      dtrsm_small_pack_diag_element_avx512
+      (
+        is_unitdiag,
+        a11,
+        cs_a,
+        d11_pack,
+        d_nr
+      );
+    }
+    else
+    {
+      bli_dtrsm_small_pack_avx512
+      (
+        'R',
+        p_lda,
+        0,
+        a01,
+        rs_a,
+        D_A_pack,
+        p_lda,
+        d_nr
+      );
+      dtrsm_small_pack_diag_element_avx512
+      (
+        is_unitdiag,
+        a11,
+        rs_a,
+        d11_pack,
+        d_nr
+      );
+    }
+
+    /*
+      a. Perform GEMM using a01, b10.
+      b. Perform TRSM on a11, b11
+      c. This GEMM+TRSM loop operates with an 8x8 block size
+          along the m dimension, for every d_mr rows of B10, where the
+          packed A buffer is reused in computing all m rows of B.
+      d. Same approach is used in remaining fringe cases.
+    */
+    for (i = (m - d_mr); (i + 1) > 0; i -= d_mr) //loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + j * cs_a + j * rs_a;   //pointer to block of A to be used for TRSM
+      b10 = B + i + (j + d_nr) * cs_b; //pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;          //pointer to block of B to be used for TRSM
+
+      k_iter = (n - j - d_nr);
+      BLIS_SET_ZMM_REG_ZEROS
+      /*
+        Perform GEMM between a01 and b10 blocks
+        For first iteration there will be no GEMM operation
+        where k_iter are zero
+      */
+      BLIS_DTRSM_SMALL_GEMM_8nx8m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11);
+      /*
+        Load b11 of size 8x8 and multiply with alpha
+        Add the GEMM output to b11
+        and perform TRSM operation.
+      */
+      BLIS_PRE_DTRSM_SMALL_8x8(AlphaVal, b11, cs_b)
+
+
+
+      /*
+        Compute 8x8 TRSM block by using GEMM block output in register
+        a. The 8x8 input (gemm outputs) are stored in combinations of zmm registers
+            row      :   0     1    2      3     4     5     6     7
+            register : zmm9  zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16
+        b. Towards the end TRSM output will be stored back into b11
+      */
+
+      /*
+      *                                             to i=7
+      *  B11[Nth column] = ( GEMM(Nth column) -       Σ  {  B11[i] * A11[N][i]  } ) / A11[N][N]
+      *                                           from i=N+1
+      *
+      *  For example 3rd column (B11[2]) -= ((B11[3] * A11[2][3]) + (B11[4] * A11[2][4]) +
+      *                                      (B11[5] * A11[2][5]) + (B11[6] * A11[2][6]) +
+      *                                      (B11[7] * A11[2][7])) / A11[2][2]
+      *                          zmm11   -= ((zmm12  * A11[2][3]) + (zmm13  * A11[2][4]) +
+      *                                      (zmm14  * A11[2][5]) + (zmm15  * A11[2][6]) +
+      *                                      (zmm16  * A11[2][7])) / A11[2][2]
+      */
+
+      // extract a77
+      zmm0  = _mm512_set1_pd(*(d11_pack + 7));
+      zmm16 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm16, zmm0);
+      _mm512_storeu_pd((double *)(b11 + 7 * cs_b), zmm16);
+
+      // extract a66
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (6 * rs_a)));
+      zmm1  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (5 * rs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm0, zmm16, zmm15);
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (4 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm1, zmm16, zmm14);
+      zmm1  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (3 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm0, zmm16, zmm13);
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (2 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm16, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (1 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm16, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * cs_a) + (0 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm16, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 6));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm16, zmm9);
+      zmm15 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm15, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (6 * cs_b)), zmm15);
+
+      // extract a55
+      zmm1  = _mm512_set1_pd(*(a11 + (6 * cs_a) + (5 * rs_a)));
+      zmm0  = _mm512_set1_pd(*(a11 + (6 * cs_a) + (4 * rs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm1, zmm15, zmm14);
+      zmm1  = _mm512_set1_pd(*(a11 + (6 * cs_a) + (3 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm0, zmm15, zmm13);
+      zmm0  = _mm512_set1_pd(*(a11 + (6 * cs_a) + (2 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm15, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (6 * cs_a) + (1 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm15, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (6 * cs_a) + (0 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm15, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 5));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm15, zmm9);
+      zmm14 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm14, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (5 * cs_b)), zmm14);
+
+      // extract a44
+      zmm0  = _mm512_set1_pd(*(a11 + (5 * cs_a) + (4 * rs_a)));
+      zmm1  = _mm512_set1_pd(*(a11 + (5 * cs_a) + (3 * rs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm0, zmm14, zmm13);
+      zmm0  = _mm512_set1_pd(*(a11 + (5 * cs_a) + (2 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm14, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (5 * cs_a) + (1 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm14, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (5 * cs_a) + (0 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm14, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 4));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm14, zmm9);
+      zmm13 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm13, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (4 * cs_b)), zmm13);
+
+      // extract a33
+      zmm1  = _mm512_set1_pd(*(a11 + (4 * cs_a) + (3 * rs_a)));
+      zmm0  = _mm512_set1_pd(*(a11 + (4 * cs_a) + (2 * rs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm13, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (4 * cs_a) + (1 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm13, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (4 * cs_a) + (0 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm13, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 3));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm13, zmm9);
+      zmm12 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm12, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (3 * cs_b)), zmm12);
+
+      // extract a22
+      zmm0  = _mm512_set1_pd(*(a11 + (3 * cs_a) + (2 * rs_a)));
+      zmm1  = _mm512_set1_pd(*(a11 + (3 * cs_a) + (1 * rs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm12, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (3 * cs_a) + (0 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm12, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 2));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm12, zmm9);
+      zmm11 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm11, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (2 * cs_b)), zmm11);
+
+      // extract a11
+      zmm1  = _mm512_set1_pd(*(a11 + (2 * cs_a) + (1 * rs_a)));
+      zmm0  = _mm512_set1_pd(*(a11 + (2 * cs_a) + (0 * rs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm11, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 1));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm11, zmm9);
+      zmm10 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm10, zmm1);
+      _mm512_storeu_pd((double *)(b11 + (1 * cs_b)), zmm10);
+
+      // extract a00
+      zmm1 = _mm512_set1_pd(*(a11 + (1 * cs_a) + (0 * rs_a)));
+      zmm0 = _mm512_set1_pd(*(d11_pack + 0));
+      zmm9 = _mm512_fnmadd_pd(zmm1, zmm10, zmm9);
+      zmm9 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm9, zmm0);
+      _mm512_storeu_pd((double *)(b11 + (0 * cs_b)), zmm9);
+    }
+    dim_t m_remainder = i + d_mr;
+    if(m_remainder)
+    {
+      if (m_remainder >= 4) //loop along 'M' direction
+      {
+        a01 = D_A_pack;
+        a11 = L + (j * cs_a) + (j * rs_a);               //pointer to block of A to be used for TRSM
+        b10 = B + (m_remainder - 4) + (j + d_nr) * cs_b; //pointer to block of B to be used in GEMM
+        b11 = B + (m_remainder - 4) + (j * cs_b);        //pointer to block of B to be used for TRSM
+
+        k_iter = (n - j - d_nr);
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        /*
+          Perform GEMM between a01 and b10 blocks
+          For first iteration there will be no GEMM operation
+          where k_iter are zero
+        */
+        BLIS_DTRSM_SMALL_GEMM_8nx4m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+        /*
+          Load b11 of size 8x4 and multiply with alpha
+          Add the GEMM output to b11
+          and perform TRSM operation.
+        */
+        BLIS_PRE_DTRSM_SMALL_8x4(AlphaVal, b11, cs_b)
+
+        /*
+          Compute 8x4 TRSM block by using GEMM block output in register
+          a. The 8x4 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                             to i=7
+        *  B11[Nth column] = ( GEMM(Nth column) -       Σ  {  B11[i] * A11[N][i]  } ) / A11[N][N]
+        *                                           from i=N+1
+        *
+        *  For example 3rd column (B11[2]) -= ((B11[3] * A11[2][3]) + (B11[4] * A11[2][4]) +
+        *                                      (B11[5] * A11[2][5]) + (B11[6] * A11[2][6]) +
+        *                                      (B11[7] * A11[2][7])) / A11[2][2]
+        */
+
+        // extract a77
+        ymm0  = _mm256_broadcast_sd((d11_pack + 7));
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm0);
+        _mm256_storeu_pd((double *)(b11 + (7 * cs_b)), ymm16);
+
+        // extract a66
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (6 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (5 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm0, ymm16, ymm15);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm16, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm16, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm16, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm16, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm16, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 6));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm16, ymm9);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (6 * cs_b)), ymm15);
+
+        // extract a55
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (5 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm15, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm15, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm15, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm15, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm15, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 5));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm15, ymm9);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (5 * cs_b)), ymm14);
+
+        // extract a44
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (4 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm14, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm14, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm14, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm14, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 4));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm14, ymm9);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (4 * cs_b)), ymm13);
+
+        // extract a33
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (3 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm13, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm13, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm13, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 3));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm13, ymm9);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (3 * cs_b)), ymm12);
+
+        // extract a22
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm12, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm12, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 2));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm12, ymm9);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (2 * cs_b)), ymm11);
+
+        // extract a11
+        ymm1  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm11, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 1));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm11, ymm9);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        _mm256_storeu_pd((double *)(b11 + (1 * cs_b)), ymm10);
+
+        // extract a00
+        ymm1 = _mm256_broadcast_sd((a11 + (1 * cs_a) + (0 * rs_a)));
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = _mm256_fnmadd_pd(ymm1, ymm10, ymm9);
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm256_storeu_pd((double *)(b11 + (0 * cs_b)), ymm9);
+        m_remainder -= 4;
+      }
+      if (m_remainder == 3) //loop along 'M' direction
+      {
+        a01 = D_A_pack;
+        a11 = L + (j * cs_a) + (j * rs_a);               //pointer to block of A to be used for TRSM
+        b10 = B + (j + d_nr) * cs_b; // pointer to block of B to be used in GEMM
+        b11 = B + (j * cs_b);        //pointer to block of B to be used for TRSM
+
+        k_iter = (n - j - d_nr);
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        /*
+          Perform GEMM between a01 and b10 blocks
+          For first iteration there will be no GEMM operation
+          where k_iter are zero
+        */
+        BLIS_DTRSM_SMALL_GEMM_8nx3m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+        /*
+          Load b11 of size 8x3 and multiply with alpha
+          Add the GEMM output to b11
+          and perform TRSM operation.
+        */
+        BLIS_PRE_DTRSM_SMALL_8x3(AlphaVal, b11, cs_b)
+        /*
+          Compute 8x3 TRSM block by using GEMM block output in register
+          a. The 8x3 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                             to i=7
+        *  B11[Nth column] = ( GEMM(Nth column) -       Σ  {  B11[i] * A11[N][i]  } ) / A11[N][N]
+        *                                           from i=N+1
+        *
+        *  For example 3rd column (B11[2]) -= ((B11[3] * A11[2][3]) + (B11[4] * A11[2][4]) +
+        *                                      (B11[5] * A11[2][5]) + (B11[6] * A11[2][6]) +
+        *                                      (B11[7] * A11[2][7])) / A11[2][2]
+        */
+
+        // extract a77
+        ymm0  = _mm256_broadcast_sd((d11_pack + 7));
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm0);
+        _mm_storeu_pd((double *)(b11 + (7 * cs_b) + 0), _mm256_castpd256_pd128(ymm16));
+        _mm_storel_pd((double *)(b11 + (7 * cs_b) + 2), _mm256_extractf64x2_pd(ymm16, 1));
+
+        // extract a66
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (6 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (5 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm0, ymm16, ymm15);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm16, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm16, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm16, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm16, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm16, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 6));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm16, ymm9);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+        _mm_storeu_pd((double *)(b11 + (6 * cs_b) + 0), _mm256_castpd256_pd128(ymm15));
+        _mm_storel_pd((double *)(b11 + (6 * cs_b) + 2), _mm256_extractf64x2_pd(ymm15, 1));
+
+        // extract a55
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (5 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm15, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm15, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm15, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm15, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm15, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 5));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm15, ymm9);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        _mm_storeu_pd((double *)(b11 + (5 * cs_b) + 0), _mm256_castpd256_pd128(ymm14));
+        _mm_storel_pd((double *)(b11 + (5 * cs_b) + 2), _mm256_extractf64x2_pd(ymm14, 1));
+
+        // extract a44
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (4 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm14, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm14, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm14, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm14, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 4));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm14, ymm9);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+        _mm_storeu_pd((double *)(b11 + (4 * cs_b) + 0), _mm256_castpd256_pd128(ymm13));
+        _mm_storel_pd((double *)(b11 + (4 * cs_b) + 2), _mm256_extractf64x2_pd(ymm13, 1));
+
+        // extract a33
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (3 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm13, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm13, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm13, ymm10);
+        ymm1 = _mm256_broadcast_sd((d11_pack + 3));
+        ymm9 = _mm256_fnmadd_pd(ymm0, ymm13, ymm9);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        _mm_storeu_pd((double *)(b11 + (3 * cs_b) + 0), _mm256_castpd256_pd128(ymm12));
+        _mm_storel_pd((double *)(b11 + (3 * cs_b) + 2), _mm256_extractf64x2_pd(ymm12, 1));
+
+        // extract a22
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm12, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm12, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 2));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm12, ymm9);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+        _mm_storeu_pd((double *)(b11 + (2 * cs_b) + 0), _mm256_castpd256_pd128(ymm11));
+        _mm_storel_pd((double *)(b11 + (2 * cs_b) + 2), _mm256_extractf64x2_pd(ymm11, 1));
+
+        // extract a11
+        ymm1  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm11, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 1));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm11, ymm9);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        _mm_storeu_pd((double *)(b11 + (1 * cs_b) + 0), _mm256_castpd256_pd128(ymm10));
+        _mm_storel_pd((double *)(b11 + (1 * cs_b) + 2), _mm256_extractf64x2_pd(ymm10, 1));
+
+        // extract a00
+        ymm1 = _mm256_broadcast_sd((a11 + (1 * cs_a) + (0 * rs_a)));
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = _mm256_fnmadd_pd(ymm1, ymm10, ymm9);
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm_storeu_pd((double *)(b11 + (0 * cs_b) + 0), _mm256_castpd256_pd128(ymm9));
+        _mm_storel_pd((double *)(b11 + (0 * cs_b) + 2), _mm256_extractf64x2_pd(ymm9, 1));
+        m_remainder -= 3;
+      }
+      else if (m_remainder == 2) //loop along 'M' direction
+      {
+        a01 = D_A_pack;
+        a11 = L + (j * cs_a) + (j * rs_a);               //pointer to block of A to be used for TRSM
+        b10 = B + (j + d_nr) * cs_b; // pointer to block of B to be used in GEMM
+        b11 = B + (j * cs_b);        //pointer to block of B to be used for TRSM
+
+        k_iter = (n - j - d_nr);
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        BLIS_DTRSM_SMALL_GEMM_8nx2m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11)
+        BLIS_PRE_DTRSM_SMALL_8x2(AlphaVal, b11, cs_b)
+        /*
+          Compute 8x2 TRSM block by using GEMM block output in register
+          a. The 8x2 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                            to i=7
+        *  B11[Nth column] = ( GEMM(Nth column) -      Σ  {  B11[i] * A11[N][i]  } ) / A11[N][N]
+        *                                          from i=N+1
+        *
+        *  For example, for the 3rd column:
+        *    B11[2] = (B11[2] - ((B11[3] * A11[2][3]) + (B11[4] * A11[2][4]) +
+        *                        (B11[5] * A11[2][5]) + (B11[6] * A11[2][6]) +
+        *                        (B11[7] * A11[2][7]))) / A11[2][2]
+        */
+
+        // extract a77
+        ymm0  = _mm256_broadcast_sd((d11_pack + 7));
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm0);
+        _mm_storeu_pd((double *)(b11 + (7 * cs_b)), _mm256_castpd256_pd128(ymm16));
+
+        // extract a66
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (6 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (5 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm0, ymm16, ymm15);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm16, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm16, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm16, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm16, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm16, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 6));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm16, ymm9);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+        _mm_storeu_pd((double *)(b11 + (6 * cs_b)), _mm256_castpd256_pd128(ymm15));
+
+        // extract a55
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (5 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm15, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm15, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm15, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm15, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm15, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 5));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm15, ymm9);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        _mm_storeu_pd((double *)(b11 + (5 * cs_b)), _mm256_castpd256_pd128(ymm14));
+
+        // extract a44
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (4 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm14, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm14, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm14, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm14, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 4));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm14, ymm9);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+        _mm_storeu_pd((double *)(b11 + (4 * cs_b)), _mm256_castpd256_pd128(ymm13));
+
+        // extract a33
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (3 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm13, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm13, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm13, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 3));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm13, ymm9);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        _mm_storeu_pd((double *)(b11 + (3 * cs_b)), _mm256_castpd256_pd128(ymm12));
+
+        // extract a22
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm12, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm12, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 2));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm12, ymm9);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+        _mm_storeu_pd((double *)(b11 + (2 * cs_b)), _mm256_castpd256_pd128(ymm11));
+
+        // extract a11
+        ymm1  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm11, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 1));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm11, ymm9);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        _mm_storeu_pd((double *)(b11 + (1 * cs_b)), _mm256_castpd256_pd128(ymm10));
+
+        // extract a00
+        ymm1 = _mm256_broadcast_sd((a11 + (1 * cs_a) + (0 * rs_a)));
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = _mm256_fnmadd_pd(ymm1, ymm10, ymm9);
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm_storeu_pd((double *)(b11 + (0 * cs_b)), _mm256_castpd256_pd128(ymm9));
+        m_remainder -= 2;
+      }
+      else if (m_remainder == 1)  //loop along 'M' direction
+      {
+        a01 = D_A_pack;
+        a11 = L + (j * cs_a) + (j * rs_a);               //pointer to block of A to be used for TRSM
+        b10 = B + (j + d_nr) * cs_b; //pointer to block of B to be used in GEMM
+        b11 = B + (j * cs_b);        //pointer to block of B to be used for TRSM
+
+        k_iter = (n - j - d_nr);
+        BLIS_SET_YMM_REG_ZEROS_AVX512
+        BLIS_DTRSM_SMALL_GEMM_8nx1m_AVX512(a01, b10, cs_b, p_lda, k_iter, b11);
+        BLIS_PRE_DTRSM_SMALL_8x1(AlphaVal, b11, cs_b)
+        /*
+          Compute 8x1 TRSM block by using GEMM block output in register
+          a. The 8x1 input (gemm outputs) are stored in combinations of ymm registers
+              row      :   0     1    2      3     4     5     6     7
+              register : ymm9  ymm10 ymm11 ymm12 ymm13 ymm14 ymm15 ymm16
+          b. Towards the end TRSM output will be stored back into b11
+        */
+
+        /*
+        *                                            to i=7
+        *  B11[Nth column] = ( GEMM(Nth column) -      Σ  {  B11[i] * A11[N][i]  } ) / A11[N][N]
+        *                                          from i=N+1
+        *
+        *  For example, for the 3rd column:
+        *    B11[2] = (B11[2] - ((B11[3] * A11[2][3]) + (B11[4] * A11[2][4]) +
+        *                        (B11[5] * A11[2][5]) + (B11[6] * A11[2][6]) +
+        *                        (B11[7] * A11[2][7]))) / A11[2][2]
+        */
+
+        // extract a77
+        ymm0  = _mm256_broadcast_sd((d11_pack + 7));
+        ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm0);
+        _mm_storel_pd((double *)(b11 + (7 * cs_b)), _mm256_castpd256_pd128(ymm16));
+
+        // extract a66
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (6 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (5 * rs_a)));
+        ymm15 = _mm256_fnmadd_pd(ymm0, ymm16, ymm15);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm16, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm16, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm16, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm16, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (7 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm16, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 6));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm16, ymm9);
+        ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+        _mm_storel_pd((double *)(b11 + (6 * cs_b)), _mm256_castpd256_pd128(ymm15));
+
+        // extract a55
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (5 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (4 * rs_a)));
+        ymm14 = _mm256_fnmadd_pd(ymm1, ymm15, ymm14);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm15, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm15, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm15, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (6 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm15, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 5));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm15, ymm9);
+        ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+        _mm_storel_pd((double *)(b11 + (5 * cs_b)), _mm256_castpd256_pd128(ymm14));
+
+        // extract a44
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (4 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (3 * rs_a)));
+        ymm13 = _mm256_fnmadd_pd(ymm0, ymm14, ymm13);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm14, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm14, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (5 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm14, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 4));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm14, ymm9);
+        ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+        _mm_storel_pd((double *)(b11 + (4 * cs_b)), _mm256_castpd256_pd128(ymm13));
+
+        // extract a33
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (3 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (2 * rs_a)));
+        ymm12 = _mm256_fnmadd_pd(ymm1, ymm13, ymm12);
+        ymm1  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm13, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (4 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm13, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 3));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm13, ymm9);
+        ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+        _mm_storel_pd((double *)(b11 + (3 * cs_b)), _mm256_castpd256_pd128(ymm12));
+
+        // extract a22
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm1  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm11 = _mm256_fnmadd_pd(ymm0, ymm12, ymm11);
+        ymm0  = _mm256_broadcast_sd((a11 + (3 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm12, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 2));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm12, ymm9);
+        ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+        _mm_storel_pd((double *)(b11 + (2 * cs_b)), _mm256_castpd256_pd128(ymm11));
+
+        // extract a11
+        ymm1  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm0  = _mm256_broadcast_sd((a11 + (2 * cs_a) + (0 * rs_a)));
+        ymm10 = _mm256_fnmadd_pd(ymm1, ymm11, ymm10);
+        ymm1  = _mm256_broadcast_sd((d11_pack + 1));
+        ymm9  = _mm256_fnmadd_pd(ymm0, ymm11, ymm9);
+        ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+        _mm_storel_pd((double *)(b11 + (1 * cs_b)), _mm256_castpd256_pd128(ymm10));
+
+        // extract a00
+        ymm1 = _mm256_broadcast_sd((a11 + (1 * cs_a) + (0 * rs_a)));
+        ymm0 = _mm256_broadcast_sd((d11_pack + 0));
+        ymm9 = _mm256_fnmadd_pd(ymm1, ymm10, ymm9);
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+        _mm_storel_pd((double *)(b11 + (0 * cs_b)), _mm256_castpd256_pd128(ymm9));
+        m_remainder -= 1;
+    }
+    }
+  }
+
+  dim_t n_remainder = j + d_nr;
+
+  /*
+  Remainder cases start here:
+  a. The same logic and code flow used to compute the full block (8x8)
+     above also applies to the remainder cases.
+  */
+
+  if (n_remainder >= 4)
+  {
+    a01 = L + (n_remainder - 4) * rs_a + n_remainder * cs_a;     // pointer to block of A to be used in GEMM
+    a11 = L + (n_remainder - 4) * cs_a + (n_remainder - 4) * rs_a; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = (n - n_remainder); // packed leading dimension
+    // perform copy of A to packed buffer D_A_pack
+
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1)
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + (p_lda * 0)));
+        bli_dcopys(*(a01 + rs_a * 1), *(ptr_a10_dup + (p_lda * 1)));
+        bli_dcopys(*(a01 + rs_a * 2), *(ptr_a10_dup + (p_lda * 2)));
+        bli_dcopys(*(a01 + rs_a * 3), *(ptr_a10_dup + (p_lda * 3)));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = (n - n_remainder) / 4;
+
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 1) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 2) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 3) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 3) + (x * 4)), ymm15);
+      }
+
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 1) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 2) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 3) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 3) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    // read diagonal from a11 if not unit diagonal
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2) + 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3) + 3));
+      }
+      else
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 2) + 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 3) + 3));
+      }
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (i = (m - d_mr); (i + 1) > 0; i -= d_mr) // loop along 'M' direction
+    {
+      a01 = D_A_pack;
+      a11 = L + (n_remainder - 4) * cs_a + (n_remainder - 4) * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + i + (n_remainder)*cs_b;                // pointer to block of B to be used in GEMM
+      b11 = B + (i) + (n_remainder - 4) * cs_b;            // pointer to block of B to be used for TRSM
+
+      k_iter = (n - n_remainder); // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_4nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      BLIS_PRE_DTRSM_SMALL_4x8(AlphaVal, b11, cs_b)
+
+      /// implement TRSM///
+
+      // extract a33
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm0);
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(Row 3): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (2 * rs_a)));
+
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm9, ymm7);
+      ymm8 = _mm256_fnmadd_pd(ymm1, ymm10, ymm8);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (1 * rs_a)));
+
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm9, ymm5);
+      ymm6 = _mm256_fnmadd_pd(ymm1, ymm10, ymm6);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a)));
+
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm9, ymm3);
+      ymm4 = _mm256_fnmadd_pd(ymm1, ymm10, ymm4);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+      ymm6 = _mm256_fnmadd_pd(ymm1, ymm8, ymm6);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+      ymm4 = _mm256_fnmadd_pd(ymm1, ymm8, ymm4);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+      ymm6 = DTRSM_SMALL_DIV_OR_SCALE(ymm6, ymm0);
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      //(Row 1): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+      ymm4 = _mm256_fnmadd_pd(ymm1, ymm6, ymm4);
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + (cs_b + 4)), ymm6);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2) + 4), ymm8);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 3)), ymm9);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 3) + 4), ymm10);
+    }
+
+    dim_t m_remainder = i + d_mr;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L + (n_remainder - 4) * cs_a + (n_remainder - 4) * rs_a; // pointer to block of A to be used for TRSM
+      b10 = B + (m_remainder - 4) + (n_remainder)*cs_b;        // pointer to block of B to be used in GEMM
+      b11 = B + (m_remainder - 4) + (n_remainder - 4) * cs_b;    // pointer to block of B to be used for TRSM
+
+      k_iter = (n - n_remainder); // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_4nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = (B11[0-3][0] * alpha) - ymm3
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = (B11[0-3][1] * alpha) - ymm5
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // ymm7 = (B11[0-3][2] * alpha) - ymm7
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 3)));
+      // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+      ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+      // ymm9 = (B11[0-3][3] * alpha) - ymm9
+
+      /// implement TRSM///
+
+      // extract a33
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(Row 3): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (2 * rs_a)));
+      ymm7 = _mm256_fnmadd_pd(ymm1, ymm9, ymm7);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm9, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a)));
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm9, ymm3);
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      //(Row 1): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 3)), ymm9);
+
+      m_remainder -= 4;
+    }
+
+    if (m_remainder)
+    {
+      if (m_remainder == 3)
+      {
+        a01 = D_A_pack;
+        a11 = L + (n_remainder - 4) * cs_a + (n_remainder - 4) * rs_a; // pointer to block of A to be used for TRSM
+        b10 = B + (n_remainder)*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B + (n_remainder - 4) * cs_b;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - n_remainder); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_4nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+        ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)b11);
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+        // ymm3 = (B11[0-3][0] * alpha) - ymm3
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+        // ymm5 = (B11[0-3][1] * alpha) - ymm5
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+        // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+        ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+        // ymm7 = (B11[0-3][2] * alpha) - ymm7
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 3)));
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 3) + 2));
+        ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+        ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+        // ymm9 = (B11[0-2][3] * alpha) - ymm9
+
+        /// implement TRSM///
+
+        // extract a33
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+        //(Row 3): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm7 = _mm256_fnmadd_pd(ymm1, ymm9, ymm7);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm9, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm9, ymm3);
+
+        ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+        //(row 2):FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        _mm_storeu_pd((double *)b11, _mm256_castpd256_pd128(ymm3));
+        _mm_storeu_pd((double *)(b11 + cs_b), _mm256_castpd256_pd128(ymm5));
+        _mm_storeu_pd((double *)(b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm7));
+        _mm_storeu_pd((double *)(b11 + (cs_b * 3)), _mm256_castpd256_pd128(ymm9));
+
+        _mm_storel_pd((double *)b11 + 2, _mm256_extractf128_pd(ymm3, 1));
+        _mm_storel_pd((double *)(b11 + cs_b + 2), _mm256_extractf128_pd(ymm5, 1));
+        _mm_storel_pd((double *)(b11 + (cs_b * 2) + 2), _mm256_extractf128_pd(ymm7, 1));
+        _mm_storel_pd((double *)(b11 + (cs_b * 3) + 2), _mm256_extractf128_pd(ymm9, 1));
+
+        m_remainder -= 3;
+      }
+      else if (m_remainder == 2)
+      {
+        a01 = D_A_pack;
+        a11 = L + (n_remainder - 4) * cs_a + (n_remainder - 4) * rs_a; // pointer to block of A to be used for TRSM
+        b10 = B + (n_remainder)*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B + (n_remainder - 4) * cs_b;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - n_remainder); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_4nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+        ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)b11);
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+        // ymm3 = (B11[0-3][0] * alpha) - ymm3
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+        // ymm5 = (B11[0-3][1] * alpha) - ymm5
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+        // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+        ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+        // ymm7 = (B11[0-3][2] * alpha) - ymm7
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 3)));
+        ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+        ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+        // ymm9 = (B11[0-1][3] * alpha) - ymm9
+
+        /// implement TRSM///
+
+        // extract a33
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+        //(Row 3): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm7 = _mm256_fnmadd_pd(ymm1, ymm9, ymm7);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm9, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm9, ymm3);
+
+        ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+        //(row 2):FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        _mm_storeu_pd((double *)b11, _mm256_castpd256_pd128(ymm3));
+        _mm_storeu_pd((double *)(b11 + cs_b), _mm256_castpd256_pd128(ymm5));
+        _mm_storeu_pd((double *)(b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm7));
+        _mm_storeu_pd((double *)(b11 + (cs_b * 3)), _mm256_castpd256_pd128(ymm9));
+
+        m_remainder -= 2;
+      }
+      else if (m_remainder == 1)
+      {
+        a01 = D_A_pack;
+        a11 = L + (n_remainder - 4) * cs_a + (n_remainder - 4) * rs_a; // diagonal element (n_remainder-4, n_remainder-4) of L: start of the 4x4 block used by the single-row TRSM below
+        b10 = B + (n_remainder)*cs_b;        // B starting at column n_remainder; input to the GEMM macro
+        b11 = B + (n_remainder - 4) * cs_b;    // the four columns of B that this branch loads, updates and stores
+
+        k_iter = (n - n_remainder); // iteration count handed to the GEMM macro below
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_4nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+        ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);// register to hold alpha
+
+        ymm0 = _mm256_broadcast_sd((double const *)b11);
+        // B11[0][0] replicated in all four lanes (only one row is left)
+        ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+        // ymm3 = B11[0][0] * alpha - ymm3
+
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b));
+        // B11[0][1] replicated in all four lanes
+        ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+        // ymm5 = B11[0][1] * alpha - ymm5
+
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 2)));
+        // B11[0][2] replicated in all four lanes
+        ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+        // ymm7 = B11[0][2] * alpha - ymm7
+
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 3)));
+        // B11[0][3] replicated in all four lanes
+        ymm9 = _mm256_fmsub_pd(ymm0, ymm15, ymm9);
+        // ymm9 = B11[0][3] * alpha - ymm9
+
+        /// implement TRSM///
+
+        // d11_pack[3]: packed diagonal term applied to column 3
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+        ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+
+        // d11_pack[2]: loaded now, applied to column 2 after the updates below
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+        // subtract the finished column 3 (ymm9) from columns 2, 1 and 0
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (2 * rs_a)));
+        ymm7 = _mm256_fnmadd_pd(ymm1, ymm9, ymm7);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm9, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (3 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm9, ymm3);
+
+        ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+        // d11_pack[1]: loaded now, applied to column 1 after the updates below
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+        // subtract the finished column 2 (ymm7) from columns 1 and 0
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // d11_pack[0]: loaded now, applied to column 0 after the update below
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        // subtract the finished column 1 (ymm5) from column 0
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        _mm_storel_pd((b11 + (cs_b * 0)), _mm256_castpd256_pd128(ymm3));
+        _mm_storel_pd((b11 + (cs_b * 1)), _mm256_castpd256_pd128(ymm5));
+        _mm_storel_pd((b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm7));
+        _mm_storel_pd((b11 + (cs_b * 3)), _mm256_castpd256_pd128(ymm9));
+
+        m_remainder -= 1;
+      }
+    }
+    n_remainder -= 4;
+  }
+
+  if (n_remainder == 3)
+  {
+    a01 = L + 3*cs_a;     // pointer to block of A to be used in GEMM
+    a11 = L; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = (n - 3); // packed leading dimension
+    // perform copy of A to packed buffer D_A_pack
+
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1)
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + p_lda * 0));
+        bli_dcopys(*(a01 + rs_a * 1), *(ptr_a10_dup + p_lda * 1));
+        bli_dcopys(*(a01 + rs_a * 2), *(ptr_a10_dup + p_lda * 2));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = (n - 3) / 4;
+
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 1) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 2) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (x * 4)), ymm15);
+      }
+
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 1) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 2) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 2) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2) + 2));
+      }
+      else
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 1) + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 2) + 2));
+      }
+      ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (i = (m - d_mr); (i + 1) > 0; i -= d_mr) // 3-column case: step through M from row m - d_mr downwards in strides of d_mr, stopping once i drops below 0
+    {
+      a01 = D_A_pack;
+      a11 = L; // top-left of L: the 3x3 block read by the TRSM steps below
+      b10 = B + i + 3*cs_b;                // rows from i, columns from 3 onward; input to the GEMM macro
+      b11 = B + i;            // rows from i, columns 0..2; loaded, updated and stored below
+
+      k_iter = (n - 3); // iteration count handed to the GEMM macro below
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + 4));
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = B11[0-3][0] * alpha - ymm3
+      ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4);
+      // ymm4 = B11[4-7][0] * alpha - ymm4
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b + 4));
+      // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = B11[0-3][1] * alpha - ymm5
+      ymm6 = _mm256_fmsub_pd(ymm1, ymm15, ymm6);
+      // ymm6 = B11[4-7][1] * alpha - ymm6
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2) + 4));
+      // B11[4][2] B11[5][2] B11[6][2] B11[7][2]
+
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // ymm7 = B11[0-3][2] * alpha - ymm7
+      ymm8 = _mm256_fmsub_pd(ymm1, ymm15, ymm8);
+      // ymm8 = B11[4-7][2] * alpha - ymm8
+
+      /// implement TRSM///
+
+      // d11_pack[2]: packed diagonal term applied to column 2
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm0);
+
+      // d11_pack[1]: loaded now, applied to column 1 after the updates below
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      // subtract the finished column 2 (ymm7/ymm8) from columns 1 and 0
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+      ymm6 = _mm256_fnmadd_pd(ymm1, ymm8, ymm6);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+      ymm4 = _mm256_fnmadd_pd(ymm1, ymm8, ymm4);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+      ymm6 = DTRSM_SMALL_DIV_OR_SCALE(ymm6, ymm0);
+
+      // d11_pack[0]: loaded now, applied to column 0 after the update below
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      // subtract the finished column 1 (ymm5/ymm6) from column 0
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+      ymm4 = _mm256_fnmadd_pd(ymm1, ymm6, ymm4);
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + cs_b + 4), ymm6);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2) + 4), ymm8);
+    }
+
+    dim_t m_remainder = i + d_mr;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L; // pointer to block of A to be used for TRSM
+      b10 = B + (m_remainder - 4) + 3*cs_b;        // pointer to block of B to be used in GEMM
+      b11 = B + (m_remainder - 4);   // pointer to block of B to be used for TRSM
+
+      k_iter = (n - 3); // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_3nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // B11[0-3][0] * alpha -= ymm0
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // B11[0-3][1] * alpha-= ymm2
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm7 = _mm256_fmsub_pd(ymm0, ymm15, ymm7);
+      // B11[0-3][2] * alpha -= ymm4
+
+      /// implement TRSM///
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+      ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(row 2):FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+      ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      //(Row 1): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + (cs_b * 2)), ymm7);
+
+      m_remainder -= 4;
+    }
+
+    if (m_remainder)
+    {
+      if (m_remainder == 3)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 3*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - 3); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_3nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_3N_3M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+        ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+        //(row 2):FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_3N_3M(b11, cs_b)
+
+        m_remainder -= 3;
+      }
+      else if (m_remainder == 2)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 3*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - 3); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_3nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_3N_2M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+        ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+        //(row 2):FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_3N_2M(b11, cs_b)
+
+        m_remainder -= 2;
+      }
+      else if (m_remainder == 1)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 3*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - 3); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_3nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_3N_1M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+
+        // extract a22
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+        ymm7 = DTRSM_SMALL_DIV_OR_SCALE(ymm7, ymm0);
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+        //(row 2):FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a) + (1 * rs_a)));
+        ymm5 = _mm256_fnmadd_pd(ymm1, ymm7, ymm5);
+
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (2 * cs_a)));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm7, ymm3);
+
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_3N_1M(b11, cs_b)
+
+        m_remainder -= 1;
+      }
+    }
+    n_remainder -= 3;
+  }
+else if ( n_remainder == 2)
+  {
+    a01 = L + 2*cs_a;     // pointer to block of A to be used in GEMM
+    a11 = L; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = (n - 2); // packed leading dimension
+    // perform copy of A to packed buffer D_A_pack
+
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1)
+      {
+        bli_dcopys(*(a01 + rs_a * 0), *(ptr_a10_dup + (p_lda * 0)));
+        bli_dcopys(*(a01 + rs_a * 1), *(ptr_a10_dup + (p_lda * 1)));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = (n - 2) / 4;
+
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 1) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (x * 4)), ymm15);
+      }
+
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 1) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 1) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1) + 1));
+      }
+      else
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + (rs_a * 1) + 1));
+      }
+      ymm2 = _mm256_broadcast_sd((double const *)&ones);
+      ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (i = (m - d_mr); (i + 1) > 0; i -= d_mr) // 2-column case: step through M from row m - d_mr downwards in strides of d_mr, stopping once i drops below 0
+    {
+      a01 = D_A_pack;
+      a11 = L; // top-left of L: the 2x2 block read by the TRSM step below
+      b10 = B + i + 2*cs_b;                // rows from i, columns from 2 onward; input to the GEMM macro
+      b11 = B + i;            // rows from i, columns 0..1; loaded, updated and stored below
+
+      k_iter = (n - 2); // iteration count handed to the GEMM macro below
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_N_REM
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_2nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + 4));
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = B11[0-3][0] * alpha - ymm3
+      ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4);
+      // ymm4 = B11[4-7][0] * alpha - ymm4
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b + 4));
+      // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // ymm5 = B11[0-3][1] * alpha - ymm5
+      ymm6 = _mm256_fmsub_pd(ymm1, ymm15, ymm6);
+      // ymm6 = B11[4-7][1] * alpha - ymm6
+
+      /// implement TRSM///
+
+      // d11_pack[1]: packed diagonal term applied to column 1
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+      ymm6 = DTRSM_SMALL_DIV_OR_SCALE(ymm6, ymm0);
+
+      // d11_pack[0]: loaded now, applied to column 0 after the update below
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      // subtract the finished column 1 (ymm5/ymm6) from column 0
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+      ymm4 = _mm256_fnmadd_pd(ymm1, ymm6, ymm4);
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + cs_b + 4), ymm6);
+    }
+
+    dim_t m_remainder = i + d_mr;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L; // pointer to block of A to be used for TRSM
+      b10 = B + (m_remainder - 4) + 2*cs_b;        // pointer to block of B to be used in GEMM
+      b11 = B + (m_remainder - 4);    // pointer to block of B to be used for TRSM
+
+      k_iter = (n - 2); // number of GEMM operations to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      ymm3 = _mm256_setzero_pd();
+      ymm5 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_2nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+      // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // B11[0-3][0] * alpha -= ymm0
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm5 = _mm256_fmsub_pd(ymm0, ymm15, ymm5);
+      // B11[0-3][1] * alpha-= ymm2
+
+      /// implement TRSM///
+
+      // extract a11
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+      ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      //(Row 1): FMA operations
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+      ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + cs_b), ymm5);
+
+      m_remainder -= 4;
+    }
+
+    if (m_remainder)
+    {
+      if (m_remainder == 3)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 2*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - 2); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        ymm3 = _mm256_setzero_pd();
+        ymm5 = _mm256_setzero_pd();
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_2nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_2N_3M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_2N_3M(b11, cs_b)
+
+        m_remainder -= 3;
+      }
+      else if (m_remainder == 2)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + (2)*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;     // pointer to block of B to be used for TRSM
+
+        k_iter = (n -  2); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        ymm3 = _mm256_setzero_pd();
+        ymm5 = _mm256_setzero_pd();
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_2nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_2N_2M(AlphaVal, b11, cs_b)
+        /// implement TRSM///
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_2N_2M(b11, cs_b)
+
+        m_remainder -= 2;
+      }
+      else if (m_remainder == 1)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 2*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - 2); // number of GEMM operations to be done(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        ymm3 = _mm256_setzero_pd();
+        ymm5 = _mm256_setzero_pd();
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_2nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_2N_1M(AlphaVal, b11, cs_b)
+        /// implement TRSM///
+
+        // extract a11
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+        ymm5 = DTRSM_SMALL_DIV_OR_SCALE(ymm5, ymm0);
+
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+        //(Row 1): FMA operations
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a));
+        ymm3 = _mm256_fnmadd_pd(ymm1, ymm5, ymm3);
+
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_2N_1M(b11, cs_b)
+
+        m_remainder -= 1;
+      }
+    }
+     n_remainder -= 2;
+  }
+  else if ( n_remainder == 1)
+  {
+    a01 = L + 1 * cs_a;     // pointer to block of A to be used in GEMM
+    a11 = L; // pointer to block of A to be used for TRSM
+
+    double *ptr_a10_dup = D_A_pack;
+
+    dim_t p_lda = (n - 1); // packed leading dimension
+    // perform copy of A to packed buffer D_A_pack
+
+    if (transa)
+    {
+      for (dim_t x = 0; x < p_lda; x += 1)
+      {
+        bli_dcopys(*(a01), *(ptr_a10_dup));
+        ptr_a10_dup += 1;
+        a01 += cs_a;
+      }
+    }
+    else
+    {
+      dim_t loop_count = (n - 1) / 4;
+      for (dim_t x = 0; x < loop_count; x++)
+      {
+        ymm15 = _mm256_loadu_pd((double const *)(a01 + (rs_a * 0) + (x * 4)));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (x * 4)), ymm15);
+      }
+
+      dim_t remainder_loop_count = p_lda - loop_count * 4;
+
+      __m128d xmm0;
+      if (remainder_loop_count != 0)
+      {
+        xmm0 = _mm_loadu_pd((double const *)(a01 + (rs_a * 0) + (loop_count * 4)));
+        _mm_storeu_pd((double *)(ptr_a10_dup + (p_lda * 0) + (loop_count * 4)), xmm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      // broadcast diagonal elements of A11
+      ymm0 = _mm256_broadcast_sd((double const *)(a11));
+      ymm1 = _mm256_broadcast_sd((double const *)&ones);
+      ymm2 = _mm256_broadcast_sd((double const *)&ones);
+      ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (i = (m - d_mr); (i + 1) > 0; i -= d_mr) // 1-column case: step through M from row m - d_mr downwards in strides of d_mr, stopping once i drops below 0
+    {
+      a01 = D_A_pack;
+      a11 = L; // assigned for symmetry with the wider cases; not read inside this loop body
+      b10 = B + i + 1*cs_b;                // rows from i, columns from 1 onward; input to the GEMM macro
+      b11 = B + i;            // rows from i, column 0; loaded, updated and stored below
+
+      k_iter = (n - 1); // iteration count handed to the GEMM macro below
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      ymm3 = _mm256_setzero_pd();
+      ymm4 = _mm256_setzero_pd();
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx8m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal);
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + 4));
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // ymm3 = B11[0-3][0] * alpha - ymm3
+      ymm4 = _mm256_fmsub_pd(ymm1, ymm15, ymm4);
+      // ymm4 = B11[4-7][0] * alpha - ymm4
+
+      /// implement TRSM///
+      // d11_pack[0]: packed diagonal term applied to the only column
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+      ymm4 = DTRSM_SMALL_DIV_OR_SCALE(ymm4, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4), ymm4);
+    }
+
+    dim_t m_remainder = i + d_mr;
+    if (m_remainder >= 4)
+    {
+      a01 = D_A_pack;
+      a11 = L; // pointer to block of A to be used for TRSM
+      b10 = B + (m_remainder - 4) + 1*cs_b;        // pointer to block of B to be used in GEMM
+      b11 = B + (m_remainder - 4);    // pointer to block of B to be used for TRSM
+
+      k_iter = (n - 1); // number of GEMM operations to be done(in blocks of 4x4)
+
+      ymm3 = _mm256_setzero_pd();
+
+      /// GEMM implementation starts///
+      BLIS_DTRSM_SMALL_GEMM_1nx4m(a01, b10, cs_b, p_lda, k_iter)
+
+      ymm15 = _mm256_broadcast_sd((double const *)&AlphaVal); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)b11);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm3 = _mm256_fmsub_pd(ymm0, ymm15, ymm3);
+      // B11[0-3][0] * alpha -= ymm0
+
+      /// implement TRSM///
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+      ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+      _mm256_storeu_pd((double *)b11, ymm3);
+
+      m_remainder -= 4;
+    }
+
+    if (m_remainder)
+    {
+      if (m_remainder == 3)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 1*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;   // pointer to block of B to be used for TRSM
+
+        k_iter = (n -  1); // number of GEMM operations to be done(in blocks of 4x4)
+
+        ymm3 = _mm256_setzero_pd();
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_1nx3m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_1N_3M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        ymm0 = _mm256_loadu_pd((double const *)b11);
+        ymm3 = _mm256_blend_pd(ymm6, ymm3, 0x07);
+
+        BLIS_POST_DTRSM_SMALL_1N_3M(b11, cs_b)
+
+        m_remainder -= 3;
+      }
+      else if (m_remainder == 2)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 1*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n -  1); // number of GEMM operations to be done(in blocks of 4x4)
+
+        ymm3 = _mm256_setzero_pd();
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_1nx2m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_1N_2M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_1N_2M(b11, cs_b)
+
+        m_remainder -= 2;
+      }
+      else if (m_remainder == 1)
+      {
+        a01 = D_A_pack;
+        a11 = L; // pointer to block of A to be used for TRSM
+        b10 = B + 1*cs_b;        // pointer to block of B to be used in GEMM
+        b11 = B;    // pointer to block of B to be used for TRSM
+
+        k_iter = (n - 1); // number of GEMM operations to be done(in blocks of 4x4)
+
+        ymm3 = _mm256_setzero_pd();
+
+        /// GEMM implementation starts///
+        BLIS_DTRSM_SMALL_GEMM_1nx1m(a01, b10, cs_b, p_lda, k_iter)
+
+        BLIS_PRE_DTRSM_SMALL_1N_1M(AlphaVal, b11, cs_b)
+
+        /// implement TRSM///
+        // extract a00
+        ymm0 = _mm256_broadcast_sd((double const *)(d11_pack));
+        ymm3 = DTRSM_SMALL_DIV_OR_SCALE(ymm3, ymm0);
+
+        BLIS_POST_DTRSM_SMALL_1N_1M(b11, cs_b)
+
+      }
+    }
+  }
+
+  if ((required_packing_A) && bli_mem_is_alloc(&local_mem_buf_A_s))
+  {
+    bli_membrk_release(&rntm,
+               &local_mem_buf_A_s);
+  }
+  return BLIS_SUCCESS;
+}
+
+// region - 8x8 transpose for left variants
+/*
+ * BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8(b11, cs_b, AlphaVal)
+ *
+ * Loads eight 8-double vectors from b11 + cs_b * j (j = 0..7), computes
+ * (loaded * AlphaVal) - zmm(9 + j) for each of them, and then transposes
+ * the resulting 8x8 tile with one unpack stage and two 128-bit shuffle stages.
+ *
+ * Reads   : zmm9..zmm16 (subtracted from the scaled loads).
+ * Writes  : zmm9..zmm16; zmm(9 + r) receives lane r of the eight scaled vectors.
+ * Clobbers: zmm0..zmm8 and zmm17..zmm22, all of which the caller must declare.
+ *
+ * Note: the (double const *) cast binds to b11 before the offset is added,
+ * so b11 should be passed as a simple pointer expression.
+ */
+#define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8(b11, cs_b, AlphaVal) \
+  zmm8 = _mm512_set1_pd(AlphaVal); \
+  zmm0 = _mm512_loadu_pd((double const *)b11 + (cs_b * 0)); \
+  zmm1 = _mm512_loadu_pd((double const *)b11 + (cs_b * 1)); \
+  zmm2 = _mm512_loadu_pd((double const *)b11 + (cs_b * 2)); \
+  zmm3 = _mm512_loadu_pd((double const *)b11 + (cs_b * 3)); \
+  zmm0 = _mm512_fmsub_pd(zmm0, zmm8, zmm9); \
+  zmm1 = _mm512_fmsub_pd(zmm1, zmm8, zmm10); \
+  zmm2 = _mm512_fmsub_pd(zmm2, zmm8, zmm11); \
+  zmm3 = _mm512_fmsub_pd(zmm3, zmm8, zmm12); \
+  \
+  zmm4 = _mm512_loadu_pd((double const *)b11 + (cs_b * 4)); \
+  zmm5 = _mm512_loadu_pd((double const *)b11 + (cs_b * 5)); \
+  zmm6 = _mm512_loadu_pd((double const *)b11 + (cs_b * 6)); \
+  zmm7 = _mm512_loadu_pd((double const *)b11 + (cs_b * 7)); \
+  zmm4 = _mm512_fmsub_pd(zmm4, zmm8, zmm13); \
+  zmm5 = _mm512_fmsub_pd(zmm5, zmm8, zmm14); \
+  zmm6 = _mm512_fmsub_pd(zmm6, zmm8, zmm15); \
+  zmm7 = _mm512_fmsub_pd(zmm7, zmm8, zmm16); \
+  /*Stage1: interleave the even lanes of neighbouring vectors*/ \
+  zmm17 = _mm512_unpacklo_pd(zmm0, zmm1); \
+  zmm18 = _mm512_unpacklo_pd(zmm2, zmm3); \
+  zmm19 = _mm512_unpacklo_pd(zmm4, zmm5); \
+  zmm20 = _mm512_unpacklo_pd(zmm6, zmm7); \
+  /*Stage2: gather 128-bit blocks 0 and 2 of each pair*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b10001000); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b10001000); \
+  /*Stage3  1,5 (1-based output rows: lanes 0 and 4)*/ \
+  zmm9 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm13 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101); \
+  /*Stage2: gather 128-bit blocks 1 and 3 of each pair*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b11011101); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b11011101); \
+  /*Stage3  3,7 (lanes 2 and 6)*/ \
+  zmm11 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm15 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101); \
+  /*Stage1: interleave the odd lanes of neighbouring vectors*/ \
+  zmm17 = _mm512_unpackhi_pd(zmm0, zmm1); \
+  zmm18 = _mm512_unpackhi_pd(zmm2, zmm3); \
+  zmm19 = _mm512_unpackhi_pd(zmm4, zmm5); \
+  zmm20 = _mm512_unpackhi_pd(zmm6, zmm7); \
+  /*Stage2*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b10001000); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b10001000); \
+  /*Stage3  2,6 (lanes 1 and 5)*/ \
+  zmm10 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm14 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101); \
+  /*Stage2*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b11011101); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b11011101); \
+  /*Stage3  4,8 (lanes 3 and 7)*/ \
+  zmm12 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm16 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101);
+
+/*
+ * BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8(b11, cs_b, AlphaVal)
+ *
+ * Loads eight 4-double vectors from b11 + cs_b * j (j = 0..7), computes
+ * (loaded * AlphaVal) - ymm(9 + j) for each of them, and transposes the
+ * result as two independent 4x4 tiles (columns 0..3 and columns 4..7).
+ *
+ * Reads   : ymm9..ymm16 (subtracted from the scaled loads).
+ * Writes  : ymm(9 + r)  = lane r of scaled columns 0..3 (r = 0..3),
+ *           ymm(13 + r) = lane r of scaled columns 4..7 (r = 0..3).
+ * Clobbers: ymm0..ymm3 and ymm8 (holds the broadcast alpha).
+ *
+ * AlphaVal must be an lvalue because its address is taken.
+ * The final line deliberately carries no trailing backslash, so the macro
+ * does not depend on the following source line being blank.
+ */
+#define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8(b11, cs_b, AlphaVal) \
+  ymm8 = _mm256_broadcast_sd((double const *)(&AlphaVal)); \
+  \
+  /* columns 0..3: load, scale by alpha, subtract the accumulators */ \
+  ymm0 = _mm256_loadu_pd((double const *)(b11)); \
+  ymm1 = _mm256_loadu_pd((double const *)(b11 + (cs_b *1))); \
+  ymm2 = _mm256_loadu_pd((double const *)(b11 + (cs_b *2))); \
+  ymm3 = _mm256_loadu_pd((double const *)(b11 + (cs_b *3))); \
+  ymm0 = _mm256_fmsub_pd(ymm0, ymm8, ymm9); \
+  ymm1 = _mm256_fmsub_pd(ymm1, ymm8, ymm10); \
+  ymm2 = _mm256_fmsub_pd(ymm2, ymm8, ymm11); \
+  ymm3 = _mm256_fmsub_pd(ymm3, ymm8, ymm12); \
+  \
+  /* 4x4 transpose of columns 0..3 into ymm9..ymm12 */ \
+  ymm10 = _mm256_unpacklo_pd(ymm0, ymm1); \
+  ymm12 = _mm256_unpacklo_pd(ymm2, ymm3); \
+  ymm9 = _mm256_permute2f128_pd(ymm10,ymm12,0x20); \
+  ymm11 = _mm256_permute2f128_pd(ymm10,ymm12,0x31); \
+  ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); \
+  ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); \
+  ymm10 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); \
+  ymm12 = _mm256_permute2f128_pd(ymm0,ymm1,0x31); \
+  \
+  /* columns 4..7: ymm8 still holds alpha, so it is not reloaded */ \
+  ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b *4))); \
+  ymm1 = _mm256_loadu_pd((double const *)(b11 + (cs_b *5))); \
+  ymm2 = _mm256_loadu_pd((double const *)(b11 + (cs_b *6))); \
+  ymm3 = _mm256_loadu_pd((double const *)(b11 + (cs_b *7))); \
+  ymm0 = _mm256_fmsub_pd(ymm0, ymm8, ymm13); \
+  ymm1 = _mm256_fmsub_pd(ymm1, ymm8, ymm14); \
+  ymm2 = _mm256_fmsub_pd(ymm2, ymm8, ymm15); \
+  ymm3 = _mm256_fmsub_pd(ymm3, ymm8, ymm16); \
+  \
+  /* 4x4 transpose of columns 4..7 into ymm13..ymm16 */ \
+  ymm14 = _mm256_unpacklo_pd(ymm0, ymm1); \
+  ymm16 = _mm256_unpacklo_pd(ymm2, ymm3); \
+  ymm13 = _mm256_permute2f128_pd(ymm14,ymm16,0x20); \
+  ymm15 = _mm256_permute2f128_pd(ymm14,ymm16,0x31); \
+  ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); \
+  ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); \
+  ymm14 = _mm256_permute2f128_pd(ymm0,ymm1,0x20); \
+  ymm16 = _mm256_permute2f128_pd(ymm0,ymm1,0x31);
+
+/*
+ * BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8_AND_STORE(b11, cs_b)
+ *
+ * Transposes two 4x4 tiles held in registers:
+ *   ymm0..ymm3 = lanes 0..3 of ymm9..ymm12,
+ *   ymm4..ymm7 = lanes 0..3 of ymm13..ymm16.
+ *
+ * Despite the name, this macro issues no store and does not reference b11
+ * or cs_b; writing ymm0..ymm7 back to memory is left to the caller.
+ *
+ * Clobbers: ymm9, ymm10, ymm13 and ymm14 (reused as unpack temporaries).
+ *
+ * The final line deliberately carries no trailing backslash, so the macro
+ * does not depend on the following source line being blank.
+ */
+#define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8_AND_STORE(b11, cs_b) \
+  /*unpack low*/\
+  ymm1 = _mm256_unpacklo_pd(ymm9, ymm10); \
+  ymm3 = _mm256_unpacklo_pd(ymm11, ymm12); \
+  \
+  /*rearrange low elements*/\
+  ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); \
+  ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); \
+  \
+  /*unpack high*/\
+  ymm9 = _mm256_unpackhi_pd(ymm9, ymm10); \
+  ymm10 = _mm256_unpackhi_pd(ymm11, ymm12); \
+  \
+  /*rearrange high elements*/\
+  ymm1 = _mm256_permute2f128_pd(ymm9, ymm10, 0x20); \
+  ymm3 = _mm256_permute2f128_pd(ymm9, ymm10, 0x31); \
+  \
+  /*unpack low*/\
+  ymm5 = _mm256_unpacklo_pd(ymm13, ymm14); \
+  ymm7 = _mm256_unpacklo_pd(ymm15, ymm16); \
+  \
+  /*rearrange low elements*/\
+  ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); \
+  ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); \
+  \
+  /*unpack high*/\
+  ymm13 = _mm256_unpackhi_pd(ymm13, ymm14); \
+  ymm14 = _mm256_unpackhi_pd(ymm15, ymm16); \
+  \
+  /*rearrange high elements*/\
+  ymm5 = _mm256_permute2f128_pd(ymm13, ymm14, 0x20); \
+  ymm7 = _mm256_permute2f128_pd(ymm13, ymm14, 0x31);
+
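+/*
+Register walkthrough for the 8x8 register transpose macro defined directly below.
+Notation: each row lists the eight 64-bit lanes of one register from the
+most-significant lane (left, index [0]) down to the least-significant lane
+(right, index [7]); e.g. zmm9[7] is the lane at the lowest memory address
+when the register is stored.
+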
+zmm9 [0] zmm9 [1] zmm9 [2] zmm9 [3] zmm9 [4] zmm9 [5] zmm9 [6] zmm9 [7]
+zmm10[0] zmm10[1] zmm10[2] zmm10[3] zmm10[4] zmm10[5] zmm10[6] zmm10[7]
+zmm11[0] zmm11[1] zmm11[2] zmm11[3] zmm11[4] zmm11[5] zmm11[6] zmm11[7]
+zmm12[0] zmm12[1] zmm12[2] zmm12[3] zmm12[4] zmm12[5] zmm12[6] zmm12[7]
+zmm13[0] zmm13[1] zmm13[2] zmm13[3] zmm13[4] zmm13[5] zmm13[6] zmm13[7]
+zmm14[0] zmm14[1] zmm14[2] zmm14[3] zmm14[4] zmm14[5] zmm14[6] zmm14[7]
+zmm15[0] zmm15[1] zmm15[2] zmm15[3] zmm15[4] zmm15[5] zmm15[6] zmm15[7]
+zmm16[0] zmm16[1] zmm16[2] zmm16[3] zmm16[4] zmm16[5] zmm16[6] zmm16[7]
+
+Stage1
+zmm17 = zmm10[1] zmm9 [1] zmm10[3] zmm9 [3] zmm10[5] zmm9 [5] zmm10[7] zmm9 [7]
+zmm18 = zmm12[1] zmm11[1] zmm12[3] zmm11[3] zmm12[5] zmm11[5] zmm12[7] zmm11[7]
+zmm19 = zmm14[1] zmm13[1] zmm14[3] zmm13[3] zmm14[5] zmm13[5] zmm14[7] zmm13[7]
+zmm20 = zmm16[1] zmm15[1] zmm16[3] zmm15[3] zmm16[5] zmm15[5] zmm16[7] zmm15[7]
+
+Stage2
+zmm21 = zmm12[3] zmm11[3] zmm12[7] zmm11[7] zmm10[3] zmm9 [3] zmm10[7] zmm9 [7]
+zmm22 = zmm16[3] zmm15[3] zmm16[7] zmm15[7] zmm14[3] zmm13[3] zmm14[7] zmm13[7]
+
+Stage3  1,5
+zmm0 = zmm16[7] zmm15[7] zmm14[7] zmm13[7] zmm12[7] zmm11[7] zmm10[7] zmm9 [7]
+zmm4 = zmm16[3] zmm15[3] zmm14[3] zmm13[3] zmm12[3] zmm11[3] zmm10[3] zmm9 [3]
+
+Stage2
+zmm21 = zmm12[1] zmm11[1] zmm12[5] zmm11[5] zmm10[1] zmm9 [1] zmm10[5] zmm9 [5]
+zmm22 = zmm16[1] zmm15[1] zmm16[5] zmm15[5] zmm14[1] zmm13[1] zmm14[5] zmm13[5]
+
+Stage3  3,7
+zmm2 = zmm16[5] zmm15[5] zmm14[5] zmm13[5] zmm12[5] zmm11[5] zmm10[5] zmm9 [5]
+zmm6 = zmm16[1] zmm15[1] zmm14[1] zmm13[1] zmm12[1] zmm11[1] zmm10[1] zmm9 [1]
+
+Stage1
+zmm17 = zmm10[0] zmm9 [0] zmm10[2] zmm9 [2] zmm10[4] zmm9 [4] zmm10[6] zmm9 [6]
+zmm18 = zmm12[0] zmm11[0] zmm12[2] zmm11[2] zmm12[4] zmm11[4] zmm12[6] zmm11[6]
+zmm19 = zmm14[0] zmm13[0] zmm14[2] zmm13[2] zmm14[4] zmm13[4] zmm14[6] zmm13[6]
+zmm20 = zmm16[0] zmm15[0] zmm16[2] zmm15[2] zmm16[4] zmm15[4] zmm16[6] zmm15[6]
+
+Stage2
+zmm21 = zmm12[2] zmm11[2] zmm12[6] zmm11[6] zmm10[2] zmm9 [2] zmm10[6] zmm9 [6]
+zmm22 = zmm16[2] zmm15[2] zmm16[6] zmm15[6] zmm14[2] zmm13[2] zmm14[6] zmm13[6]
+
+Stage3  2,6
+zmm1 = zmm16[6] zmm15[6] zmm14[6] zmm13[6] zmm12[6] zmm11[6] zmm10[6] zmm9 [6]
+zmm5 = zmm16[2] zmm15[2] zmm14[2] zmm13[2] zmm12[2] zmm11[2] zmm10[2] zmm9 [2]
+
+Stage2
+zmm21 = zmm12[0] zmm11[0] zmm12[4] zmm11[4] zmm10[0] zmm9 [0] zmm10[4] zmm9 [4]
+zmm22 = zmm16[0] zmm15[0] zmm16[4] zmm15[4] zmm14[0] zmm13[0] zmm14[4] zmm13[4]
+
+Stage3  4,8
+zmm3 = zmm16[4] zmm15[4] zmm14[4] zmm13[4] zmm12[4] zmm11[4] zmm10[4] zmm9 [4]
+zmm7 = zmm16[0] zmm15[0] zmm14[0] zmm13[0] zmm12[0] zmm11[0] zmm10[0] zmm9 [0]
+*/
+/*
+ * BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8_AND_STORE(b11, cs_b)
+ *
+ * Transposes the 8x8 tile held in zmm9..zmm16 into zmm0..zmm7, so that
+ * zmm(r) receives lane r of zmm9..zmm16 (r = 0..7).
+ *
+ * Despite the name, this macro issues no store and does not reference b11
+ * or cs_b; writing zmm0..zmm7 back to memory is left to the caller.
+ *
+ * Reads   : zmm9..zmm16 (left unmodified).
+ * Clobbers: zmm17..zmm22.
+ */
+#define BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8_AND_STORE(b11, cs_b) \
+  /*Stage1*/ \
+  zmm17 = _mm512_unpacklo_pd(zmm9, zmm10); \
+  zmm18 = _mm512_unpacklo_pd(zmm11, zmm12); \
+  zmm19 = _mm512_unpacklo_pd(zmm13, zmm14); \
+  zmm20 = _mm512_unpacklo_pd(zmm15, zmm16); \
+  /*Stage2*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b10001000); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b10001000); \
+  /*Stage3  1,5*/ \
+  zmm0 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm4 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101); \
+  /*Stage2*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b11011101); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b11011101); \
+  /*Stage3  3,7*/ \
+  zmm2 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm6 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101); \
+  /*Stage1*/ \
+  zmm17 = _mm512_unpackhi_pd(zmm9, zmm10); \
+  zmm18 = _mm512_unpackhi_pd(zmm11, zmm12); \
+  zmm19 = _mm512_unpackhi_pd(zmm13, zmm14); \
+  zmm20 = _mm512_unpackhi_pd(zmm15, zmm16); \
+  /*Stage2*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b10001000); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b10001000); \
+  /*Stage3  2,6*/ \
+  zmm1 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm5 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101); \
+  /*Stage2*/ \
+  zmm21 = _mm512_shuffle_f64x2(zmm17, zmm18, 0b11011101); \
+  zmm22 = _mm512_shuffle_f64x2(zmm19, zmm20, 0b11011101); \
+  /*Stage3  4,8*/ \
+  zmm3 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b10001000); \
+  zmm7 = _mm512_shuffle_f64x2(zmm21, zmm22, 0b11011101);
+
+// endregion - 8x8 transpose for left variants
+
+// region - GEMM DTRSM for left variants
+
+/*
+ * BLIS_DTRSM_SMALL_GEMM_8mx8n_AVX512(a10, b01, cs_b, p_lda, k_iter, b11)
+ *
+ * Accumulates k_iter rank-1 updates of an 8x8 tile. The k range is split in
+ * two so that two independent accumulator sets are used:
+ *   - the first k_iter / 2 steps read a10 / b01 and accumulate into
+ *     zmm9..zmm16;
+ *   - the remaining steps read a10_2 / b01_2 (the same pointers offset by
+ *     k_iter / 2 steps) and accumulate into zmm24..zmm31.
+ * The two sets are then summed into zmm9..zmm16 while the eight columns of
+ * b11 are prefetched.
+ *
+ * Reads/Writes: zmm9..zmm16 and zmm24..zmm31 are used as accumulators, so
+ *               the caller must initialise all sixteen before expansion.
+ * Clobbers    : zmm0..zmm8 and zmm17..zmm23.
+ * Side effects: a10 and b01 are advanced by k_iter / 2 steps only.
+ *
+ * The macro declares the locals itrCount, itr, itr2, b01_2 and a10_2 in the
+ * expanding scope, so it can be expanded at most once per scope. The
+ * counters are dim_t rather than int so that a dim_t k_iter is not narrowed.
+ */
+#define BLIS_DTRSM_SMALL_GEMM_8mx8n_AVX512(a10, b01, cs_b, p_lda, k_iter, b11) \
+  dim_t itrCount = (k_iter / 2); \
+  dim_t itr = itrCount; \
+  dim_t itr2 = k_iter - itrCount; \
+  double *b01_2 = b01 + itr; \
+  double *a10_2 = a10 + (p_lda * itr); \
+  /* first half of k: accumulate into zmm9..zmm16 */ \
+  for (; itr > 0; itr--) \
+  { \
+    zmm0 = _mm512_loadu_pd((double const *)a10); \
+    \
+    zmm1 = _mm512_set1_pd(*(b01 + cs_b * 0)); \
+    zmm2 = _mm512_set1_pd(*(b01 + cs_b * 1)); \
+    zmm3 = _mm512_set1_pd(*(b01 + cs_b * 2)); \
+    zmm4 = _mm512_set1_pd(*(b01 + cs_b * 3)); \
+    zmm5 = _mm512_set1_pd(*(b01 + cs_b * 4)); \
+    zmm6 = _mm512_set1_pd(*(b01 + cs_b * 5)); \
+    zmm7 = _mm512_set1_pd(*(b01 + cs_b * 6)); \
+    zmm8 = _mm512_set1_pd(*(b01 + cs_b * 7)); \
+    \
+    _mm_prefetch((b01 + 8), _MM_HINT_T0); \
+    zmm9 = _mm512_fmadd_pd(zmm1, zmm0, zmm9); \
+    zmm10 = _mm512_fmadd_pd(zmm2, zmm0, zmm10); \
+    zmm11 = _mm512_fmadd_pd(zmm3, zmm0, zmm11); \
+    zmm12 = _mm512_fmadd_pd(zmm4, zmm0, zmm12); \
+    zmm13 = _mm512_fmadd_pd(zmm5, zmm0, zmm13); \
+    zmm14 = _mm512_fmadd_pd(zmm6, zmm0, zmm14); \
+    zmm15 = _mm512_fmadd_pd(zmm7, zmm0, zmm15); \
+    zmm16 = _mm512_fmadd_pd(zmm8, zmm0, zmm16); \
+    \
+    b01 += 1; \
+    a10 += p_lda; \
+  } \
+  /* second half of k: accumulate into zmm24..zmm31 */ \
+  for (; itr2 > 0; itr2--) \
+  { \
+    zmm23 = _mm512_loadu_pd((double const *)a10_2); \
+    \
+    zmm17 = _mm512_set1_pd(*(b01_2 + cs_b * 0)); \
+    zmm18 = _mm512_set1_pd(*(b01_2 + cs_b * 1)); \
+    zmm19 = _mm512_set1_pd(*(b01_2 + cs_b * 2)); \
+    zmm20 = _mm512_set1_pd(*(b01_2 + cs_b * 3)); \
+    zmm21 = _mm512_set1_pd(*(b01_2 + cs_b * 4)); \
+    zmm22 = _mm512_set1_pd(*(b01_2 + cs_b * 5)); \
+    \
+    _mm_prefetch((b01_2 + 8), _MM_HINT_T0); \
+    zmm24 = _mm512_fmadd_pd(zmm17, zmm23, zmm24); \
+    /* zmm17 and zmm18 are reused for columns 6 and 7 once consumed */ \
+    zmm17 = _mm512_set1_pd(*(b01_2 + cs_b * 6)); \
+    zmm25 = _mm512_fmadd_pd(zmm18, zmm23, zmm25); \
+    zmm18 = _mm512_set1_pd(*(b01_2 + cs_b * 7)); \
+    zmm26 = _mm512_fmadd_pd(zmm19, zmm23, zmm26); \
+    zmm27 = _mm512_fmadd_pd(zmm20, zmm23, zmm27); \
+    zmm28 = _mm512_fmadd_pd(zmm21, zmm23, zmm28); \
+    zmm29 = _mm512_fmadd_pd(zmm22, zmm23, zmm29); \
+    zmm30 = _mm512_fmadd_pd(zmm17, zmm23, zmm30); \
+    zmm31 = _mm512_fmadd_pd(zmm18, zmm23, zmm31); \
+    \
+    b01_2 += 1; \
+    a10_2 += p_lda; \
+  } \
+  /* merge both accumulator sets while prefetching the b11 columns */ \
+  _mm_prefetch((b11 + (0) * cs_b), _MM_HINT_T0); \
+  zmm9 = _mm512_add_pd(zmm9, zmm24); \
+  _mm_prefetch((b11 + (1) * cs_b), _MM_HINT_T0); \
+  zmm10 = _mm512_add_pd(zmm10, zmm25); \
+  _mm_prefetch((b11 + (2) * cs_b), _MM_HINT_T0); \
+  zmm11 = _mm512_add_pd(zmm11, zmm26); \
+  _mm_prefetch((b11 + (3) * cs_b), _MM_HINT_T0); \
+  zmm12 = _mm512_add_pd(zmm12, zmm27); \
+  _mm_prefetch((b11 + (4) * cs_b), _MM_HINT_T0); \
+  zmm13 = _mm512_add_pd(zmm13, zmm28); \
+  _mm_prefetch((b11 + (5) * cs_b), _MM_HINT_T0); \
+  zmm14 = _mm512_add_pd(zmm14, zmm29); \
+  _mm_prefetch((b11 + (6) * cs_b), _MM_HINT_T0); \
+  zmm15 = _mm512_add_pd(zmm15, zmm30); \
+  _mm_prefetch((b11 + (7) * cs_b), _MM_HINT_T0); \
+  zmm16 = _mm512_add_pd(zmm16, zmm31);
+
+#define BLIS_DTRSM_SMALL_GEMM_8mx4n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    ymm1 = _mm256_loadu_pd((double const *)(a10 + 4)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01)); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 1))); \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 2))); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 3))); \
+    ymm11 = _mm256_fmadd_pd(ymm2, ymm0, ymm11); \
+    ymm15 = _mm256_fmadd_pd(ymm2, ymm1, ymm15); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_8mx3n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    ymm1 = _mm256_loadu_pd((double const *)(a10 + 4)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 1))); \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 2))); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm14 = _mm256_fmadd_pd(ymm2, ymm1, ymm14); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_8mx2n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    ymm1 = _mm256_loadu_pd((double const *)(a10 + 4)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 1))); \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    ymm13 = _mm256_fmadd_pd(ymm2, ymm1, ymm13); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_8mx1n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    ymm1 = _mm256_loadu_pd((double const *)(a10 + 4)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    ymm12 = _mm256_fmadd_pd(ymm2, ymm1, ymm12); \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+/*
+ * BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+ *
+ * Rank-1 update loop for a 4-row by 8-column tile. Each step loads four
+ * doubles from a10 and multiplies them by one scalar taken from each of the
+ * eight columns b01 + cs_b * j, accumulating into ymm(9 + j), j = 0..7.
+ *
+ * Uses the caller's loop counter k and clobbers ymm0..ymm8. b01 advances by
+ * one element and a10 by p_lda elements per step.
+ */
+#define BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    \
+    ymm1 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 0))); \
+    ymm2 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 1))); \
+    ymm3 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 2))); \
+    ymm4 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 3))); \
+    ymm5 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 4))); \
+    ymm6 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 5))); \
+    ymm7 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 6))); \
+    ymm8 = _mm256_broadcast_sd((double const*)(b01 + (cs_b * 7))); \
+    \
+    /* prefetch hint for the address four columns ahead of b01 */ \
+    _mm_prefetch((b01 + 4 * cs_b), _MM_HINT_T0); \
+    ymm9  = _mm256_fmadd_pd (ymm1, ymm0, ymm9); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    ymm11 = _mm256_fmadd_pd(ymm3, ymm0, ymm11); \
+    ymm12 = _mm256_fmadd_pd(ymm4, ymm0, ymm12); \
+    ymm13 = _mm256_fmadd_pd(ymm5, ymm0, ymm13); \
+    ymm14 = _mm256_fmadd_pd(ymm6, ymm0, ymm14); \
+    ymm15 = _mm256_fmadd_pd(ymm7, ymm0, ymm15); \
+    ymm16 = _mm256_fmadd_pd(ymm8, ymm0, ymm16); \
+    \
+    b01 += 1; \
+    a10 += p_lda; \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 1))); \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 2))); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 3))); \
+    ymm11 = _mm256_fmadd_pd(ymm2, ymm0, ymm11); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 1))); \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 2))); \
+    ymm10 = _mm256_fmadd_pd(ymm2, ymm0, ymm10); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 1))); \
+    ymm9 = _mm256_fmadd_pd(ymm2, ymm0, ymm9); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+#define BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter) \
+  for (k = 0; k < k_iter; k++) /*loop for number of GEMM operations*/ \
+  { \
+    ymm0 = _mm256_loadu_pd((double const *)(a10)); \
+    \
+    ymm2 = _mm256_broadcast_sd((double const *)(b01 + (cs_b * 0))); \
+    ymm8 = _mm256_fmadd_pd(ymm2, ymm0, ymm8); \
+    \
+    b01 += 1;   /*move to  next row of B*/ \
+    a10 += p_lda; /*pointer math to calculate next block of A for GEMM*/ \
+  }
+
+// endregion - GEMM DTRSM for left variants
+
+// region - pre/post DTRSM for left variants
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_3M_3N(AlphaVal, b11, cs_b)
+ *
+ * For each of the three columns b11 + cs_b * j (j = 0..2): reads three
+ * doubles without touching a fourth element, computes
+ * (column * AlphaVal) - ymm(8 + j), keeps the result in ymm(8 + j) and
+ * writes its first three lanes back to the same column.
+ *
+ * A column is assembled as { c[0], c[1], c[2], c[2] }: the third element is
+ * broadcast to all lanes and the low 128 bits are then replaced by a
+ * two-element load.
+ *
+ * Clobbers: xmm5, ymm0..ymm2 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_3M_3N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0)); \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 0) + 2)); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1)); \
+  ymm1 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 1) + 2)); \
+  ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2)); \
+  ymm2 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 2) + 2)); \
+  ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); \
+  ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); \
+  \
+  /* write back lanes 0-1 with a 128-bit store, lane 2 with a scalar store */ \
+  xmm5 = _mm256_castpd256_pd128(ymm8); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 0), xmm5); \
+  _mm_storel_pd((b11 + cs_b * 0 + 2), _mm256_extractf128_pd(ymm8, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm9); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 1), xmm5); \
+  _mm_storel_pd((b11 + cs_b * 1 + 2), _mm256_extractf128_pd(ymm9, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm10); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 2), xmm5); \
+  _mm_storel_pd((b11 + cs_b * 2 + 2), _mm256_extractf128_pd(ymm10, 1));
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_3M_2N(AlphaVal, b11, cs_b)
+ *
+ * For each of the two columns b11 + cs_b * j (j = 0..1): reads three
+ * doubles without touching a fourth element, computes
+ * (column * AlphaVal) - ymm(8 + j), keeps the result in ymm(8 + j) and
+ * writes its first three lanes back to the same column.
+ *
+ * A column is assembled as { c[0], c[1], c[2], c[2] }: the third element is
+ * broadcast to all lanes and the low 128 bits are then replaced by a
+ * two-element load.
+ *
+ * Clobbers: xmm5, ymm0, ymm1 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_3M_2N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0)); \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 0) + 2)); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1)); \
+  ymm1 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 1) + 2)); \
+  ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); \
+  \
+  /* write back lanes 0-1 with a 128-bit store, lane 2 with a scalar store */ \
+  xmm5 = _mm256_castpd256_pd128(ymm8); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 0), xmm5); \
+  _mm_storel_pd((b11 + cs_b * 0 + 2), _mm256_extractf128_pd(ymm8, 1)); \
+  xmm5 = _mm256_castpd256_pd128(ymm9); \
+  _mm_storeu_pd((double *)(b11 + cs_b * 1), xmm5); \
+  _mm_storel_pd((b11 + cs_b * 1 + 2), _mm256_extractf128_pd(ymm9, 1));
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_3M_1N(AlphaVal, b11, cs_b)
+ *
+ * Reads three doubles from b11 without touching a fourth element, computes
+ * (column * AlphaVal) - ymm8, keeps the result in ymm8 and writes its first
+ * three lanes back to b11.
+ *
+ * The column is assembled as { b11[0], b11[1], b11[2], b11[2] }: the third
+ * element is broadcast to all lanes and the low 128 bits are then replaced
+ * by a two-element load.
+ *
+ * Clobbers: xmm5, ymm0 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_3M_1N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0)); \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 0) + 2)); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  \
+  /* write back lanes 0-1 with a 128-bit store, lane 2 with a scalar store */ \
+  xmm5 = _mm256_castpd256_pd128(ymm8); \
+  _mm_storeu_pd((double *)(b11), xmm5); \
+  _mm_storel_pd((b11 + 2), _mm256_extractf128_pd(ymm8, 1));
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_2M_3N(AlphaVal, b11, cs_b)
+ *
+ * For each of the three columns b11 + cs_b * j (j = 0..2): loads two
+ * doubles, computes (column * AlphaVal) - ymm(8 + j), keeps the result in
+ * ymm(8 + j) and writes its two low lanes back to the same column.
+ *
+ * Only the low 128 bits of ymm0..ymm2 are replaced by the loads; their
+ * upper lanes keep whatever the registers held before, so ymm0..ymm2 must
+ * already be initialised and the upper two lanes of ymm8..ymm10 are not
+ * meaningful after this macro.
+ *
+ * Clobbers: xmm5, ymm0..ymm2 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_2M_3N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 0))); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 1))); \
+  ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 2))); \
+  ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); \
+  ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); \
+  \
+  /* write back only the two low lanes of each result */ \
+  xmm5 = _mm256_castpd256_pd128(ymm8); \
+  _mm_storeu_pd((double *)(b11 + (cs_b * 0)), xmm5); \
+  xmm5 = _mm256_castpd256_pd128(ymm9); \
+  _mm_storeu_pd((double *)(b11 + (cs_b * 1)), xmm5); \
+  xmm5 = _mm256_castpd256_pd128(ymm10); \
+  _mm_storeu_pd((double *)(b11 + (cs_b * 2)), xmm5);
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_2M_2N(AlphaVal, b11, cs_b)
+ *
+ * For each of the two columns b11 + cs_b * j (j = 0..1): loads two doubles,
+ * computes (column * AlphaVal) - ymm(8 + j), keeps the result in ymm(8 + j)
+ * and writes its two low lanes back to the same column.
+ *
+ * Only the low 128 bits of ymm0 and ymm1 are replaced by the loads; their
+ * upper lanes keep whatever the registers held before, so ymm0 and ymm1
+ * must already be initialised and the upper two lanes of ymm8 and ymm9 are
+ * not meaningful after this macro.
+ *
+ * Clobbers: xmm5, ymm0, ymm1 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_2M_2N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 0))); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 1))); \
+  ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); \
+  \
+  /* write back only the two low lanes of each result */ \
+  xmm5 = _mm256_castpd256_pd128(ymm8); \
+  _mm_storeu_pd((double *)(b11 + (cs_b * 0)), xmm5); \
+  xmm5 = _mm256_castpd256_pd128(ymm9); \
+  _mm_storeu_pd((double *)(b11 + (cs_b * 1)), xmm5);
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_2M_1N(AlphaVal, b11, cs_b)
+ *
+ * Loads two doubles from b11, computes (column * AlphaVal) - ymm8, keeps
+ * the result in ymm8 and writes its two low lanes back to b11.
+ *
+ * Only the low 128 bits of ymm0 are replaced by the load; its upper lanes
+ * keep whatever the register held before, so ymm0 must already be
+ * initialised and the upper two lanes of ymm8 are not meaningful after this
+ * macro.
+ *
+ * Clobbers: xmm5, ymm0 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_2M_1N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /*register to hold alpha*/ \
+  \
+  xmm5 = _mm_loadu_pd((double const *)(b11 + (cs_b * 0))); \
+  ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  \
+  /* write back only the two low lanes of the result */ \
+  xmm5 = _mm256_castpd256_pd128(ymm8); \
+  _mm_storeu_pd((double *)(b11 + (cs_b * 0)), xmm5);
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_1M_3N(AlphaVal, b11, cs_b)
+ *
+ * Single-row variant for three columns: broadcasts the one double at
+ * b11 + cs_b * j (j = 0..2), computes (value * AlphaVal) - ymm(8 + j),
+ * keeps the result in ymm(8 + j) and writes its lowest lane back to the
+ * same address.
+ *
+ * Clobbers: ymm0..ymm2 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_1M_3N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /* alpha in every lane */ \
+  \
+  /* one element per column, replicated across the register */ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11)); \
+  ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b)); \
+  ymm2 = _mm256_broadcast_sd((double const *)(b11 + (cs_b * 2))); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); \
+  ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); \
+  \
+  /* only the lowest lane of each result is written back */ \
+  _mm_storel_pd((double *)(b11), _mm256_castpd256_pd128(ymm8)); \
+  _mm_storel_pd((double *)(b11 + cs_b), _mm256_castpd256_pd128(ymm9)); \
+  _mm_storel_pd((double *)(b11 + (cs_b * 2)), _mm256_castpd256_pd128(ymm10));
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_1M_2N(AlphaVal, b11, cs_b)
+ *
+ * Single-row variant for two columns: broadcasts the one double at b11 and
+ * at b11 + cs_b, computes (value * AlphaVal) - ymm8 / ymm9, keeps the
+ * results in ymm8 / ymm9 and writes the lowest lane of each back to the
+ * same address.
+ *
+ * Clobbers: ymm0, ymm1 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_1M_2N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /* alpha in every lane */ \
+  \
+  /* one element per column, replicated across the register */ \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11)); \
+  ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b)); \
+  \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); \
+  \
+  /* only the lowest lane of each result is written back */ \
+  _mm_storel_pd((double *)(b11), _mm256_castpd256_pd128(ymm8)); \
+  _mm_storel_pd((double *)(b11 + cs_b), _mm256_castpd256_pd128(ymm9));
+
+/*
+ * BLIS_PRE_DTRSM_SMALL_1M_1N(AlphaVal, b11, cs_b)
+ *
+ * Single-element variant: broadcasts the double at b11, computes
+ * (value * AlphaVal) - ymm8, keeps the result in ymm8 and writes its lowest
+ * lane back to b11. cs_b only appears multiplied by zero.
+ *
+ * Clobbers: ymm0 and ymm16 (holds the broadcast alpha).
+ * AlphaVal must be an lvalue because its address is taken.
+ */
+#define BLIS_PRE_DTRSM_SMALL_1M_1N(AlphaVal, b11, cs_b) \
+  ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); /* alpha in every lane */ \
+  \
+  ymm0 = _mm256_broadcast_sd((double const *)(b11)); \
+  ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); \
+  \
+  /* only the lowest lane of the result is written back */ \
+  _mm_storel_pd((double *)(b11), _mm256_castpd256_pd128(ymm8));
+
+// LLNN - LUTN
+BLIS_INLINE err_t bli_dtrsm_small_AutXB_AlXB_AVX512
+     (
+       obj_t*   AlphaObj,
+       obj_t*   a,
+       obj_t*   b,
+       cntx_t*  cntx,
+       cntl_t*  cntl
+     )
+{
+
+  dim_t m = bli_obj_length(b); //number of rows
+  dim_t n = bli_obj_width(b); // number of columns
+  bool transa = bli_obj_has_trans(a);
+  dim_t cs_a, rs_a;
+  dim_t d_mr = 8, d_nr = 8;
+
+  // Swap rs_a & cs_a in the non-transpose case.
+  if (transa)
+  {
+    cs_a = bli_obj_col_stride(a); // column stride of A
+    rs_a = bli_obj_row_stride(a); // row stride of A
+  }
+  else
+  {
+    cs_a = bli_obj_row_stride(a); // row stride of A
+    rs_a = bli_obj_col_stride(a); // column stride of A
+  }
+  dim_t cs_b = bli_obj_col_stride(b); // column stride of B
+
+  dim_t i, j, k;
+  dim_t k_iter;
+
+  double AlphaVal = *(double *)AlphaObj->buffer;
+  double *L = bli_obj_buffer_at_off(a); // pointer to matrix A
+  double *B = bli_obj_buffer_at_off(b); // pointer to matrix B
+
+  double *a10, *a11, *b01, *b11; // pointers for GEMM and TRSM blocks
+
+
+  double ones = 1.0;
+
+  const gint_t required_packing_A = 1;
+  mem_t local_mem_buf_A_s = {0};
+  double *D_A_pack = NULL; // pointer to A01 pack buffer
+  double d11_pack[d_mr] __attribute__((aligned(64))); // buffer for diagonal A pack
+  rntm_t rntm;
+
+  bli_rntm_init_from_global(&rntm);
+  bli_rntm_set_num_threads_only(1, &rntm);
+  bli_membrk_rntm_set_membrk(&rntm);
+
+  siz_t buffer_size = bli_pool_block_size(
+    bli_membrk_pool(
+      bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK),
+      bli_rntm_membrk(&rntm)));
+
+  if ((d_mr * m * sizeof(double)) > buffer_size)
+    return BLIS_NOT_YET_IMPLEMENTED;
+
+  if (required_packing_A == 1)
+  {
+    // Get the buffer from the pool.
+    bli_membrk_acquire_m(&rntm,
+               buffer_size,
+               BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+               &local_mem_buf_A_s);
+    if (FALSE == bli_mem_is_alloc(&local_mem_buf_A_s))
+      return BLIS_NULL_POINTER;
+    D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+    if (NULL == D_A_pack)
+      return BLIS_NULL_POINTER;
+  }
+  bool is_unitdiag = bli_obj_has_unit_diag(a);
+  __m512d zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11;
+  __m512d zmm12, zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, zmm21;
+  __m512d zmm22, zmm23, zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31;
+  __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11;
+  __m256d ymm12, ymm13, ymm14, ymm15, ymm16;
+  __m128d xmm5;
+  xmm5 = _mm_setzero_pd();
+
+  //gcc12 throws a unitialized warning,
+  //To avoid that these variable are set to zero.
+  ymm0 = _mm256_setzero_pd();
+  ymm1 = _mm256_setzero_pd();
+  ymm2 = _mm256_setzero_pd();
+  ymm3 = _mm256_setzero_pd();
+  ymm4 = _mm256_setzero_pd();
+  ymm5 = _mm256_setzero_pd();
+  ymm6 = _mm256_setzero_pd();
+  ymm7 = _mm256_setzero_pd();
+
+  /*
+        Performs solving TRSM for 8 columns at a time from 0 to m/8 in steps of d_mr
+        a. Load, transpose, Pack A (a10 block), the size of packing 8x6 to 8x (m-8)
+           First there will be no GEMM and no packing of a10 because it is only TRSM
+        b. Using packed a10 block and b01 block perform GEMM operation
+        c. Use GEMM outputs, perform TRSM operaton using a11, b11 and update B
+        d. Repeat b,c for n rows of B in steps of d_nr
+    */
+  for (i = 0; (i + d_mr - 1) < m; i += d_mr)
+  {
+    a10 = L + (i * cs_a);
+    a11 = L + (i * rs_a) + (i * cs_a);
+
+    dim_t p_lda = d_mr;
+
+    if (transa)
+    {
+     /*
+      Pack current A block (a10) into packed buffer memory D_A_pack
+      a. This a10 block is used in GEMM portion only and this
+          a10 block size will be increasing by d_mr for every next iteration
+          until it reaches 8x(m-8) which is the maximum GEMM alone block size in A
+      b. This packed buffer is reused to calculate all n rows of B matrix
+    */
+      bli_dtrsm_small_pack_avx512('L', i, 1, a10, cs_a, D_A_pack, p_lda, d_mr);
+       /*
+        Pack 8 diagonal elements of A block into an array
+        a. This helps to utilize cache line efficiently in TRSM operation
+        b. store ones when input is unit diagonal
+      */
+      dtrsm_small_pack_diag_element_avx512(is_unitdiag, a11, cs_a, d11_pack, d_mr);
+    }
+    else
+    {
+      bli_dtrsm_small_pack_avx512('L', i, 0, a10, rs_a, D_A_pack, p_lda, d_mr);
+      dtrsm_small_pack_diag_element_avx512(is_unitdiag, a11, rs_a, d11_pack, d_mr);
+    }
+
+    /*
+      a. Perform GEMM using a10, b01.
+      b. Perform TRSM on a11, b11
+      c. This loop GEMM+TRSM loops operates with 8x8 block size
+          along n dimension for every d_nr columns of B01 where
+          packed A buffer is reused in computing all m cols of B.
+      d. Same approach is used in remaining fringe cases.
+    */
+
+    for (j = 0; j < n - d_nr + 1; j += d_nr)
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * rs_a) + (i * cs_a);    //pointer to block of A to be used for TRSM
+      b01 = B + j * cs_b;                   //pointer to block of B to be used in GEMM
+      b11 = B + i + j * cs_b;               //pointer to block of B to be used for TRSM
+      k_iter = i;
+
+      BLIS_SET_ZMM_REG_ZEROS
+      /*
+        Perform GEMM between a10 and b01 blocks
+        For first iteration there will be no GEMM operation
+        where k_iter are zero
+      */
+      BLIS_DTRSM_SMALL_GEMM_8mx8n_AVX512(a10, b01, cs_b, p_lda, k_iter, b11)
+      /*
+        Load b11 of size 8x8 and multiply with alpha
+        Add the GEMM output and perform in register transpose of b11
+        to perform TRSM operation.
+      */
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8(b11, cs_b, AlphaVal)
+
+      /*
+        Compute 8x8 TRSM block by using GEMM block output in register
+        a. The 8x8 input (gemm outputs) are stored in combinations of zmm registers
+            row      :   0     1    2      3     4     5     6     7
+            register : zmm9  zmm10 zmm11 zmm12 zmm13 zmm14 zmm15 zmm16
+        b. Towards the end TRSM output will be stored back into b11
+      */
+      // extract a00
+      zmm0 = _mm512_set1_pd(*(d11_pack + 0));
+      zmm9 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm9, zmm0);
+
+      // extract a11
+      zmm1  = _mm512_set1_pd(*(d11_pack + 1));
+      zmm2  = _mm512_set1_pd(*(a11 + (1 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm2, zmm9, zmm10);
+      zmm3  = _mm512_set1_pd(*(a11 + (2 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm3, zmm9, zmm11);
+      zmm4  = _mm512_set1_pd(*(a11 + (3 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm4, zmm9, zmm12);
+      zmm5  = _mm512_set1_pd(*(a11 + (4 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm5, zmm9, zmm13);
+      zmm6  = _mm512_set1_pd(*(a11 + (5 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm6, zmm9, zmm14);
+      zmm7  = _mm512_set1_pd(*(a11 + (6 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm7, zmm9, zmm15);
+      zmm8  = _mm512_set1_pd(*(a11 + (7 * cs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm8, zmm9, zmm16);
+      zmm10 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm10, zmm1);
+      a11 += rs_a;
+
+      // extract a22
+      zmm0  = _mm512_set1_pd(*(d11_pack + 2));
+      zmm2  = _mm512_set1_pd(*(a11 + (2 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm2, zmm10, zmm11);
+      zmm3  = _mm512_set1_pd(*(a11 + (3 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm3, zmm10, zmm12);
+      zmm4  = _mm512_set1_pd(*(a11 + (4 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm4, zmm10, zmm13);
+      zmm5  = _mm512_set1_pd(*(a11 + (5 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm5, zmm10, zmm14);
+      zmm6  = _mm512_set1_pd(*(a11 + (6 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm6, zmm10, zmm15);
+      zmm7  = _mm512_set1_pd(*(a11 + (7 * cs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm7, zmm10, zmm16);
+      zmm11 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm11, zmm0);
+      a11 += rs_a;
+
+      // extract a33
+      zmm1  = _mm512_set1_pd(*(d11_pack + 3));
+      zmm2  = _mm512_set1_pd(*(a11 + (3 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm2, zmm11, zmm12);
+      zmm3  = _mm512_set1_pd(*(a11 + (4 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm3, zmm11, zmm13);
+      zmm4  = _mm512_set1_pd(*(a11 + (5 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm4, zmm11, zmm14);
+      zmm5  = _mm512_set1_pd(*(a11 + (6 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm5, zmm11, zmm15);
+      zmm6  = _mm512_set1_pd(*(a11 + (7 * cs_a)));
+      zmm16  = _mm512_fnmadd_pd(zmm6, zmm11, zmm16);
+      zmm12 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm12, zmm1);
+      a11 += rs_a;
+
+      // extract a44
+      zmm0 = _mm512_set1_pd(*(d11_pack + 4));
+      zmm2  = _mm512_set1_pd(*(a11 + (4 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm2, zmm12, zmm13);
+      zmm3  = _mm512_set1_pd(*(a11 + (5 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm3, zmm12, zmm14);
+      zmm4  = _mm512_set1_pd(*(a11 + (6 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm4, zmm12, zmm15);
+      zmm5  = _mm512_set1_pd(*(a11 + (7 * cs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm5, zmm12, zmm16);
+      zmm13 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm13, zmm0);
+      a11 += rs_a;
+
+      // extract a55
+      zmm1  = _mm512_set1_pd(*(d11_pack + 5));
+      zmm2  = _mm512_set1_pd(*(a11 + (5 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm2, zmm13, zmm14);
+      zmm3  = _mm512_set1_pd(*(a11 + (6 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm3, zmm13, zmm15);
+      zmm4  = _mm512_set1_pd(*(a11 + (7 * cs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm4, zmm13, zmm16);
+      zmm14 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm14, zmm1);
+      a11 += rs_a;
+
+      // extract a66
+      zmm0 = _mm512_set1_pd(*(d11_pack + 6));
+      zmm2 = _mm512_set1_pd(*(a11 + (6 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm2, zmm14, zmm15);
+      zmm3 = _mm512_set1_pd(*(a11 + (7 * cs_a)));
+      zmm16 = _mm512_fnmadd_pd(zmm3, zmm14, zmm16);
+      zmm15 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm15, zmm0);
+      a11 += rs_a;
+
+      // extract a77
+      zmm1 = _mm512_set1_pd(*(d11_pack + 7));
+      zmm2 = _mm512_set1_pd(*(a11 + 7 * cs_a));
+      zmm16 = _mm512_fnmadd_pd(zmm2, zmm15, zmm16);
+      zmm16 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm16, zmm1);
+
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8_AND_STORE(b11, cs_b)
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 0)), zmm0);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 1)), zmm1);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 2)), zmm2);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 3)), zmm3);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 4)), zmm4);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 5)), zmm5);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 6)), zmm6);
+      _mm512_storeu_pd((double *)(b11 + (cs_b * 7)), zmm7);
+    }
+    dim_t n_rem = n - j;
+    if (n_rem >= 4)
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+      b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+      b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+      k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+      /// GEMM code begins///
+      BLIS_DTRSM_SMALL_GEMM_8mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+      ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 0)));
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 1)));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm2 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2)));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm3 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 3)));
+      // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+      ymm4 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 0) + 4));
+      // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+      ymm5 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 1) + 4));
+      // B11[0][5] B11[1][5] B11[2][5] B11[3][5]
+      ymm6 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 2) + 4));
+      // B11[0][6] B11[1][6] B11[2][6] B11[3][6]
+      ymm7 = _mm256_loadu_pd((double const *)(b11 + (cs_b * 3) + 4));
+      // B11[0][7] B11[1][7] B11[2][7] B11[3][7]
+
+      ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);  // B11[0-3][0] * alpha -= B01[0-3][0]
+      ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);  // B11[0-3][1] * alpha -= B01[0-3][1]
+      ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); // B11[0-3][2] * alpha -= B01[0-3][2]
+      ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11); // B11[0-3][3] * alpha -= B01[0-3][3]
+      ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+      ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); // B11[0-3][5] * alpha -= B01[0-3][5]
+      ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); // B11[0-3][6] * alpha -= B01[0-3][6]
+      ymm7 = _mm256_fmsub_pd(ymm7, ymm16, ymm15); // B11[0-3][7] * alpha -= B01[0-3][7]
+
+      /// implement TRSM///
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); // B11[0][4] B11[0][5] B11[2][4] B11[2][5]
+      ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); // B11[0][6] B11[0][7] B11[2][6] B11[2][7]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ymm12 = _mm256_permute2f128_pd(ymm13, ymm15, 0x20); // B11[4][0] B11[4][1] B11[4][2] B11[4][3]
+      ymm14 = _mm256_permute2f128_pd(ymm13, ymm15, 0x31); // B11[6][0] B11[6][1] B11[6][2] B11[6][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); // B11[1][4] B11[1][5] B11[3][4] B11[3][5]
+      ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); // B11[1][6] B11[1][7] B11[3][6] B11[3][7]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+      
+      ymm13 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20); // B11[5][0] B11[5][1] B11[5][2] B11[5][3]
+      ymm15 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31); // B11[7][0] B11[7][1] B11[7][2] B11[7][3]
+
+      ymm0 = _mm256_broadcast_sd((double const *)&ones);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm2  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1)));
+      ymm3  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2)));
+      ymm4  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3)));
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      //(ROw1): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9);
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm8, ymm11);
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm8, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm8, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm8, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm8, ymm15);
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+
+      ymm3  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2)));
+      ymm4  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3)));
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(ROw2): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm9, ymm11);
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm9, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm9, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm9, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm9, ymm15);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+
+      ymm4  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3)));
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(ROw5): FMA operations
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm10, ymm11);
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm10, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm10, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm10, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm10, ymm15);
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+
+      ymm0 = _mm256_broadcast_sd((double const *)&ones);
+
+      // extract a44
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 4));
+
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      //(ROw4): FMA operations
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm11, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm11, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm11, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm11, ymm15);
+
+      // perform mul operation
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a55
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 5));
+
+      //(ROw5): FMA operations
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm12, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm12, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm12, ymm15);
+
+      // perform mul operation
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a66
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 6));
+
+      //(ROw6): FMA operations
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm13, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm13, ymm15);
+
+      // perform mul operation
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+      // extract a77
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 7));
+
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 7));
+
+      a11 += rs_a;
+      //(ROw7): FMA operations
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm14, ymm15);
+
+      // perform mul operation
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);
+      // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11);
+      // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      ymm5 = _mm256_unpacklo_pd(ymm12, ymm13);
+      // B11[4][0] B11[5][0] B11[4][2] B11[5][2]
+      ymm7 = _mm256_unpacklo_pd(ymm14, ymm15);
+      // B11[6][0] B11[7][0] B11[6][2] B11[7][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20);
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31);
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20);
+      // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+      ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31);
+      // B11[4][2] B11[5][2] B11[6][2] B11[7][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);
+      // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11);
+      // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      ymm12 = _mm256_unpackhi_pd(ymm12, ymm13);
+      // B11[4][1] B11[5][1] B11[4][3] B11[5][3]
+      ymm13 = _mm256_unpackhi_pd(ymm14, ymm15);
+      // B11[6][1] B11[7][1] B11[6][3] B11[7][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20);
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31);
+      // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20);
+      // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+      ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31);
+      // B11[4][3] B11[5][3] B11[6][3] B11[7][3]
+
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);   // store B11[1][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2);   // store B11[2][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3);   // store B11[3][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1 + 4), ymm5); // store B11[5][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2 + 4), ymm6); // store B11[6][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3 + 4), ymm7); // store B11[7][0-3]
+
+      n_rem -= 4;
+      j += 4;
+    }
+
+    if (n_rem)
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+      b01 = B + j * cs_b;        // pointer to block of B to be used for GEMM
+      b11 = B + i + j * cs_b;      // pointer to block of B to be used for TRSM
+
+      k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+      if (3 == n_rem)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_8mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); 
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); 
+        // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+        ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); 
+
+        // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+        ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4));
+        // B11[0][5] B11[1][5] B11[2][5] B11[3][5]
+        ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1 + 4));
+        // B11[0][6] B11[1][6] B11[2][6] B11[3][6]
+        ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2 + 4));
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);  
+        // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);  
+        // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); 
+        // B11[0-3][2] * alpha -= B01[0-3][2]
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+
+        ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); 
+        // B11[0-3][4] * alpha -= B01[0-3][4]
+        ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); 
+        // B11[0-3][5] * alpha -= B01[0-3][5]
+        ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); 
+        // B11[0-3][6] * alpha -= B01[0-3][6]
+        ymm7 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (2 == n_rem)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_8mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); 
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+
+        // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+        ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4));
+        // B11[0][5] B11[1][5] B11[2][5] B11[3][5]
+        ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1 + 4));
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+
+        ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+        ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); // B11[0-3][5] * alpha -= B01[0-3][5]
+        ymm6 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm7 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (1 == n_rem)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_8mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); 
+
+        // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+        ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4)); 
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+
+        ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+        ymm5 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm6 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm7 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      /// implement TRSM///
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); // B11[0][4] B11[0][5] B11[2][4] B11[2][5]
+      ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); // B11[0][6] B11[0][7] B11[2][6] B11[2][7]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ymm12 = _mm256_permute2f128_pd(ymm13, ymm15, 0x20); // B11[4][0] B11[4][1] B11[4][2] B11[4][3]
+      ymm14 = _mm256_permute2f128_pd(ymm13, ymm15, 0x31); // B11[6][0] B11[6][1] B11[6][2] B11[6][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); // B11[1][4] B11[1][5] B11[3][4] B11[3][5]
+      ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); // B11[1][6] B11[1][7] B11[3][6] B11[3][7]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      ymm13 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20); // B11[5][0] B11[5][1] B11[5][2] B11[5][3]
+      ymm15 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31); // B11[7][0] B11[7][1] B11[7][2] B11[7][3]
+
+      ymm0 = _mm256_broadcast_sd((double const *)&ones);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm2  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 1)));
+      ymm3  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2)));
+      ymm4  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3)));
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      //(ROw1): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9);
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm8, ymm11);
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm8, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm8, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm8, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm8, ymm15);
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+
+      ymm3  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 2)));
+      ymm4  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3)));
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(ROw2): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm9, ymm11);
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm9, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm9, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm9, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm9, ymm15);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+
+      ymm4  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 3)));
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(ROw5): FMA operations
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm10, ymm11);
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm10, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm10, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm10, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm10, ymm15);
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+
+      ymm0 = _mm256_broadcast_sd((double const *)&ones);
+
+      // extract a44
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 4));
+
+      ymm5  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 4)));
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      //(ROw4): FMA operations
+      ymm12 = _mm256_fnmadd_pd(ymm5, ymm11, ymm12);
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm11, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm11, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm11, ymm15);
+
+      // perform mul operation
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+
+      ymm6  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 5)));
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 7));
+
+      a11 += rs_a;
+
+      // extract a55
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 5));
+
+      //(ROw5): FMA operations
+      ymm13 = _mm256_fnmadd_pd(ymm6, ymm12, ymm13);
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm12, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm12, ymm15);
+
+      // perform mul operation
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+
+      ymm7  = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 6)));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + (cs_a * 7)));
+
+      a11 += rs_a;
+
+      // extract a66
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 6));
+
+      //(ROw6): FMA operations
+      ymm14 = _mm256_fnmadd_pd(ymm7, ymm13, ymm14);
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm13, ymm15);
+
+      // perform mul operation
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+      // extract a77
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 7));
+
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 7));
+
+      a11 += rs_a;
+      //(ROw7): FMA operations
+      ymm15 = _mm256_fnmadd_pd(ymm16, ymm14, ymm15);
+
+      // perform mul operation
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); // B11[4][0] B11[5][0] B11[4][2] B11[5][2]
+      ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); // B11[6][0] B11[7][0] B11[6][2] B11[7][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+      ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); // B11[4][2] B11[5][2] B11[6][2] B11[7][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); // B11[4][1] B11[5][1] B11[4][3] B11[5][3]
+      ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); // B11[6][1] B11[7][1] B11[6][3] B11[7][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20); // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+      ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); // B11[4][3] B11[5][3] B11[6][3] B11[7][3]
+
+      if (3 == n_rem)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);   // store B11[1][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2);   // store B11[2][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1 + 4), ymm5); // store B11[5][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 2 + 4), ymm6); // store B11[6][0-3]
+      }
+      else if (2 == n_rem)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);   // store B11[1][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1 + 4), ymm5); // store B11[5][0-3]
+      }
+      else if (1 == n_rem)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+      }
+    }
+  }
+
+  //======================M remainder cases================================
+  dim_t m_rem = m - i;
+  
+  if (m_rem >= 4) // implementation for remainder rows(when 'M' is not a multiple of d_mr)
+  {
+    a10 = L + (i * cs_a); // pointer to block of A to be used for GEMM
+    a11 = L + (i * rs_a) + (i * cs_a);
+    double *ptr_a10_dup = D_A_pack;
+    dim_t p_lda = 4; // packed leading dimension
+
+    if (transa)
+    {
+      for (dim_t x = 0; x < i; x += p_lda)
+      {
+        ymm0 = _mm256_loadu_pd((double const *)(a10));
+        ymm1 = _mm256_loadu_pd((double const *)(a10 + cs_a));
+        ymm2 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2));
+        ymm3 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3));
+
+        ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+        ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+        ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+        ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+        a10 += p_lda;
+        ptr_a10_dup += p_lda * p_lda;
+      }
+    }
+    else
+    {
+      for (dim_t x = 0; x < i; x++)
+      {
+        ymm0 = _mm256_loadu_pd((double const *)(a10 + rs_a * x));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * x), ymm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      if (transa)
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 1 + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 2 + 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3 + 3));
+      }
+      else
+      {
+        // broadcast diagonal elements of A11
+        ymm0 = _mm256_broadcast_sd((double const *)(a11));
+        ymm1 = _mm256_broadcast_sd((double const *)(a11 + rs_a * 1 + 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(a11 + rs_a * 2 + 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(a11 + rs_a * 3 + 3));
+      }
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    for (j = 0; (j + d_nr - 1) < n; j += d_nr) // loop along 'N' dimension
+    {
+      a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+      a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+      b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+      b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+      k_iter = i; // number of times GEMM operation to be done(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+      /// GEMM code begins///
+      BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter);
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8(b11, cs_b, AlphaVal);
+
+      // extract a00
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 0));
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm0);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 1 * cs_a));
+      ymm10 = _mm256_fnmadd_pd(ymm2, ymm9, ymm10);
+      ymm14 = _mm256_fnmadd_pd(ymm2, ymm13, ymm14);
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a));
+      ymm11 = _mm256_fnmadd_pd(ymm3, ymm9, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm3, ymm13, ymm15);
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a));
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm9, ymm12);
+      ymm16 = _mm256_fnmadd_pd(ymm4, ymm13, ymm16);
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+      a11 += rs_a;
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a));
+      ymm11 = _mm256_fnmadd_pd(ymm2, ymm10, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm2, ymm14, ymm15);
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a));
+      ymm12 = _mm256_fnmadd_pd(ymm3, ymm10, ymm12);
+      ymm16 = _mm256_fnmadd_pd(ymm3, ymm14, ymm16);
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm0);
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm0);
+      a11 += rs_a;
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a));
+      ymm12 = _mm256_fnmadd_pd(ymm2, ymm11, ymm12);
+      ymm16 = _mm256_fnmadd_pd(ymm2, ymm15, ymm16);
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+      ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm1);
+      
+      a11 += rs_a;
+
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8_AND_STORE(b11, cs_b)
+
+      _mm256_storeu_pd((double *)(b11 + 0 * cs_b), ymm0);
+      _mm256_storeu_pd((double *)(b11 + 1 * cs_b), ymm1);
+      _mm256_storeu_pd((double *)(b11 + 2 * cs_b), ymm2);
+      _mm256_storeu_pd((double *)(b11 + 3 * cs_b), ymm3);
+      _mm256_storeu_pd((double *)(b11 + 4 * cs_b), ymm4);
+      _mm256_storeu_pd((double *)(b11 + 5 * cs_b), ymm5);
+      _mm256_storeu_pd((double *)(b11 + 6 * cs_b), ymm6);
+      _mm256_storeu_pd((double *)(b11 + 7 * cs_b), ymm7);
+    }
+
+    dim_t n_rem = n - j;
+    if (n_rem >= 4)
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+      b01 = B + j * cs_b;        // pointer to block of B to be used for GEMM
+      b11 = B + i + j * cs_b;      // pointer to block of B to be used for TRSM
+
+      k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+      
+      BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+      ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      /// implement TRSM///
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0));
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+      ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2));
+      ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3));
+      ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+      ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+      ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+      ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      ymm0 = _mm256_broadcast_sd((double const *)&ones);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 1));
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 2));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3));
+
+      a11 += rs_a;
+
+      //(ROw1): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9);
+      ymm13 = _mm256_fnmadd_pd(ymm2,ymm12,ymm13);
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10);
+      ymm14 = _mm256_fnmadd_pd(ymm3,ymm12,ymm14);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm8, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15);
+
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 2));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3));
+
+      a11 += rs_a;
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(ROw2): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10);
+      ymm14 = _mm256_fnmadd_pd(ymm3, ymm13, ymm14);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm9, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm4, ymm13, ymm15);
+
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3));
+
+      a11 += rs_a;
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(ROw3): FMA operations
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm10, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm4, ymm14, ymm15);
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); // store B11[1][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); // store B11[2][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); // store B11[3][0-3]
+
+      n_rem -= 4;
+      j += 4;
+    }
+    if (n_rem)
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+      b01 = B + j * cs_b;        // pointer to block of B to be used for GEMM
+      b11 = B + i + j * cs_b;      // pointer to block of B to be used for TRSM
+
+      k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+      ymm8 = _mm256_setzero_pd();
+      ymm9 = _mm256_setzero_pd();
+      ymm10 = _mm256_setzero_pd();
+
+      if (3 == n_rem)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0));
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2));
+        // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);  // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);  // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); // B11[0-3][2] * alpha -= B01[0-3][2]
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (2 == n_rem)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0));
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (1 == n_rem)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      ymm0 = _mm256_broadcast_sd((double const *)&ones);
+
+      ////extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 1));
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 2));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3));
+
+      a11 += rs_a;
+
+      //(ROw1): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm2, ymm8, ymm9);
+      ymm13 = _mm256_fnmadd_pd(ymm2, ymm12, ymm13);
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm8, ymm10);
+      ymm14 = _mm256_fnmadd_pd(ymm3, ymm12, ymm14);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm8, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm4, ymm12, ymm15);
+      
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 2));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3));
+
+      a11 += rs_a;
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(ROw2): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm3, ymm9, ymm10);
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm9, ymm11);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + cs_a * 3));
+
+      a11 += rs_a;
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      //(ROw3): FMA operations
+      ymm11 = _mm256_fnmadd_pd(ymm4, ymm10, ymm11);
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      if (3 == n_rem)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); // store B11[1][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); // store B11[2][0-3]
+      }
+      else if (2 == n_rem)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); // store B11[1][0-3]
+      }
+      else if (1 == n_rem)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+      }
+    }
+    m_rem -= 4;
+    i += 4;
+  }
+
+  if (m_rem)
+  {
+    a10 = L + (i * cs_a); // pointer to block of A to be used for GEMM
+    // Do transpose for a10 & store in D_A_pack
+    double *ptr_a10_dup = D_A_pack;
+    if (3 == m_rem) // Repetitive A blocks will be 3*3
+    {
+      dim_t p_lda = 4; // packed leading dimension
+      if (transa)
+      {
+        for (dim_t x = 0; x < i; x += p_lda)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10));
+          ymm1 = _mm256_loadu_pd((double const *)(a10 + cs_a));
+          ymm2 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2));
+          ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+          ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+          ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+          ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+          ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+          ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+          ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+          ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+          ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+          _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+          a10 += p_lda;
+          ptr_a10_dup += p_lda * p_lda;
+        }
+      }
+      else
+      {
+        for (dim_t x = 0; x < i; x++)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10 + rs_a * x));
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * x), ymm0);
+        }
+      }
+
+      // cols
+      for (j = 0; (j + d_nr - 1) < n; j += d_nr) // loop along 'N' dimension
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm0 = _mm256_broadcast_sd((double const *)(&AlphaVal));
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 0 + 2));
+        ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1 + 2));
+        ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2 + 2));
+        ymm3 = _mm256_insertf128_pd(ymm3, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm4 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 3 + 2));
+        ymm4 = _mm256_insertf128_pd(ymm4, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 4));
+        ymm5 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 4 + 2));
+        ymm5 = _mm256_insertf128_pd(ymm5, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 5));
+        ymm6 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 5 + 2));
+        ymm6 = _mm256_insertf128_pd(ymm6, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 6));
+        ymm7 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 6 + 2));
+        ymm7 = _mm256_insertf128_pd(ymm7, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 7));
+        ymm8 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 7 + 2));
+        ymm8 = _mm256_insertf128_pd(ymm8, xmm5, 0);
+
+        ymm9 =  _mm256_fmsub_pd(ymm1, ymm0, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm0, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm0, ymm11);
+        ymm12 = _mm256_fmsub_pd(ymm4, ymm0, ymm12);
+        ymm13 = _mm256_fmsub_pd(ymm5, ymm0, ymm13);
+        ymm14 = _mm256_fmsub_pd(ymm6, ymm0, ymm14);
+        ymm15 = _mm256_fmsub_pd(ymm7, ymm0, ymm15);
+        ymm16 = _mm256_fmsub_pd(ymm8, ymm0, ymm16);
+
+        _mm_storeu_pd((double *)(b11 + cs_b * 0), _mm256_extractf128_pd(ymm9, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_extractf128_pd(ymm10, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_extractf128_pd(ymm11, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_extractf128_pd(ymm12, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 4), _mm256_extractf128_pd(ymm13, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 5), _mm256_extractf128_pd(ymm14, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 6), _mm256_extractf128_pd(ymm15, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 7), _mm256_extractf128_pd(ymm16, 0));
+
+        _mm_storel_pd((double *)(b11 + cs_b * 0 + 2), _mm256_extractf128_pd(ymm9, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 1 + 2), _mm256_extractf128_pd(ymm10, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 2 + 2), _mm256_extractf128_pd(ymm11, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 3 + 2), _mm256_extractf128_pd(ymm12, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 4 + 2), _mm256_extractf128_pd(ymm13, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 5 + 2), _mm256_extractf128_pd(ymm14, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 6 + 2), _mm256_extractf128_pd(ymm15, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 7 + 2), _mm256_extractf128_pd(ymm16, 1));
+
+        if (transa)
+          dtrsm_AutXB_ref(a11, b11, m_rem, 8, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AlXB_ref(a11, b11, m_rem, 8, rs_a, cs_b, is_unitdiag);
+      }
+
+      dim_t n_rem = n - j;
+      if ((n_rem >= 4))
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        /// implement TRSM///
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 0 + 2));
+        ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1 + 2));
+        ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm2 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2 + 2));
+        ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm3 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 3 + 2));
+        ymm3 = _mm256_insertf128_pd(ymm3, xmm5, 0);
+
+        ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+        _mm_storeu_pd((double *)(b11), _mm256_castpd256_pd128(ymm8));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_castpd256_pd128(ymm9));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_castpd256_pd128(ymm10));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_castpd256_pd128(ymm11));
+
+        _mm_storel_pd((double *)(b11 + 2), _mm256_extractf128_pd(ymm8, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 1 + 2), _mm256_extractf128_pd(ymm9, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 2 + 2), _mm256_extractf128_pd(ymm10, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 3 + 2), _mm256_extractf128_pd(ymm11, 1));
+
+        if (transa)
+          dtrsm_AutXB_ref(a11, b11, m_rem, 4, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AlXB_ref(a11, b11, m_rem, 4, rs_a, cs_b, is_unitdiag);
+        n_rem -= 4;
+        j += 4;
+      }
+
+      if (n_rem)
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        if (3 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_3M_3N(AlphaVal, b11, cs_b)
+
+          if (transa)
+              dtrsm_AutXB_ref(a11, b11, m_rem, 3, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 3, rs_a, cs_b, is_unitdiag);
+        }
+        else if (2 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_3M_2N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 2, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 2, rs_a, cs_b, is_unitdiag);
+        }
+        else if (1 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_3M_1N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 1, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 1, rs_a, cs_b, is_unitdiag);
+        }
+      }
+    }
+    else if (2 == m_rem) // Repetative A blocks will be 2*2
+    {
+      dim_t p_lda = 4; // packed leading dimension
+      if (transa)
+      {
+        for (dim_t x = 0; x < i; x += p_lda)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10));
+          ymm1 = _mm256_loadu_pd((double const *)(a10 + cs_a));
+          ymm2 = _mm256_broadcast_sd((double const *)&ones);
+          ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+          ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+          ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+          ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+          ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+          ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+          ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+          ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+          ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+          _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+          a10 += p_lda;
+          ptr_a10_dup += p_lda * p_lda;
+        }
+      }
+      else
+      {
+        for (dim_t x = 0; x < i; x++)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10 + rs_a * x));
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * x), ymm0);
+        }
+      }
+      // cols
+      for (j = 0; (j + d_nr - 1) < n; j += d_nr) // loop along 'N' dimension
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+        ymm0 = _mm256_broadcast_sd((double const *)(&AlphaVal));
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm3 = _mm256_insertf128_pd(ymm3, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm4 = _mm256_insertf128_pd(ymm4, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 4));
+        ymm5 = _mm256_insertf128_pd(ymm5, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 5));
+        ymm6 = _mm256_insertf128_pd(ymm6, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 6));
+        ymm7 = _mm256_insertf128_pd(ymm7, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 7));
+        ymm8 = _mm256_insertf128_pd(ymm8, xmm5, 0);
+
+        ymm9 =  _mm256_fmsub_pd(ymm1, ymm0, ymm9);
+        ymm10 =  _mm256_fmsub_pd(ymm2, ymm0, ymm10);
+        ymm11 =  _mm256_fmsub_pd(ymm3, ymm0, ymm11);
+        ymm12 =  _mm256_fmsub_pd(ymm4, ymm0, ymm12);
+        ymm13 =  _mm256_fmsub_pd(ymm5, ymm0, ymm13);
+        ymm14 =  _mm256_fmsub_pd(ymm6, ymm0, ymm14);
+        ymm15 =  _mm256_fmsub_pd(ymm7, ymm0, ymm15);
+        ymm16 =  _mm256_fmsub_pd(ymm8, ymm0, ymm16);
+
+        _mm_storeu_pd((double *)(b11 + cs_b * 0), _mm256_extractf128_pd(ymm9, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_extractf128_pd(ymm10, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_extractf128_pd(ymm11, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_extractf128_pd(ymm12, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 4), _mm256_extractf128_pd(ymm13, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 5), _mm256_extractf128_pd(ymm14, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 6), _mm256_extractf128_pd(ymm15, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 7), _mm256_extractf128_pd(ymm16, 0));
+ 
+        
+        if (transa)
+          dtrsm_AutXB_ref(a11, b11, m_rem, 8, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AlXB_ref(a11, b11, m_rem, 8, rs_a, cs_b, is_unitdiag);
+      }
+
+      dim_t n_rem = n - j;
+      if ((n_rem >= 4))
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+;
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        /// implement TRSM///
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm3 = _mm256_insertf128_pd(ymm3, xmm5, 0);
+
+        ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+        _mm_storeu_pd((double *)(b11), _mm256_castpd256_pd128(ymm8));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_castpd256_pd128(ymm9));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_castpd256_pd128(ymm10));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_castpd256_pd128(ymm11));
+
+        if (transa)
+          dtrsm_AutXB_ref(a11, b11, m_rem, 4, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AlXB_ref(a11, b11, m_rem, 4, rs_a, cs_b, is_unitdiag);
+        n_rem -= 4;
+        j += 4;
+      }
+      if (n_rem)
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        if (3 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_2M_3N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 3, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 3, rs_a, cs_b, is_unitdiag);
+        }
+        else if (2 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_2M_2N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 2, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 2, rs_a, cs_b, is_unitdiag);
+        }
+        else if (1 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_2M_1N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 1, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 1, rs_a, cs_b, is_unitdiag);
+        }
+      }
+      m_rem -= 2;
+      i += 2;
+    }
+    else if (1 == m_rem) // Repetative A blocks will be 1*1
+    {
+      dim_t p_lda = 4; // packed leading dimension
+      if (transa)
+      {
+        for (dim_t x = 0; x < i; x += p_lda)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10));
+          ymm1 = _mm256_broadcast_sd((double const *)&ones);
+          ymm2 = _mm256_broadcast_sd((double const *)&ones);
+          ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+          ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+          ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+          ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+          ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+          ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+          ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+          ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+          ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+          _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+          a10 += p_lda;
+          ptr_a10_dup += p_lda * p_lda;
+        }
+      }
+      else
+      {
+        for (dim_t x = 0; x < i; x++)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10 + rs_a * x));
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * x), ymm0);
+        }
+      }
+      // cols
+      for (j = 0; (j + d_nr - 1) < n; j += d_nr) // loop along 'N' dimension
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter);
+
+        /// GEMM code ends///
+        ymm0 = _mm256_broadcast_sd((double const*)(&AlphaVal));
+        ymm1 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 0));
+        ymm2 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 1));
+        ymm3 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 2));
+        ymm4 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 3));
+        ymm5 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 4));
+        ymm6 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 5));
+        ymm7 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 6));
+        ymm8 = _mm256_broadcast_sd((double const*)(b11 + cs_b * 7));
+
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm0, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm0, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm0, ymm11);
+        ymm12 = _mm256_fmsub_pd(ymm4, ymm0, ymm12);
+        ymm13 = _mm256_fmsub_pd(ymm5, ymm0, ymm13);
+        ymm14 = _mm256_fmsub_pd(ymm6, ymm0, ymm14);
+        ymm15 = _mm256_fmsub_pd(ymm7, ymm0, ymm15);
+        ymm16 = _mm256_fmsub_pd(ymm8, ymm0, ymm16);
+
+        _mm_storel_pd((double *)(b11 + cs_b * 0), _mm256_extractf128_pd(ymm9, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 1), _mm256_extractf128_pd(ymm10, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 2), _mm256_extractf128_pd(ymm11, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 3), _mm256_extractf128_pd(ymm12, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 4), _mm256_extractf128_pd(ymm13, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 5), _mm256_extractf128_pd(ymm14, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 6), _mm256_extractf128_pd(ymm15, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 7), _mm256_extractf128_pd(ymm16, 0));
+
+        if (transa)
+          dtrsm_AutXB_ref(a11, b11, m_rem, 8, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AlXB_ref(a11, b11, m_rem, 8, rs_a, cs_b, is_unitdiag);
+      }
+      dim_t n_rem = n - j;
+      if ((n_rem >= 4))
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        /// implement TRSM///
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 0));
+        ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 3));
+
+        ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+        _mm_storel_pd((b11 + cs_b * 0), _mm256_extractf128_pd(ymm8, 0));
+        _mm_storel_pd((b11 + cs_b * 1), _mm256_extractf128_pd(ymm9, 0));
+        _mm_storel_pd((b11 + cs_b * 2), _mm256_extractf128_pd(ymm10, 0));
+        _mm_storel_pd((b11 + cs_b * 3), _mm256_extractf128_pd(ymm11, 0));
+
+        if (transa)
+          dtrsm_AutXB_ref(a11, b11, m_rem, 4, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AlXB_ref(a11, b11, m_rem, 4, rs_a, cs_b, is_unitdiag);
+        n_rem -= 4;
+        j += 4;
+      }
+
+      if (n_rem)
+      {
+        a10 = D_A_pack;          // pointer to block of A to be used for GEMM
+        a11 = L + (i * rs_a) + (i * cs_a); // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b);        // pointer to block of B to be used for GEMM
+        b11 = B + i + (j * cs_b);      // pointer to block of B to be used for TRSM
+
+        k_iter = i; // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        if (3 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_1M_3N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 3, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 3, rs_a, cs_b, is_unitdiag);
+        }
+        else if (2 == n_rem)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_1M_2N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 2, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AlXB_ref(a11, b11, m_rem, 2, rs_a, cs_b, is_unitdiag);
+        }
+        else if (1 == n_rem)
+        {
+          // GEMM step for the single leftover column of B (4m x 1n macro).
+          BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+          // Pre-TRSM step on the 1x1 tile at b11; the macro receives alpha,
+          // b11 and cs_b (see its definition for the exact load/store pattern).
+          BLIS_PRE_DTRSM_SMALL_1M_1N(AlphaVal, b11, cs_b)
+
+          // Choose the reference solver from the transpose flag, matching every
+          // sibling remainder branch: AutXB with cs_a when transa is set,
+          // AlXB with rs_a otherwise. The non-transpose path previously called
+          // dtrsm_AutXB_ref, which was inconsistent with the other branches.
+          if (transa)
+            dtrsm_AutXB_ref(a11, b11, m_rem, 1, cs_a, cs_b, is_unitdiag);
+          else
+            dtrsm_AlXB_ref(a11, b11, m_rem, 1, rs_a, cs_b, is_unitdiag);
+        }
+      }
+      m_rem -= 1;
+      i += 1;
+    }
+  }
+
+  if ((required_packing_A == 1) &&
+    bli_mem_is_alloc(&local_mem_buf_A_s))
+  {
+    bli_membrk_release(&rntm, &local_mem_buf_A_s);
+  }
+  return BLIS_SUCCESS;
+}
+
+
+// LUNN LUTN
+BLIS_INLINE err_t bli_dtrsm_small_AltXB_AuXB_AVX512
+     (
+       obj_t*   AlphaObj,
+       obj_t*   a,
+       obj_t*   b,
+       cntx_t*  cntx,
+       cntl_t*  cntl
+     )
+{
+  dim_t m = bli_obj_length(b); //number of rows
+  dim_t n = bli_obj_width(b); // number of columns
+  bool transa = bli_obj_has_trans(a);
+  dim_t cs_a, rs_a;
+  dim_t d_mr = 8, d_nr = 8;
+
+  // Select the strides of A. In the non-transpose case rs_a and cs_a are
+  // deliberately swapped: cs_a receives A's row stride and rs_a receives
+  // A's column stride.
+  if (transa)
+  {
+    cs_a = bli_obj_col_stride(a); // column stride of A
+    rs_a = bli_obj_row_stride(a); // row stride of A
+  }
+  else
+  {
+    cs_a = bli_obj_row_stride(a); // row stride of A (stored in cs_a: swapped)
+    rs_a = bli_obj_col_stride(a); // column stride of A (stored in rs_a: swapped)
+  }
+  dim_t cs_b = bli_obj_col_stride(b);  // column stride of B
+  dim_t i, j, k;
+  dim_t k_iter;
+  double AlphaVal = *(double *)AlphaObj->buffer;
+  double *L = bli_obj_buffer_at_off(a); // pointer to matrix A
+  double *B = bli_obj_buffer_at_off(b); // pointer to matrix B
+
+  double *a10, *a11, *b01, *b11; // pointers for GEMM and TRSM blocks
+
+  double ones = 1.0;
+
+  gint_t required_packing_A = 1;
+  mem_t local_mem_buf_A_s = {0};
+  double *D_A_pack = NULL; // pointer to A01 pack buffer
+  double d11_pack[d_mr] __attribute__((aligned(64))); // buffer for diagonal A pack
+  rntm_t rntm;
+
+  bli_rntm_init_from_global(&rntm);
+  bli_rntm_set_num_threads_only(1, &rntm);
+  bli_membrk_rntm_set_membrk(&rntm);
+
+  siz_t buffer_size = bli_pool_block_size(
+    bli_membrk_pool(
+      bli_packbuf_index(BLIS_BITVAL_BUFFER_FOR_A_BLOCK),
+      bli_rntm_membrk(&rntm)));
+
+  if ((d_mr * m * sizeof(double)) > buffer_size)
+    return BLIS_NOT_YET_IMPLEMENTED;
+
+  if (required_packing_A == 1)
+  {
+    // Get the buffer from the pool.
+    bli_membrk_acquire_m(&rntm,
+               buffer_size,
+               BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+               &local_mem_buf_A_s);
+    if (FALSE == bli_mem_is_alloc(&local_mem_buf_A_s))
+      return BLIS_NULL_POINTER;
+    D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+    if (NULL == D_A_pack)
+      return BLIS_NULL_POINTER;
+  }
+  bool is_unitdiag = bli_obj_has_unit_diag(a);
+
+  __m512d zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11;
+  __m512d zmm12, zmm13, zmm14, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, zmm21;
+  __m512d zmm22, zmm23, zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31;
+  __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11;
+  __m256d ymm12, ymm13, ymm14, ymm15, ymm16;
+  __m128d xmm5;
+
+  // gcc12 emits an uninitialized-variable warning for these registers,
+  // so they are explicitly set to zero to avoid it.
+  ymm0 = _mm256_setzero_pd();
+  ymm1 = _mm256_setzero_pd();
+  ymm2 = _mm256_setzero_pd();
+  ymm3 = _mm256_setzero_pd();
+  ymm4 = _mm256_setzero_pd();
+  ymm5 = _mm256_setzero_pd();
+  ymm6 = _mm256_setzero_pd();
+  ymm7 = _mm256_setzero_pd();
+
+  /*
+        Solves TRSM for d_mr (8) rows of B at a time, with i stepping from
+        (m - d_mr) down to 0 in steps of d_mr
+        a. Load, transpose, pack A (a10 block); the packed size grows from 8x8 to 8x(m-d_mr).
+           In the first iteration there is no GEMM and no packing of a10 because it is only TRSM
+        b. Using the packed a10 block and the b01 block, perform the GEMM operation
+        c. Using the GEMM outputs, perform the TRSM operation with a11 and b11, and update B
+        d. Repeat b,c across the n columns of B in steps of d_nr
+  */
+
+  for (i = (m - d_mr); (i + 1) > 0; i -= d_mr)
+  {
+    a10 = L + (i * cs_a) + (i + d_mr) * rs_a;
+    a11 = L + (i * cs_a) + (i * rs_a);
+
+    dim_t p_lda = d_mr;
+    /*
+      Load, transpose and pack the current A block (a10) into the packed buffer D_A_pack
+      a. This a10 block is used in the GEMM portion only, and its
+        size increases by d_mr with every iteration
+        until it reaches 8x(m-8), which is the maximum GEMM-only block size in A
+      b. This packed buffer is reused to calculate all n columns of the B matrix
+    */
+    bli_dtrsm_small_pack_avx512('L', (m - i - d_mr), transa, a10, bli_obj_col_stride(a) , D_A_pack, p_lda, d_mr);
+
+    /*
+    Pack the 8 diagonal elements of the A block into an array
+    a. This helps utilize the cache line efficiently in the TRSM operation
+    b. Store ones when the input is unit diagonal
+    */
+    dtrsm_small_pack_diag_element_avx512(is_unitdiag, a11, bli_obj_col_stride(a), d11_pack, d_mr);
+    
+    for (j = (n - d_nr); (j + 1) > 0; j -= d_nr)
+    {
+      a10 = D_A_pack;
+      b01 = B + (j * cs_b) + i + d_mr;      //pointer to block of B to be used for GEMM
+      b11 = B + (j * cs_b) + i;             //pointer to block of B to be used for TRSM
+
+      k_iter = (m - i - d_mr);
+
+      /* Fill zeros into the zmm registers used in gemm accumulations */
+      BLIS_SET_ZMM_REG_ZEROS
+
+      /*
+                Perform GEMM between a10 and b01 blocks
+                For first iteration there will be no GEMM operation
+                where k_iter are zero
+      */
+      BLIS_DTRSM_SMALL_GEMM_8mx8n_AVX512(a10, b01, cs_b, p_lda, k_iter, b11)
+
+      /*
+      Load b11 of size 8x8 and multiply with alpha
+      Add the GEMM output and perform in register transpose of b11
+      to perform TRSM operation.
+      */
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8(b11, cs_b, AlphaVal)
+
+      // extract a77
+      zmm0 = _mm512_set1_pd(*(d11_pack + 7));
+      zmm16 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm16, zmm0);
+
+      // extract a66
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (6 * cs_a)));
+      zmm1  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (5 * cs_a)));
+      zmm15 = _mm512_fnmadd_pd(zmm0, zmm16, zmm15);
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (4 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm1, zmm16, zmm14);
+      zmm1  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (3 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm0, zmm16, zmm13);
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (2 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm16, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (1 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm16, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (7 * rs_a) + (0 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm16, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 6));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm16, zmm9);
+      zmm15 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm15, zmm1);
+
+      // extract a55
+      zmm1  = _mm512_set1_pd(*(a11 + (6 * rs_a) + (5 * cs_a)));
+      zmm0  = _mm512_set1_pd(*(a11 + (6 * rs_a) + (4 * cs_a)));
+      zmm14 = _mm512_fnmadd_pd(zmm1, zmm15, zmm14);
+      zmm1  = _mm512_set1_pd(*(a11 + (6 * rs_a) + (3 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm0, zmm15, zmm13);
+      zmm0  = _mm512_set1_pd(*(a11 + (6 * rs_a) + (2 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm15, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (6 * rs_a) + (1 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm15, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (6 * rs_a) + (0 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm15, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 5));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm15, zmm9);
+      zmm14 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm14, zmm1);
+
+      // extract a44
+      zmm0  = _mm512_set1_pd(*(a11 + (5 * rs_a) + (4 * cs_a)));
+      zmm1  = _mm512_set1_pd(*(a11 + (5 * rs_a) + (3 * cs_a)));
+      zmm13 = _mm512_fnmadd_pd(zmm0, zmm14, zmm13);
+      zmm0  = _mm512_set1_pd(*(a11 + (5 * rs_a) + (2 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm14, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (5 * rs_a) + (1 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm14, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (5 * rs_a) + (0 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm14, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 4));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm14, zmm9);
+      zmm13 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm13, zmm1);
+
+      // extract a33
+      zmm1  = _mm512_set1_pd(*(a11 + (4 * rs_a) + (3 * cs_a)));
+      zmm0  = _mm512_set1_pd(*(a11 + (4 * rs_a) + (2 * cs_a)));
+      zmm12 = _mm512_fnmadd_pd(zmm1, zmm13, zmm12);
+      zmm1  = _mm512_set1_pd(*(a11 + (4 * rs_a) + (1 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm13, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (4 * rs_a) + (0 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm13, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 3));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm13, zmm9);
+      zmm12 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm12, zmm1);
+
+      // extract a22
+      zmm0  = _mm512_set1_pd(*(a11 + (3 * rs_a) + (2 * cs_a)));
+      zmm1  = _mm512_set1_pd(*(a11 + (3 * rs_a) + (1 * cs_a)));
+      zmm11 = _mm512_fnmadd_pd(zmm0, zmm12, zmm11);
+      zmm0  = _mm512_set1_pd(*(a11 + (3 * rs_a) + (0 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm12, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 2));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm12, zmm9);
+      zmm11 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm11, zmm1);
+
+      // extract a11
+      zmm1  = _mm512_set1_pd(*(a11 + (2 * rs_a) + (1 * cs_a)));
+      zmm0  = _mm512_set1_pd(*(a11 + (2 * rs_a) + (0 * cs_a)));
+      zmm10 = _mm512_fnmadd_pd(zmm1, zmm11, zmm10);
+      zmm1  = _mm512_set1_pd(*(d11_pack + 1));
+      zmm9  = _mm512_fnmadd_pd(zmm0, zmm11, zmm9);
+      zmm10 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm10, zmm1);
+
+      // extract a00
+      zmm1 = _mm512_set1_pd(*(a11 + (1 * rs_a) + (0 * cs_a)));
+      zmm0 = _mm512_set1_pd(*(d11_pack + 0));
+      zmm9 = _mm512_fnmadd_pd(zmm1, zmm10, zmm9);
+      zmm9 = DTRSM_SMALL_DIV_OR_SCALE_AVX512(zmm9, zmm0);
+
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_8x8_AND_STORE(b11, cs_b)
+      _mm512_storeu_pd((double *)(b11 + cs_b * 0), zmm0);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 1), zmm1);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 2), zmm2);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 3), zmm3);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 4), zmm4);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 5), zmm5);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 6), zmm6);
+      _mm512_storeu_pd((double *)(b11 + cs_b * 7), zmm7);
+    }
+    dim_t n_remainder = j + d_nr;
+    if (n_remainder >= 4)
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * cs_a) + (i * rs_a);
+      b01 = B + ((n_remainder - 4) * cs_b) + i + d_mr;
+      b11 = B + ((n_remainder - 4) * cs_b) + i;
+
+      k_iter = (m - i - d_mr);
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+      /// GEMM code begins///
+      BLIS_DTRSM_SMALL_GEMM_8mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+      ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0));
+      // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+      // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2));
+      // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+      ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3));
+      // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4));
+      // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+      ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1 + 4));
+      // B11[0][5] B11[1][5] B11[2][5] B11[3][5]
+      ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2 + 4));
+      // B11[0][6] B11[1][6] B11[2][6] B11[3][6]
+      ymm7 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3 + 4));
+      // B11[0][7] B11[1][7] B11[2][7] B11[3][7]
+
+      ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);  // B11[0-3][0] * alpha -= B01[0-3][0]
+      ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);  // B11[0-3][1] * alpha -= B01[0-3][1]
+      ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); // B11[0-3][2] * alpha -= B01[0-3][2]
+      ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11); // B11[0-3][3] * alpha -= B01[0-3][3]
+      ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+      ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); // B11[0-3][5] * alpha -= B01[0-3][5]
+      ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); // B11[0-3][6] * alpha -= B01[0-3][6]
+      ymm7 = _mm256_fmsub_pd(ymm7, ymm16, ymm15); // B11[0-3][7] * alpha -= B01[0-3][7]
+
+      /// implement TRSM///
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); // B11[0][4] B11[0][5] B11[2][4] B11[2][5]
+      ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); // B11[0][6] B11[0][7] B11[2][6] B11[2][7]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ymm12 = _mm256_permute2f128_pd(ymm13, ymm15, 0x20); // B11[4][0] B11[4][1] B11[4][2] B11[4][3]
+      ymm14 = _mm256_permute2f128_pd(ymm13, ymm15, 0x31); // B11[6][0] B11[6][1] B11[6][2] B11[6][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); // B11[1][4] B11[1][5] B11[3][4] B11[3][5]
+      ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); // B11[1][6] B11[1][7] B11[3][6] B11[3][7]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      ymm13 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20); // B11[5][0] B11[5][1] B11[5][2] B11[5][3]
+      ymm15 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31); // B11[7][0] B11[7][1] B11[7][2] B11[7][3]
+
+      // extract a77
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 7));
+
+      // perform mul operation
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+      // extract a66
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 6));
+
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 6 * cs_a + 7 * rs_a));
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + 5 * cs_a + 7 * rs_a));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 4 * cs_a + 7 * rs_a));
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 7 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 7 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 7 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 7 * rs_a));
+
+      //(ROw7): FMA operations
+      ymm14 = _mm256_fnmadd_pd(ymm2, ymm15, ymm14);
+      ymm13 = _mm256_fnmadd_pd(ymm3, ymm15, ymm13);
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm15, ymm12);
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm15, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm15, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm15, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm15, ymm8);
+
+      // perform mul operation
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+      // extract a55
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 5));
+
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + 5 * cs_a + 6 * rs_a));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 4 * cs_a + 6 * rs_a));
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 6 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 6 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 6 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 6 * rs_a));
+
+      //(ROw6): FMA operations
+      ymm13 = _mm256_fnmadd_pd(ymm3, ymm14, ymm13);
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm14, ymm12);
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm14, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm14, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm14, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm14, ymm8);
+
+      // perform mul operation
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+
+      // extract a44
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 4));
+
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 4 * cs_a + 5 * rs_a));
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 5 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 5 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 5 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 5 * rs_a));
+
+      //(ROw5): FMA operations
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm13, ymm12);
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm13, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm13, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm13, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm13, ymm8);
+
+      // perform mul operation
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 4 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 4 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 4 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 4 * rs_a));
+
+      //(ROw4): FMA operations
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm12, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm12, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm12, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm12, ymm8);
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 3 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 3 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a));
+
+      //(ROw3): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm11, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm11, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm11, ymm8);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 2 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 2 * rs_a));
+
+      //(ROw2): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm10, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm10, ymm8);
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 1 * rs_a));
+
+      //(Row1): FMA operations
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm9, ymm8);
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); // B11[4][0] B11[5][0] B11[4][2] B11[5][2]
+      ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); // B11[6][0] B11[7][0] B11[6][2] B11[7][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+      ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); // B11[4][2] B11[5][2] B11[6][2] B11[7][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); // B11[4][1] B11[5][1] B11[4][3] B11[5][3]
+      ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); // B11[6][1] B11[7][1] B11[6][3] B11[7][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20); // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+      ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); // B11[4][3] B11[5][3] B11[6][3] B11[7][3]
+
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);   // store B11[1][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2);   // store B11[2][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3);   // store B11[3][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1 + 4), ymm5); // store B11[5][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2 + 4), ymm6); // store B11[6][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3 + 4), ymm7); // store B11[7][0-3]
+      n_remainder -= 4;
+    }
+
+    if (n_remainder) // implementation fo remaining columns(when 'N' is not a multiple of d_nr)() n = 3
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * cs_a) + (i * rs_a);
+      b01 = B + i + d_mr;
+      b11 = B + i;
+
+      k_iter = (m - i - d_mr);
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+      if (3 == n_remainder)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_8mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0));
+        // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+        // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2));
+        // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+        ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4));
+        // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+        ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1 + 4));
+        // B11[0][5] B11[1][5] B11[2][5] B11[3][5]
+        ymm6 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2 + 4));
+        // B11[0][6] B11[1][6] B11[2][6] B11[3][6]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);  // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);  // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); // B11[0-3][2] * alpha -= B01[0-3][2]
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+
+        ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+        ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); // B11[0-3][5] * alpha -= B01[0-3][5]
+        ymm6 = _mm256_fmsub_pd(ymm6, ymm16, ymm14); // B11[0-3][6] * alpha -= B01[0-3][6]
+        ymm7 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (2 == n_remainder)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_8mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+
+        ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4)); // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+        ymm5 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1 + 4)); // B11[0][5] B11[1][5] B11[2][5] B11[3][5]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+
+        ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+        ymm5 = _mm256_fmsub_pd(ymm5, ymm16, ymm13); // B11[0-3][5] * alpha -= B01[0-3][5]
+        ymm6 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm7 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (1 == n_remainder)
+      {
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_8mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+
+        ymm4 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0 + 4)); // B11[0][4] B11[1][4] B11[2][4] B11[3][4]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+
+        ymm4 = _mm256_fmsub_pd(ymm4, ymm16, ymm12); // B11[0-3][4] * alpha -= B01[0-3][4]
+        ymm5 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm6 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm7 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      /// implement TRSM///
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      ymm13 = _mm256_unpacklo_pd(ymm4, ymm5); // B11[0][4] B11[0][5] B11[2][4] B11[2][5]
+      ymm15 = _mm256_unpacklo_pd(ymm6, ymm7); // B11[0][6] B11[0][7] B11[2][6] B11[2][7]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ymm12 = _mm256_permute2f128_pd(ymm13, ymm15, 0x20); // B11[4][0] B11[4][1] B11[4][2] B11[4][3]
+      ymm14 = _mm256_permute2f128_pd(ymm13, ymm15, 0x31); // B11[6][0] B11[6][1] B11[6][2] B11[6][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      ymm4 = _mm256_unpackhi_pd(ymm4, ymm5); // B11[1][4] B11[1][5] B11[3][4] B11[3][5]
+      ymm5 = _mm256_unpackhi_pd(ymm6, ymm7); // B11[1][6] B11[1][7] B11[3][6] B11[3][7]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      ymm13 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20); // B11[5][0] B11[5][1] B11[5][2] B11[5][3]
+      ymm15 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31); // B11[7][0] B11[7][1] B11[7][2] B11[7][3]
+
+      // extract a77
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 7));
+
+      // perform mul operation
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+      // extract a66
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 6));
+
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 6 * cs_a + 7 * rs_a));
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + 5 * cs_a + 7 * rs_a));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 4 * cs_a + 7 * rs_a));
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 7 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 7 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 7 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 7 * rs_a));
+
+      //(ROw7): FMA operations
+      ymm14 = _mm256_fnmadd_pd(ymm2, ymm15, ymm14);
+      ymm13 = _mm256_fnmadd_pd(ymm3, ymm15, ymm13);
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm15, ymm12);
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm15, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm15, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm15, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm15, ymm8);
+
+      // perform mul operation
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+      // extract a55
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 5));
+
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + 5 * cs_a + 6 * rs_a));
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 4 * cs_a + 6 * rs_a));
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 6 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 6 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 6 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 6 * rs_a));
+
+      //(ROw6): FMA operations
+      ymm13 = _mm256_fnmadd_pd(ymm3, ymm14, ymm13);
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm14, ymm12);
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm14, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm14, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm14, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm14, ymm8);
+
+      // perform mul operation
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm1);
+
+      // extract a44
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 4));
+
+      ymm4 = _mm256_broadcast_sd((double const *)(a11 + 4 * cs_a + 5 * rs_a));
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 5 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 5 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 5 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 5 * rs_a));
+
+      //(ROw5): FMA operations
+      ymm12 = _mm256_fnmadd_pd(ymm4, ymm13, ymm12);
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm13, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm13, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm13, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm13, ymm8);
+
+      // perform mul operation
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      ymm5 = _mm256_broadcast_sd((double const *)(a11 + 3 * cs_a + 4 * rs_a));
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 4 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 4 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 4 * rs_a));
+
+      //(ROw4): FMA operations
+      ymm11 = _mm256_fnmadd_pd(ymm5, ymm12, ymm11);
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm12, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm12, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm12, ymm8);
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 3 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 3 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a));
+
+      //(ROw3): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm11, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm11, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm11, ymm8);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 2 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 2 * rs_a));
+
+      //(ROw2): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm10, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm10, ymm8);
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 1 * rs_a));
+
+      //(Row1): FMA operations
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm9, ymm8);
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      ymm5 = _mm256_unpacklo_pd(ymm12, ymm13); // B11[4][0] B11[5][0] B11[4][2] B11[5][2]
+      ymm7 = _mm256_unpacklo_pd(ymm14, ymm15); // B11[6][0] B11[7][0] B11[6][2] B11[7][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      ymm4 = _mm256_permute2f128_pd(ymm5, ymm7, 0x20); // B11[4][0] B11[5][0] B11[6][0] B11[7][0]
+      ymm6 = _mm256_permute2f128_pd(ymm5, ymm7, 0x31); // B11[4][2] B11[5][2] B11[6][2] B11[7][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      ymm12 = _mm256_unpackhi_pd(ymm12, ymm13); // B11[4][1] B11[5][1] B11[4][3] B11[5][3]
+      ymm13 = _mm256_unpackhi_pd(ymm14, ymm15); // B11[6][1] B11[7][1] B11[6][3] B11[7][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      ymm5 = _mm256_permute2f128_pd(ymm12, ymm13, 0x20); // B11[4][1] B11[5][1] B11[6][1] B11[7][1]
+      ymm7 = _mm256_permute2f128_pd(ymm12, ymm13, 0x31); // B11[4][3] B11[5][3] B11[6][3] B11[7][3]
+
+      if (3 == n_remainder)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);   // store B11[1][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2);   // store B11[2][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1 + 4), ymm5); // store B11[5][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 2 + 4), ymm6); // store B11[6][0-3]
+      }
+      else if (2 == n_remainder)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);   // store B11[1][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1 + 4), ymm5); // store B11[5][0-3]
+      }
+      else if (1 == n_remainder)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);   // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0 + 4), ymm4); // store B11[4][0-3]
+      }
+    }
+  }
+  dim_t m_remainder = i + d_mr;
+
+  if (m_remainder >= 4)
+  {
+    i = m_remainder - 4;
+    a10 = L + (i * cs_a) + (i + 4) * rs_a; // pointer to block of A to be used for GEMM
+    a11 = L + (i * cs_a) + (i * rs_a);   // pointer to block of A to be used for TRSM
+
+    // Do transpose for a10 & store in D_A_pack
+    double *ptr_a10_dup = D_A_pack;
+    dim_t p_lda = 4; // packed leading dimension
+    if (transa)
+    {
+      for (dim_t x = 0; x < m - i - 4; x += p_lda)
+      {
+        ymm0 = _mm256_loadu_pd((double const *)(a10));
+        ymm1 = _mm256_loadu_pd((double const *)(a10 + cs_a));
+        ymm2 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2));
+        ymm3 = _mm256_loadu_pd((double const *)(a10 + cs_a * 3));
+
+        ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+        ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+        ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+        ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+        ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+        ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+        ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+        ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+        _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+        _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+        a10 += p_lda;
+        ptr_a10_dup += p_lda * p_lda;
+      }
+    }
+    else
+    {
+      for (dim_t x = 0; x < m - i - 4; x++)
+      {
+        ymm0 = _mm256_loadu_pd((double const *)(a10 + x * rs_a));
+        _mm256_storeu_pd((double *)(ptr_a10_dup + x * p_lda), ymm0);
+      }
+    }
+
+    ymm4 = _mm256_broadcast_sd((double const *)&ones);
+    if (!is_unitdiag)
+    {
+      // broadcast diagonal elements of A11
+      ymm0 = _mm256_broadcast_sd((double const *)(a11));
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + bli_obj_col_stride(a) * 1 + 1));
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + bli_obj_col_stride(a) * 2 + 2));
+      ymm3 = _mm256_broadcast_sd((double const *)(a11 + bli_obj_col_stride(a) * 3 + 3));
+
+
+      ymm0 = _mm256_unpacklo_pd(ymm0, ymm1);
+      ymm1 = _mm256_unpacklo_pd(ymm2, ymm3);
+      ymm1 = _mm256_blend_pd(ymm0, ymm1, 0x0C);
+#ifdef BLIS_DISABLE_TRSM_PREINVERSION
+      ymm4 = ymm1;
+#endif
+#ifdef BLIS_ENABLE_TRSM_PREINVERSION
+      ymm4 = _mm256_div_pd(ymm4, ymm1);
+#endif
+    }
+    _mm256_storeu_pd((double *)(d11_pack), ymm4);
+
+    // cols
+    for (j = (n - d_nr); (j + 1) > 0; j -= d_nr) // loop along 'N' dimension
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * cs_a) + (i * rs_a); // pointer to block of A to be used for TRSM
+      b01 = B + (j * cs_b) + i + 4;    // pointer to block of B to be used for GEMM
+      b11 = B + (j * cs_b) + i;      // pointer to block of B to be used for TRSM
+
+      k_iter = (m - i - 4); // number of times GEMM to be performed(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+      /// GEMM code begins///
+      BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8(b11, cs_b, AlphaVal)
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+      ymm12 = DTRSM_SMALL_DIV_OR_SCALE(ymm12, ymm1);
+      ymm16 = DTRSM_SMALL_DIV_OR_SCALE(ymm16, ymm1);
+
+      // extract a22
+      ymm0 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a + 2 * cs_a));
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a + 1 * cs_a));
+      ymm11 = _mm256_fnmadd_pd(ymm0, ymm12, ymm11);
+      ymm15 = _mm256_fnmadd_pd(ymm0, ymm16, ymm15);
+      ymm0 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a + 0 * cs_a));
+      ymm10 = _mm256_fnmadd_pd(ymm1, ymm12, ymm10);
+      ymm14 = _mm256_fnmadd_pd(ymm1, ymm16, ymm14);
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+      ymm9 = _mm256_fnmadd_pd(ymm0, ymm12, ymm9);
+      ymm13 = _mm256_fnmadd_pd(ymm0, ymm16, ymm13);
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+      ymm15 = DTRSM_SMALL_DIV_OR_SCALE(ymm15, ymm1);
+
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + 2 * rs_a + 1 * cs_a));
+      ymm0 = _mm256_broadcast_sd((double const *)(a11 + 2 * rs_a + 0 * cs_a));
+      ymm10 = _mm256_fnmadd_pd(ymm1, ymm11, ymm10);
+      ymm14 = _mm256_fnmadd_pd(ymm1, ymm15, ymm14);
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+      ymm9 = _mm256_fnmadd_pd(ymm0, ymm11, ymm9);
+      ymm13 = _mm256_fnmadd_pd(ymm0, ymm15, ymm13);
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+      ymm14 = DTRSM_SMALL_DIV_OR_SCALE(ymm14, ymm1);
+
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(a11 + 1 * rs_a + 0 * cs_a));
+      ymm0 = _mm256_broadcast_sd((double const *)(d11_pack + 0));
+      ymm9 = _mm256_fnmadd_pd(ymm1, ymm10, ymm9);
+      ymm13 = _mm256_fnmadd_pd(ymm1, ymm14, ymm13);
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm0);
+      ymm13 = DTRSM_SMALL_DIV_OR_SCALE(ymm13, ymm0);
+      
+
+      BLIS_DTRSM_SMALL_NREG_TRANSPOSE_4x8_AND_STORE(b11, cs_b)
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 4), ymm4);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 5), ymm5);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 6), ymm6);
+      _mm256_storeu_pd((double *)(b11 + cs_b * 7), ymm7);
+    }
+    dim_t n_remainder = j + d_nr;
+    if ((n_remainder >= 4))
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * cs_a) + (i * rs_a);      // pointer to block of A to be used for TRSM
+      b01 = B + ((n_remainder - 4) * cs_b) + i + 4; // pointer to block of B to be used for GEMM
+      b11 = B + ((n_remainder - 4) * cs_b) + i;   // pointer to block of B to be used for TRSM
+
+      k_iter = (m - i - 4); // number of times GEMM to be performed(in blocks of 4x4)
+
+      /*Fill zeros into ymm registers used in gemm accumulations */
+      BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+      /// GEMM code begins///
+      BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+      ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+      /// implement TRSM///
+
+      ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0));
+      ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1));
+      ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2));
+      ymm3 = _mm256_loadu_pd((double const *)(b11 + cs_b * 3));
+      ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+      ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+      ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+      ymm3 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      //(ROw3): FMA operations
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 3 * rs_a));
+      ymm10 = _mm256_fnmadd_pd(ymm2, ymm11, ymm10);
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 3 * rs_a));
+      ymm9 = _mm256_fnmadd_pd(ymm2, ymm11, ymm9);
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a));
+      ymm8 = _mm256_fnmadd_pd(ymm2, ymm11, ymm8);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      //(ROw2): FMA operations
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 2 * rs_a));
+      ymm9 = _mm256_fnmadd_pd(ymm2, ymm10, ymm9);
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 2 * rs_a));
+      ymm8 = _mm256_fnmadd_pd(ymm2, ymm10, ymm8);
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      //(Row1): FMA operations
+      ymm2 = _mm256_broadcast_sd((double const *)(a11 + 1 * rs_a));
+      ymm8 = _mm256_fnmadd_pd(ymm2, ymm9, ymm8);
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); // store B11[1][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); // store B11[2][0-3]
+      _mm256_storeu_pd((double *)(b11 + cs_b * 3), ymm3); // store B11[3][0-3]
+      n_remainder = n_remainder - 4;
+    }
+
+    if (n_remainder) // implementation fo remaining columns(when 'N' is not a multiple of d_nr)() n = 3
+    {
+      a10 = D_A_pack;
+      a11 = L + (i * cs_a) + (i * rs_a);
+      b01 = B + i + 4;
+      b11 = B + i;
+
+      k_iter = (m - i - 4);
+
+      ymm8 = _mm256_setzero_pd();
+      ymm9 = _mm256_setzero_pd();
+      ymm10 = _mm256_setzero_pd();
+
+      if (3 == n_remainder)
+      {
+        BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+        ymm2 = _mm256_loadu_pd((double const *)(b11 + cs_b * 2)); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);  // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);  // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_fmsub_pd(ymm2, ymm16, ymm10); // B11[0-3][2] * alpha -= B01[0-3][2]
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (2 == n_remainder)
+      {
+        BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+        ymm1 = _mm256_loadu_pd((double const *)(b11 + cs_b * 1)); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_fmsub_pd(ymm1, ymm16, ymm9); // B11[0-3][1] * alpha -= B01[0-3][1]
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+      else if (1 == n_remainder)
+      {
+        BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        ymm0 = _mm256_loadu_pd((double const *)(b11 + cs_b * 0)); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+
+        ymm0 = _mm256_fmsub_pd(ymm0, ymm16, ymm8); // B11[0-3][0] * alpha -= B01[0-3][0]
+        ymm1 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm2 = _mm256_broadcast_sd((double const *)(&ones));
+        ymm3 = _mm256_broadcast_sd((double const *)(&ones));
+      }
+
+      /// implement TRSM///
+
+      /// transpose of B11//
+      /// unpacklow///
+      ymm9 = _mm256_unpacklo_pd(ymm0, ymm1);  // B11[0][0] B11[0][1] B11[2][0] B11[2][1]
+      ymm11 = _mm256_unpacklo_pd(ymm2, ymm3); // B11[0][2] B11[0][3] B11[2][2] B11[2][3]
+
+      // rearrange low elements
+      ymm8 = _mm256_permute2f128_pd(ymm9, ymm11, 0x20);  // B11[0][0] B11[0][1] B11[0][2] B11[0][3]
+      ymm10 = _mm256_permute2f128_pd(ymm9, ymm11, 0x31); // B11[2][0] B11[2][1] B11[2][2] B11[2][3]
+
+      ////unpackhigh////
+      ymm0 = _mm256_unpackhi_pd(ymm0, ymm1); // B11[1][0] B11[1][1] B11[3][0] B11[3][1]
+      ymm1 = _mm256_unpackhi_pd(ymm2, ymm3); // B11[1][2] B11[1][3] B11[3][2] B11[3][3]
+
+      // rearrange high elements
+      ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);  // B11[1][0] B11[1][1] B11[1][2] B11[1][3]
+      ymm11 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31); // B11[3][0] B11[3][1] B11[3][2] B11[3][3]
+
+      // extract a33
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 3));
+
+      // perform mul operation
+      ymm11 = DTRSM_SMALL_DIV_OR_SCALE(ymm11, ymm1);
+
+      // extract a22
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 2));
+
+      ymm6 = _mm256_broadcast_sd((double const *)(a11 + 2 * cs_a + 3 * rs_a));
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 3 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 3 * rs_a));
+
+      //(ROw3): FMA operations
+      ymm10 = _mm256_fnmadd_pd(ymm6, ymm11, ymm10);
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm11, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm11, ymm8);
+
+      // perform mul operation
+      ymm10 = DTRSM_SMALL_DIV_OR_SCALE(ymm10, ymm1);
+
+      // extract a11
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack + 1));
+
+      ymm7 = _mm256_broadcast_sd((double const *)(a11 + cs_a + 2 * rs_a));
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 2 * rs_a));
+
+      //(ROw2): FMA operations
+      ymm9 = _mm256_fnmadd_pd(ymm7, ymm10, ymm9);
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm10, ymm8);
+
+      // perform mul operation
+      ymm9 = DTRSM_SMALL_DIV_OR_SCALE(ymm9, ymm1);
+
+      // extract a00
+      ymm1 = _mm256_broadcast_sd((double const *)(d11_pack));
+
+      ymm16 = _mm256_broadcast_sd((double const *)(a11 + 1 * rs_a));
+
+      //(ROw2): FMA operations
+      ymm8 = _mm256_fnmadd_pd(ymm16, ymm9, ymm8);
+
+      // perform mul operation
+      ymm8 = DTRSM_SMALL_DIV_OR_SCALE(ymm8, ymm1);
+
+      // unpacklow//
+      ymm1 = _mm256_unpacklo_pd(ymm8, ymm9);   // B11[0][0] B11[1][0] B11[0][2] B11[1][2]
+      ymm3 = _mm256_unpacklo_pd(ymm10, ymm11); // B11[2][0] B11[3][0] B11[2][2] B11[3][2]
+
+      // rearrange low elements
+      ymm0 = _mm256_permute2f128_pd(ymm1, ymm3, 0x20); // B11[0][0] B11[1][0] B11[2][0] B11[3][0]
+      ymm2 = _mm256_permute2f128_pd(ymm1, ymm3, 0x31); // B11[0][2] B11[1][2] B11[2][2] B11[3][2]
+
+      /// unpack high///
+      ymm8 = _mm256_unpackhi_pd(ymm8, ymm9);   // B11[0][1] B11[1][1] B11[0][3] B11[1][3]
+      ymm9 = _mm256_unpackhi_pd(ymm10, ymm11); // B11[2][1] B11[3][1] B11[2][3] B11[3][3]
+
+      // rearrange high elements
+      ymm1 = _mm256_permute2f128_pd(ymm8, ymm9, 0x20); // B11[0][1] B11[1][1] B11[2][1] B11[3][1]
+      ymm3 = _mm256_permute2f128_pd(ymm8, ymm9, 0x31); // B11[0][3] B11[1][3] B11[2][3] B11[3][3]
+
+      if (3 == n_remainder)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); // store B11[1][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 2), ymm2); // store B11[2][0-3]
+      }
+      else if (2 == n_remainder)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+        _mm256_storeu_pd((double *)(b11 + cs_b * 1), ymm1); // store B11[1][0-3]
+      }
+      else if (1 == n_remainder)
+      {
+        _mm256_storeu_pd((double *)(b11 + cs_b * 0), ymm0); // store B11[0][0-3]
+      }
+    }
+    m_remainder -= 4;
+  }
+  if (m_remainder)
+  {
+
+    a10 = L + m_remainder * rs_a;
+
+    // Do transpose for a10 & store in D_A_pack
+    double *ptr_a10_dup = D_A_pack;
+    if (3 == m_remainder) // Repetative A blocks will be 3*3
+    {
+      dim_t p_lda = 4; // packed leading dimension
+      if (transa)
+      {
+        for (dim_t x = 0; x < m - m_remainder; x += p_lda)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10));
+          ymm1 = _mm256_loadu_pd((double const *)(a10 + cs_a));
+          ymm2 = _mm256_loadu_pd((double const *)(a10 + cs_a * 2));
+          ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+          ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+          ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+          ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+          ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+          ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+          ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+          ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+          ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+          _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+          a10 += p_lda;
+          ptr_a10_dup += p_lda * p_lda;
+        }
+      }
+      else
+      {
+        for (dim_t x = 0; x < m - m_remainder; x++)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10 + x * rs_a));
+          _mm256_storeu_pd((double *)(ptr_a10_dup + x * p_lda), ymm0);
+        }
+      }
+
+      // cols
+      for (j = (n - d_nr); (j + 1) > 0; j -= d_nr) // loop along 'N' dimension
+      {
+        a10 = D_A_pack;
+        a11 = L;              // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b) + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B + (j * cs_b);         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+        ymm0 =_mm256_broadcast_sd((double const *)(&AlphaVal));
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm1 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 0 + 2));
+        ymm1 = _mm256_insertf64x2(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm2 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 1 + 2));
+        ymm2 = _mm256_insertf64x2(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm3 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 2 + 2));
+        ymm3 = _mm256_insertf64x2(ymm3, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm4 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 3 + 2));
+        ymm4 = _mm256_insertf64x2(ymm4, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 4));
+        ymm5 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 4 + 2));
+        ymm5 = _mm256_insertf64x2(ymm5, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 5));
+        ymm6 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 5 + 2));
+        ymm6 = _mm256_insertf64x2(ymm6, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 6));
+        ymm7 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 6 + 2));
+        ymm7 = _mm256_insertf64x2(ymm7, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 7));
+        ymm8 =_mm256_broadcast_sd((double const *)(b11 + cs_b * 7 + 2));
+        ymm8 = _mm256_insertf64x2(ymm8, xmm5, 0);
+
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm0, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm0, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm0, ymm11);
+        ymm12 = _mm256_fmsub_pd(ymm4, ymm0, ymm12);
+        ymm13 = _mm256_fmsub_pd(ymm5, ymm0, ymm13);
+        ymm14 = _mm256_fmsub_pd(ymm6, ymm0, ymm14);
+        ymm15 = _mm256_fmsub_pd(ymm7, ymm0, ymm15);
+        ymm16 = _mm256_fmsub_pd(ymm8, ymm0, ymm16);
+
+        _mm_storeu_pd((double *)(b11 + cs_b * 0), _mm256_extractf64x2_pd(ymm9, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_extractf64x2_pd(ymm10, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_extractf64x2_pd(ymm11, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_extractf64x2_pd(ymm12, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 4), _mm256_extractf64x2_pd(ymm13, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 5), _mm256_extractf64x2_pd(ymm14, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 6), _mm256_extractf64x2_pd(ymm15, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 7), _mm256_extractf64x2_pd(ymm16, 0));
+
+        _mm_storel_pd((double *)(b11 + cs_b * 0 + 2), _mm256_extractf64x2_pd(ymm9, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 1 + 2), _mm256_extractf64x2_pd(ymm10, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 2 + 2), _mm256_extractf64x2_pd(ymm11, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 3 + 2), _mm256_extractf64x2_pd(ymm12, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 4 + 2), _mm256_extractf64x2_pd(ymm13, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 5 + 2), _mm256_extractf64x2_pd(ymm14, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 6 + 2), _mm256_extractf64x2_pd(ymm15, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 7 + 2), _mm256_extractf64x2_pd(ymm16, 1));
+
+        if (transa)
+          dtrsm_AltXB_ref(a11, b11, m_remainder, 8, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AuXB_ref(a11, b11, m_remainder, 8, rs_a, cs_b, is_unitdiag);
+      }
+
+      dim_t n_remainder = j + d_nr;
+      if ((n_remainder >= 4))
+      {
+        a10 = D_A_pack;
+        a11 = L;                      // pointer to block of A to be used for TRSM
+        b01 = B + ((n_remainder - 4) * cs_b) + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B + ((n_remainder - 4) * cs_b);         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        /// implement TRSM///
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 0 + 2));
+        ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1 + 2));
+        ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm2 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2 + 2));
+        ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm3 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 3 + 2));
+        ymm3 = _mm256_insertf128_pd(ymm3, xmm5, 0);
+
+        ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+        _mm_storeu_pd((double *)(b11), _mm256_castpd256_pd128(ymm8));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_castpd256_pd128(ymm9));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_castpd256_pd128(ymm10));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_castpd256_pd128(ymm11));
+
+        _mm_storel_pd((double *)(b11 + 2), _mm256_extractf128_pd(ymm8, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 1 + 2), _mm256_extractf128_pd(ymm9, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 2 + 2), _mm256_extractf128_pd(ymm10, 1));
+        _mm_storel_pd((double *)(b11 + cs_b * 3 + 2), _mm256_extractf128_pd(ymm11, 1));
+
+        if (transa)
+          dtrsm_AltXB_ref(a11, b11, m_remainder, 4, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AuXB_ref(a11, b11, m_remainder, 4, rs_a, cs_b, is_unitdiag);
+        n_remainder -= 4;
+      }
+
+      if (n_remainder)
+      {
+        a10 = D_A_pack;
+        a11 = L;         // pointer to block of A to be used for TRSM
+        b01 = B + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B;         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        if (3 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_3M_3N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 3, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AuXB_ref(a11, b11, m_remainder, 3, rs_a, cs_b, is_unitdiag);
+        }
+        else if (2 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_3M_2N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 2, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AuXB_ref(a11, b11, m_remainder, 2, rs_a, cs_b, is_unitdiag);
+        }
+        else if (1 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_3M_1N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 1, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AuXB_ref(a11, b11, m_remainder, 1, rs_a, cs_b, is_unitdiag);
+        }
+      }
+    }
+    else if (2 == m_remainder) // Repetative A blocks will be 2*2
+    {
+      dim_t p_lda = 4; // packed leading dimension
+      if (transa)
+      {
+        for (dim_t x = 0; x < m - m_remainder; x += p_lda)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10));
+          ymm1 = _mm256_loadu_pd((double const *)(a10 + cs_a));
+          ymm2 = _mm256_broadcast_sd((double const *)&ones);
+          ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+          ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+          ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+          ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+          ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+          ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+          ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+          ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+          ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+          _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+          a10 += p_lda;
+          ptr_a10_dup += p_lda * p_lda;
+        }
+      }
+      else
+      {
+        for (dim_t x = 0; x < m - m_remainder; x++)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10 + x * rs_a));
+          _mm256_storeu_pd((double *)(ptr_a10_dup + x * p_lda), ymm0);
+        }
+      }
+      // cols
+      for (j = (n - d_nr); (j + 1) > 0; j -= d_nr) // loop along 'N' dimension
+      {
+        a10 = D_A_pack;
+        a11 = L;              // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b) + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B + (j * cs_b);         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm0 = _mm256_broadcast_sd((double const *)(&AlphaVal));
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm1 = _mm256_insertf64x2(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm2 = _mm256_insertf64x2(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm3 = _mm256_insertf64x2(ymm3, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm4 = _mm256_insertf64x2(ymm4, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 4));
+        ymm5 = _mm256_insertf64x2(ymm5, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 5));
+        ymm6 = _mm256_insertf64x2(ymm6, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 6));
+        ymm7 = _mm256_insertf64x2(ymm7, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 7));
+        ymm8 = _mm256_insertf64x2(ymm8, xmm5, 0);
+
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm0, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm0, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm0, ymm11);
+        ymm12 = _mm256_fmsub_pd(ymm4, ymm0, ymm12);
+        ymm13 = _mm256_fmsub_pd(ymm5, ymm0, ymm13);
+        ymm14 = _mm256_fmsub_pd(ymm6, ymm0, ymm14);
+        ymm15 = _mm256_fmsub_pd(ymm7, ymm0, ymm15);
+        ymm16 = _mm256_fmsub_pd(ymm8, ymm0, ymm16);
+
+        _mm_storeu_pd((double *)(b11 + cs_b * 0), _mm256_extractf64x2_pd(ymm9, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_extractf64x2_pd(ymm10, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_extractf64x2_pd(ymm11, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_extractf64x2_pd(ymm12, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 4), _mm256_extractf64x2_pd(ymm13, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 5), _mm256_extractf64x2_pd(ymm14, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 6), _mm256_extractf64x2_pd(ymm15, 0));
+        _mm_storeu_pd((double *)(b11 + cs_b * 7), _mm256_extractf64x2_pd(ymm16, 0));
+
+        if (transa)
+          dtrsm_AltXB_ref(a11, b11, m_remainder, 8, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AuXB_ref(a11, b11, m_remainder, 8, rs_a, cs_b, is_unitdiag);
+      }
+      dim_t n_remainder = j + d_nr;
+      if ((n_remainder >= 4))
+      {
+        a10 = D_A_pack;
+        a11 = L;                      // pointer to block of A to be used for TRSM
+        b01 = B + ((n_remainder - 4) * cs_b) + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B + ((n_remainder - 4) * cs_b);         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        /// GEMM code begins///
+        BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        /// implement TRSM///
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 0));
+        ymm0 = _mm256_insertf128_pd(ymm0, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 1));
+        ymm1 = _mm256_insertf128_pd(ymm1, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 2));
+        ymm2 = _mm256_insertf128_pd(ymm2, xmm5, 0);
+
+        xmm5 = _mm_loadu_pd((double const *)(b11 + cs_b * 3));
+        ymm3 = _mm256_insertf128_pd(ymm3, xmm5, 0);
+
+        ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+        _mm_storeu_pd((double *)(b11), _mm256_castpd256_pd128(ymm8));
+        _mm_storeu_pd((double *)(b11 + cs_b * 1), _mm256_castpd256_pd128(ymm9));
+        _mm_storeu_pd((double *)(b11 + cs_b * 2), _mm256_castpd256_pd128(ymm10));
+        _mm_storeu_pd((double *)(b11 + cs_b * 3), _mm256_castpd256_pd128(ymm11));
+
+        if (transa)
+          dtrsm_AltXB_ref(a11, b11, m_remainder, 4, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AuXB_ref(a11, b11, m_remainder, 4, rs_a, cs_b, is_unitdiag);
+        n_remainder -= 4;
+      }
+      if (n_remainder)
+      {
+        a10 = D_A_pack;
+        a11 = L;         // pointer to block of A to be used for TRSM
+        b01 = B + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B;         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        if (3 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_2M_3N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 3, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AuXB_ref(a11, b11, m_remainder, 3, rs_a, cs_b, is_unitdiag);
+        }
+        else if (2 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_2M_2N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 2, cs_a, cs_b, is_unitdiag);
+          else 
+            dtrsm_AuXB_ref(a11, b11, m_remainder, 2, rs_a, cs_b, is_unitdiag);
+        }
+        else if (1 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_2M_1N(AlphaVal, b11, cs_b) 
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 1, cs_a, cs_b, is_unitdiag);
+          else 
+            dtrsm_AuXB_ref(a11, b11, m_remainder, 1, rs_a, cs_b, is_unitdiag);
+        }
+      }
+    }
+    else if (1 == m_remainder) // Repetative A blocks will be 1*1
+    {
+      dim_t p_lda = 4; // packed leading dimension
+      if (transa)
+      {
+        for (dim_t x = 0; x < m - m_remainder; x += p_lda)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10));
+          ymm1 = _mm256_broadcast_sd((double const *)&ones);
+          ymm2 = _mm256_broadcast_sd((double const *)&ones);
+          ymm3 = _mm256_broadcast_sd((double const *)&ones);
+
+          ymm4 = _mm256_unpacklo_pd(ymm0, ymm1);
+          ymm5 = _mm256_unpacklo_pd(ymm2, ymm3);
+
+          ymm6 = _mm256_permute2f128_pd(ymm4, ymm5, 0x20);
+          ymm8 = _mm256_permute2f128_pd(ymm4, ymm5, 0x31);
+
+          ymm0 = _mm256_unpackhi_pd(ymm0, ymm1);
+          ymm1 = _mm256_unpackhi_pd(ymm2, ymm3);
+
+          ymm7 = _mm256_permute2f128_pd(ymm0, ymm1, 0x20);
+          ymm9 = _mm256_permute2f128_pd(ymm0, ymm1, 0x31);
+
+          _mm256_storeu_pd((double *)(ptr_a10_dup), ymm6);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda), ymm7);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 2), ymm8);
+          _mm256_storeu_pd((double *)(ptr_a10_dup + p_lda * 3), ymm9);
+
+          a10 += p_lda;
+          ptr_a10_dup += p_lda * p_lda;
+        }
+      }
+      else
+      {
+        for (dim_t x = 0; x < m - m_remainder; x++)
+        {
+          ymm0 = _mm256_loadu_pd((double const *)(a10 + x * rs_a));
+          _mm256_storeu_pd((double *)(ptr_a10_dup + x * p_lda), ymm0);
+        }
+      }
+      // cols
+      for (j = (n - d_nr); (j + 1) > 0; j -= d_nr) // loop along 'N' dimension
+      {
+        a10 = D_A_pack;
+        a11 = L;              // pointer to block of A to be used for TRSM
+        b01 = B + (j * cs_b) + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B + (j * cs_b);         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+        BLIS_DTRSM_SMALL_GEMM_4mx8n(a10, b01, cs_b, p_lda, k_iter)
+
+        /// GEMM code ends///
+        ymm0 = _mm256_broadcast_sd((double const *)(&AlphaVal));
+        ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 0));
+        ymm2 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1));
+        ymm3 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2));
+        ymm4 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 3));
+        ymm5 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 4));
+        ymm6 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 5));
+        ymm7 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 6));
+        ymm8 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 7));
+
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm0, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm0, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm0, ymm11);
+        ymm12 = _mm256_fmsub_pd(ymm4, ymm0, ymm12);
+        ymm13 = _mm256_fmsub_pd(ymm5, ymm0, ymm13);
+        ymm14 = _mm256_fmsub_pd(ymm6, ymm0, ymm14);
+        ymm15 = _mm256_fmsub_pd(ymm7, ymm0, ymm15);
+        ymm16 = _mm256_fmsub_pd(ymm8, ymm0, ymm16);
+
+        _mm_storel_pd((double *)(b11 + cs_b * 0), _mm256_extractf64x2_pd(ymm9, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 1), _mm256_extractf64x2_pd(ymm10, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 2), _mm256_extractf64x2_pd(ymm11, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 3), _mm256_extractf64x2_pd(ymm12, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 4), _mm256_extractf64x2_pd(ymm13, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 5), _mm256_extractf64x2_pd(ymm14, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 6), _mm256_extractf64x2_pd(ymm15, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 7), _mm256_extractf64x2_pd(ymm16, 0));
+
+        if (transa)
+          dtrsm_AltXB_ref(a11, b11, m_remainder, 8, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AuXB_ref(a11, b11, m_remainder, 8, rs_a, cs_b, is_unitdiag);
+      }
+      dim_t n_remainder = j + d_nr;
+      if ((n_remainder >= 4))
+      {
+        a10 = D_A_pack;
+        a11 = L;                      // pointer to block of A to be used for TRSM
+        b01 = B + ((n_remainder - 4) * cs_b) + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B + ((n_remainder - 4) * cs_b);         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+
+        BLIS_DTRSM_SMALL_GEMM_4mx4n(a10, b01, cs_b, p_lda, k_iter)
+
+        ymm16 = _mm256_broadcast_sd((double const *)(&AlphaVal)); // register to hold alpha
+
+        /// implement TRSM///
+
+        ymm0 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 0));
+        ymm1 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 1));
+        ymm2 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 2));
+        ymm3 = _mm256_broadcast_sd((double const *)(b11 + cs_b * 3));
+
+        ymm8 = _mm256_fmsub_pd(ymm0, ymm16, ymm8);
+        ymm9 = _mm256_fmsub_pd(ymm1, ymm16, ymm9);
+        ymm10 = _mm256_fmsub_pd(ymm2, ymm16, ymm10);
+        ymm11 = _mm256_fmsub_pd(ymm3, ymm16, ymm11);
+
+        _mm_storel_pd((double *)(b11), _mm256_extractf128_pd(ymm8, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 1), _mm256_extractf128_pd(ymm9, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 2), _mm256_extractf128_pd(ymm10, 0));
+        _mm_storel_pd((double *)(b11 + cs_b * 3), _mm256_extractf128_pd(ymm11, 0));
+
+        if (transa)
+          dtrsm_AltXB_ref(a11, b11, m_remainder, 4, cs_a, cs_b, is_unitdiag);
+        else
+          dtrsm_AuXB_ref(a11, b11, m_remainder, 4, rs_a, cs_b, is_unitdiag);
+        n_remainder -= 4;
+      }
+      if (n_remainder)
+      {
+        a10 = D_A_pack;
+        a11 = L;         // pointer to block of A to be used for TRSM
+        b01 = B + m_remainder; // pointer to block of B to be used for GEMM
+        b11 = B;         // pointer to block of B to be used for TRSM
+
+        k_iter = (m - m_remainder); // number of times GEMM to be performed(in blocks of 4x4)
+
+        /*Fill zeros into ymm registers used in gemm accumulations */
+        BLIS_SET_YMM_REG_ZEROS_FOR_LEFT
+
+        if (3 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx3n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_1M_3N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 3, cs_a, cs_b, is_unitdiag);
+          else 
+            dtrsm_AuXB_ref(a11, b11, m_remainder, 3, rs_a, cs_b, is_unitdiag);
+        }
+        else if (2 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx2n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_1M_2N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 2, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AuXB_ref(a11, b11, m_remainder, 2, rs_a, cs_b, is_unitdiag);
+        }
+        else if (1 == n_remainder)
+        {
+          /// GEMM code begins///
+          BLIS_DTRSM_SMALL_GEMM_4mx1n(a10, b01, cs_b, p_lda, k_iter)
+
+          BLIS_PRE_DTRSM_SMALL_1M_1N(AlphaVal, b11, cs_b)
+
+          if (transa)
+            dtrsm_AltXB_ref(a11, b11, m_remainder, 1, cs_a, cs_b, is_unitdiag);
+          else dtrsm_AuXB_ref(a11, b11, m_remainder, 1, rs_a, cs_b, is_unitdiag);
+        }
+      }
+    }
+  }
+
+  if ((required_packing_A == 1) &&
+    bli_mem_is_alloc(&local_mem_buf_A_s))
+  {
+    bli_membrk_release(&rntm, &local_mem_buf_A_s);
+  }
+  return BLIS_SUCCESS;
+}
+
+#endif
diff --git a/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c b/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c
new file mode 100644
index 0000000000..4fc69acd15
--- /dev/null
+++ b/kernels/zen4/3/bli_zgemm_zen4_asm_12x4.c
@@ -0,0 +1,1085 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#define A_L1_PREFETCH_DIST 4 // in units of k iterations
+#define B_L1_PREFETCH_DIST 4 // in units of k iterations
+#define TAIL_NITER 6 // 4x-unrolled k-loop iterations run after the C-prefetch loop (ITER_4)
+
+#define PREFETCH_A_L1(n, k) \
+  PREFETCH(0, MEM(RAX, A_L1_PREFETCH_DIST * 24 * 8 + (2 * n + k) * 24 * 4)) // A stream, addressed from RAX
+#define PREFETCH_B_L1(n, k) \
+  PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST * 8 * 8 + (2 * n + k) * 8 * 4)) // B stream, addressed from RBX
+
+#define LOOP_ALIGN ALIGN32 // placed in front of each k-loop label of the kernel
+
+/**********************************************************/
+/* Complex-multiply register R1 by alpha, in place        */
+/* Inputs:                                                */
+/* R1   = interleaved (real, imag) values of A * B        */
+/* ZMM0 = Alpha real, ZMM1 = Alpha imag (both broadcast)  */
+/* ZMM4 = multiplier of VFMADDSUB132PD (all 1.0 here)     */
+/* Output:                                                */
+/* R1 = (Alpha) * R1; ZMM9 is overwritten as scratch      */
+/**********************************************************/
+#define SCALE_BY_ALPHA(R1) \
+  VPERMILPD(IMM(0x55), ZMM(R1), ZMM(9)) \
+  VMULPD(ZMM0, ZMM(R1), ZMM(R1)) \
+  VMULPD(ZMM1, ZMM(9), ZMM(9)) \
+  VFMADDSUB132PD(ZMM(4), ZMM(9), ZMM(R1))
+
+/* Scale registers R1, R2 and R3 by alpha in turn; see SCALE_BY_ALPHA for the operands */
+#define SCALE3R_BY_ALPHA(R1, R2, R3) \
+  SCALE_BY_ALPHA(R1) \
+  SCALE_BY_ALPHA(R2) \
+  SCALE_BY_ALPHA(R3)
+
+/* Clear ZMM registers R1, R2 and R3 by XOR-ing each one with itself */
+#define SET_REG_TO_ZERO(R1, R2, R3) \
+  VXORPD(ZMM(R1), ZMM(R1), ZMM(R1)) \
+  VXORPD(ZMM(R2), ZMM(R2), ZMM(R2)) \
+  VXORPD(ZMM(R3), ZMM(R3), ZMM(R3))
+
+/**********************************************************/
+/* Complex-multiply register R1 by beta, in place         */
+/* Inputs:                                                */
+/* R1   = interleaved (real, imag) values to be scaled    */
+/* ZMM2 = Beta real, ZMM3 = Beta imag (both broadcast)    */
+/* ZMM4 = multiplier of VFMADDSUB132PD (all 1.0 here)     */
+/* Output:                                                */
+/* R1 = (Beta) * R1; ZMM9 is overwritten as scratch       */
+/**********************************************************/
+#define SCALE_BY_BETA(R1) \
+  VPERMILPD(IMM(0x55), ZMM(R1), ZMM(9)) \
+  VMULPD(ZMM2, ZMM(R1), ZMM(R1)) \
+  VMULPD(ZMM3, ZMM(9), ZMM(9)) \
+  VFMADDSUB132PD(ZMM(4), ZMM(9), ZMM(R1))
+
+/**********************************************************/
+/* Column-stored C: add Beta * C to R1/R2/R3 and store    */
+/* the sums back to the C buffer                          */
+/* Inputs:                                                */
+/* R1/R2/R3 = Alpha * A * B                               */
+/* ZMM2/ZMM3 = Beta real/imag, ZMM4 = 1.0 (SCALE_BY_BETA) */
+/* RCX = C, RDI = row stride, RSI = column stride (bytes) */
+/* Output:                                                */
+/* C = Beta * C + R1/R2/R3, RCX += RSI; ZMM8/ZMM9 scratch */
+/**********************************************************/
+#define UPDATE_C_BETASCALE(R1, R2, R3) \
+  VMOVUPD(MEM(RCX), ZMM(8)) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  VMOVUPD(MEM(RCX, RDI, 4), ZMM(8)) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  VMOVUPD(ZMM(R2), MEM(RCX, RDI, 4)) \
+  VMOVUPD(MEM(RCX, RDI, 8), ZMM(8)) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  VMOVUPD(ZMM(R3), MEM(RCX, RDI, 8)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Column-stored C: add the C buffer values to R1/R2/R3   */
+/* and store the sums back to the C buffer                */
+/* Inputs:                                                */
+/* R1/R2/R3 = Alpha * A * B                               */
+/* RCX = C, RDI = row stride, RSI = column stride (bytes) */
+/* Output:                                                */
+/* C = C + R1/R2/R3, RCX += RSI; ZMM8 holds the loaded C  */
+/**********************************************************/
+#define UPDATE_C_BETA1(R1, R2, R3) \
+  VMOVUPD(MEM(RCX), ZMM(8)) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  VMOVUPD(MEM(RCX, RDI, 4), ZMM(8)) \
+  VADDPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  VMOVUPD(ZMM(R2), MEM(RCX, RDI, 4)) \
+  VMOVUPD(MEM(RCX, RDI, 8), ZMM(8)) \
+  VADDPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  VMOVUPD(ZMM(R3), MEM(RCX, RDI, 8)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Column-stored C: subtract the C buffer values from     */
+/* R1/R2/R3 and store the result back to the C buffer     */
+/* Inputs:                                                */
+/* R1/R2/R3 = Alpha * A * B                               */
+/* RCX = C, RDI = row stride, RSI = column stride (bytes) */
+/* Output:                                                */
+/* C = -C + R1/R2/R3, RCX += RSI; ZMM8 holds the loaded C */
+/**********************************************************/
+#define UPDATE_C_BETAMINUS1(R1, R2, R3) \
+  VMOVUPD(MEM(RCX), ZMM(8)) \
+  VSUBPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  VMOVUPD(MEM(RCX, RDI, 4), ZMM(8)) \
+  VSUBPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  VMOVUPD(ZMM(R2), MEM(RCX, RDI, 4)) \
+  VMOVUPD(MEM(RCX, RDI, 8), ZMM(8)) \
+  VSUBPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  VMOVUPD(ZMM(R3), MEM(RCX, RDI, 8)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Column-stored C: write R1/R2/R3 to C without reading C */
+/* Input:                                                 */
+/* R1/R2/R3 = values to store; RCX = C, RDI/RSI = strides */
+/* Output:                                                */
+/* C = R1/R2/R3, RCX += RSI                               */
+/**********************************************************/
+#define STORE_C(R1, R2, R3) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  VMOVUPD(ZMM(R2), MEM(RCX, RDI, 4)) \
+  VMOVUPD(ZMM(R3), MEM(RCX, RDI, 8)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Row-stored C: add Beta * C to R(1-4) and store the     */
+/* sums back to the C buffer, one C row per register      */
+/* Inputs:                                                */
+/* R(1-4) = Alpha * A * B                                 */
+/* ZMM2/ZMM3 = Beta real/imag, ZMM4 = 1.0 (SCALE_BY_BETA) */
+/* RCX = C row, RDI = element stride, R12 = 3 * RDI,      */
+/* RSI = stride to the next row (all in bytes)            */
+/* Output:                                                */
+/* C = Beta * C + R(1-4), RCX += 4 * RSI                  */
+/* EXTRACT_C_ROW writes every lane of ZMM8, so no separate*/
+/* 64-byte load of C is issued before it; ZMM8-10 scratch */
+/**********************************************************/
+#define UPDATE_C_BETASCALE_ROW(R1, R2, R3, R4) \
+  EXTRACT_C_ROW(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  VMOVUPD(ZMM(R2), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  VMOVUPD(ZMM(R3), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R4), ZMM(R4)) \
+  VMOVUPD(ZMM(R4), MEM(RCX)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Row-stored C: add the C buffer values to R(1-4) and    */
+/* store the sums back to C, one C row per register       */
+/* Inputs:                                                */
+/* R(1-4) = Alpha * A * B                                 */
+/* RCX = C row, RDI = element stride, R12 = 3 * RDI,      */
+/* RSI = stride to the next row (all in bytes)            */
+/* Output:                                                */
+/* C = C + R(1-4), RCX += 4 * RSI                         */
+/* EXTRACT_C_ROW writes every lane of ZMM8, so no separate*/
+/* 64-byte load of C is issued before it; ZMM8-10 scratch */
+/**********************************************************/
+#define UPDATE_C_BETA1_ROW(R1, R2, R3, R4) \
+  EXTRACT_C_ROW(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  VADDPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  VMOVUPD(ZMM(R2), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  VADDPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  VMOVUPD(ZMM(R3), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  VADDPD(ZMM(8), ZMM(R4), ZMM(R4)) \
+  VMOVUPD(ZMM(R4), MEM(RCX)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Row-stored C: subtract the C buffer values from R(1-4) */
+/* and store the result back to C, one row per register   */
+/* Inputs:                                                */
+/* R(1-4) = Alpha * A * B                                 */
+/* RCX = C row, RDI = element stride, R12 = 3 * RDI,      */
+/* RSI = stride to the next row (all in bytes)            */
+/* Output:                                                */
+/* C = -C + R(1-4), RCX += 4 * RSI                        */
+/* EXTRACT_C_ROW writes every lane of ZMM8, so no separate*/
+/* 64-byte load of C is issued before it; ZMM8-10 scratch */
+/**********************************************************/
+#define UPDATE_C_BETAMINUS1_ROW(R1, R2, R3, R4) \
+  EXTRACT_C_ROW(8) \
+  VSUBPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  VSUBPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  VMOVUPD(ZMM(R2), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  VSUBPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  VMOVUPD(ZMM(R3), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  EXTRACT_C_ROW(8) \
+  VSUBPD(ZMM(8), ZMM(R4), ZMM(R4)) \
+  VMOVUPD(ZMM(R4), MEM(RCX)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Row-stored C: write R(1-4) to four consecutive C rows  */
+/* Input:                                                 */
+/* R(1-4) = values to store; RCX = C row, RSI = row stride*/
+/* Output:                                                */
+/* C = R(1-4), RCX += 4 * RSI                             */
+/**********************************************************/
+#define STORE_C_ROW(R1, R2, R3, R4) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  VMOVUPD(ZMM(R2), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  VMOVUPD(ZMM(R3), MEM(RCX)) \
+  ADD(RSI, RCX) \
+  VMOVUPD(ZMM(R4), MEM(RCX)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Gather four 16-byte elements of C into one register    */
+/* Elements are read from RCX, RCX + RDI, RCX + 2 * RDI   */
+/* and RCX + R12 (the visible kernel sets R12 = 3 * RDI)  */
+/* Input:                                                 */
+/* RCX = C, RDI = element stride in bytes, R12 as above   */
+/* Output:                                                */
+/* R1 = four C elements; ZMM9 and ZMM10 are overwritten   */
+/**********************************************************/
+#define EXTRACT_C_ROW(R1) \
+  VMOVUPD(MEM(RCX), XMM(R1)) \
+  VMOVUPD(MEM(RCX, RDI, 1), XMM9) \
+  VINSERTF128(IMM(1), XMM9, YMM(R1), YMM(R1)) \
+  VMOVUPD(MEM(RCX, RDI, 2), XMM9) \
+  VMOVUPD(MEM(RCX, R12, 1), XMM10) \
+  VINSERTF128(IMM(1), XMM10, YMM9, YMM9) \
+  VINSERTF64X4(IMM(1), YMM9, ZMM(R1), ZMM(R1))
+
+/**********************************************************/
+/* Add Beta * C to R1 and store the sum to C              */
+/* C is gathered with EXTRACT_C (stride RDI), but the sum */
+/* is written with one contiguous 64-byte store at RCX    */
+/* NOTE(review): no use of this macro is visible in this  */
+/* part of the file - confirm whether it is still needed  */
+/* Inputs:                                                */
+/* R1 = value to add (no Alpha scaling is done here)      */
+/* ZMM2/ZMM3 = Beta real/imag, ZMM4 = 1.0 (SCALE_BY_BETA) */
+/* Output:                                                */
+/* C = Beta * C + R1, RCX += RDI; ZMM8-10 are scratch     */
+/**********************************************************/
+#define UPDATE_C_ROW(R1) \
+  EXTRACT_C(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  ADD(RDI, RCX)
+
+/**********************************************************/
+/* General-stride C: add Beta * C to R(1-4) and scatter   */
+/* the sums back to C with STORE_C_GEN                    */
+/* Inputs:                                                */
+/* R(1-4) = Alpha * A * B                                 */
+/* ZMM2/ZMM3 = Beta real/imag, ZMM4 = 1.0 (SCALE_BY_BETA) */
+/* RCX = C, RDI = byte stride inside one register, R12 =  */
+/* offset of its 4th element, RSI = step between registers*/
+/* Output:                                                */
+/* C = Beta * C + R(1-4), RCX += 4 * RSI                  */
+/* EXTRACT_C writes every lane of ZMM8, so no contiguous  */
+/* 64-byte load of C is issued before it; ZMM8-10 scratch */
+/**********************************************************/
+#define UPDATE_C_BETASCALE_GEN(R1, R2, R3, R4) \
+  EXTRACT_C(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  STORE_C_GEN(R1) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  STORE_C_GEN(R2) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  STORE_C_GEN(R3) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R4), ZMM(R4)) \
+  STORE_C_GEN(R4) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* General-stride C: add the C buffer values to R(1-4)    */
+/* and scatter the sums back to C with STORE_C_GEN        */
+/* Inputs:                                                */
+/* R(1-4) = Alpha * A * B                                 */
+/* RCX = C, RDI = byte stride inside one register, R12 =  */
+/* offset of its 4th element, RSI = step between registers*/
+/* Output:                                                */
+/* C = C + R(1-4), RCX += 4 * RSI                         */
+/* EXTRACT_C writes every lane of ZMM8, so no contiguous  */
+/* 64-byte load of C is issued before it; ZMM8-10 scratch */
+/**********************************************************/
+#define UPDATE_C_BETA1_GEN(R1, R2, R3, R4) \
+  EXTRACT_C(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  STORE_C_GEN(R1) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  VADDPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  STORE_C_GEN(R2) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  VADDPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  STORE_C_GEN(R3) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  VADDPD(ZMM(8), ZMM(R4), ZMM(R4)) \
+  STORE_C_GEN(R4) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* General-stride C: subtract the C buffer values from    */
+/* R(1-4) and scatter the result to C with STORE_C_GEN    */
+/* Inputs:                                                */
+/* R(1-4) = Alpha * A * B                                 */
+/* RCX = C, RDI = byte stride inside one register, R12 =  */
+/* offset of its 4th element, RSI = step between registers*/
+/* Output:                                                */
+/* C = -C + R(1-4), RCX += 4 * RSI                        */
+/* EXTRACT_C writes every lane of ZMM8, so no contiguous  */
+/* 64-byte load of C is issued before it; ZMM8-10 scratch */
+/**********************************************************/
+#define UPDATE_C_BETAMINUS1_GEN(R1, R2, R3, R4) \
+  EXTRACT_C(8) \
+  VSUBPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  STORE_C_GEN(R1) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  VSUBPD(ZMM(8), ZMM(R2), ZMM(R2)) \
+  STORE_C_GEN(R2) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  VSUBPD(ZMM(8), ZMM(R3), ZMM(R3)) \
+  STORE_C_GEN(R3) \
+  ADD(RSI, RCX) \
+  EXTRACT_C(8) \
+  VSUBPD(ZMM(8), ZMM(R4), ZMM(R4)) \
+  STORE_C_GEN(R4) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* General-stride C: scatter R(1-4) to C with STORE_C_GEN */
+/* Input:                                                 */
+/* R(1-4) = values to store; RCX = C, RSI = register step */
+/* Output:                                                */
+/* C = R(1-4), RCX += 4 * RSI; ZMM9 is overwritten        */
+/**********************************************************/
+#define EXTRACT_STORE_C_GEN(R1, R2, R3, R4) \
+  STORE_C_GEN(R1) \
+  ADD(RSI, RCX) \
+  STORE_C_GEN(R2) \
+  ADD(RSI, RCX) \
+  STORE_C_GEN(R3) \
+  ADD(RSI, RCX) \
+  STORE_C_GEN(R4) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* Scatter the four 16-byte elements of R1 to C           */
+/* (general stride storage)                               */
+/* Input:                                                 */
+/* R1 = values, RCX = C, RDI = stride, R12 = 4th offset   */
+/* Output:                                                */
+/* C at RCX, RCX+RDI, RCX+2*RDI, RCX+R12 = R1; XMM9 used  */
+/**********************************************************/
+#define STORE_C_GEN(R1) \
+  VEXTRACTF64X2(IMM(0), ZMM(R1), XMM9) \
+  VMOVUPD(XMM9, MEM(RCX)) \
+  VEXTRACTF64X2(IMM(1), ZMM(R1), XMM9) \
+  VMOVUPD(XMM9, MEM(RCX, RDI, 1)) \
+  VEXTRACTF64X2(IMM(2), ZMM(R1), XMM9) \
+  VMOVUPD(XMM9, MEM(RCX, RDI, 2)) \
+  VEXTRACTF64X2(IMM(3), ZMM(R1), XMM9) \
+  VMOVUPD(XMM9, MEM(RCX, R12, 1))
+
+/**********************************************************/
+/* Gather four 16-byte elements of C into one register    */
+/* Elements are read from RCX, RCX + RDI, RCX + 2 * RDI   */
+/* and RCX + R12 (the visible kernel sets R12 = 3 * RDI)  */
+/* Input:                                                 */
+/* RCX = C, RDI = element stride in bytes, R12 as above   */
+/* Output:                                                */
+/* R1 = four C elements; ZMM9 and ZMM10 are overwritten   */
+/**********************************************************/
+#define EXTRACT_C(R1) \
+  VMOVUPD(MEM(RCX), XMM(R1)) \
+  VMOVUPD(MEM(RCX, RDI, 1), XMM9) \
+  VINSERTF128(IMM(1), XMM9, YMM(R1), YMM(R1)) \
+  VMOVUPD(MEM(RCX, RDI, 2), XMM9) \
+  VMOVUPD(MEM(RCX, R12, 1), XMM10) \
+  VINSERTF128(IMM(1), XMM10, YMM9, YMM9) \
+  VINSERTF64X4(IMM(1), YMM9, ZMM(R1), ZMM(R1))
+
+/**********************************************************/
+/* Add Beta * C to R1 and store the sum to C              */
+/* C is gathered with EXTRACT_C (stride RDI), but the sum */
+/* is written with one contiguous 64-byte store at RCX    */
+/* NOTE(review): no use of this macro is visible in this  */
+/* part of the file - confirm whether it is still needed  */
+/* Inputs:                                                */
+/* R1 = value to add (no Alpha scaling is done here)      */
+/* ZMM2/ZMM3 = Beta real/imag, ZMM4 = 1.0 (SCALE_BY_BETA) */
+/* Output:                                                */
+/* C = Beta * C + R1, RCX += RSI; ZMM8-10 are scratch     */
+/**********************************************************/
+#define UPDATE_C_GEN(R1) \
+  EXTRACT_C(8) \
+  SCALE_BY_BETA(8) \
+  VADDPD(ZMM(8), ZMM(R1), ZMM(R1)) \
+  VMOVUPD(ZMM(R1), MEM(RCX)) \
+  ADD(RSI, RCX)
+
+/**********************************************************/
+/* One k iteration of the 12x4 complex update (n = unroll */
+/* index): A(12x1) in ZMM0-2 times the 4 elements of B    */
+/* Each B element is used as two broadcasts: real part    */
+/* feeds ZMM(5-7,11-13,17-19,23-25), imag part feeds      */
+/* ZMM(8-10,14-16,20-22,26-28)                            */
+/* Inputs:                                                */
+/* ZMM0-2 = A, ZMM29/ZMM30 = real/imag of the first B     */
+/* element, RAX -> A of the next k step, RBX -> B         */
+/* Output:                                                */
+/* Accumulators updated; ZMM0-2 and ZMM29/30 reloaded for */
+/* the next iteration; ZMM3/ZMM4 are overwritten          */
+/**********************************************************/
+#define SUBITER(n) \
+  /*PREFETCH_A_L1(n, 0)  */ \
+  VBROADCASTSD(MEM(RBX, (8 * n + 2) * 8), ZMM(3)) \
+  VFMADD231PD(ZMM(0), ZMM(29), ZMM(5)) \
+  VFMADD231PD(ZMM(1), ZMM(29), ZMM(6)) \
+  VFMADD231PD(ZMM(2), ZMM(29), ZMM(7)) \
+  VBROADCASTSD(MEM(RBX, (8 * n + 3) * 8), ZMM(4)) \
+  VFMADD231PD(ZMM(0), ZMM(30), ZMM(8)) \
+  VFMADD231PD(ZMM(1), ZMM(30), ZMM(9)) \
+  VFMADD231PD(ZMM(2), ZMM(30), ZMM(10)) \
+\
+  /*PREFETCH_B_L1(n, 0)    */ \
+  VBROADCASTSD(MEM(RBX, (8 * n + 4) * 8), ZMM(29)) \
+  VFMADD231PD(ZMM(0), ZMM(3), ZMM(11)) \
+  VFMADD231PD(ZMM(1), ZMM(3), ZMM(12)) \
+  VFMADD231PD(ZMM(2), ZMM(3), ZMM(13)) \
+  VBROADCASTSD(MEM(RBX, (8 * n + 5) * 8), ZMM(30)) \
+  VFMADD231PD(ZMM(0), ZMM(4), ZMM(14)) \
+  VFMADD231PD(ZMM(1), ZMM(4), ZMM(15)) \
+  VFMADD231PD(ZMM(2), ZMM(4), ZMM(16)) \
+\
+  /*PREFETCH_A_L1(n, 1)  */ \
+  VBROADCASTSD(MEM(RBX, (8 * n + 6) * 8), ZMM(3)) \
+  VFMADD231PD(ZMM(0), ZMM(29), ZMM(17)) \
+  VFMADD231PD(ZMM(1), ZMM(29), ZMM(18)) \
+  VFMADD231PD(ZMM(2), ZMM(29), ZMM(19)) \
+  VBROADCASTSD(MEM(RBX, (8 * n + 7) * 8), ZMM(4)) \
+  VFMADD231PD(ZMM(0), ZMM(30), ZMM(20)) \
+  VFMADD231PD(ZMM(1), ZMM(30), ZMM(21)) \
+  VFMADD231PD(ZMM(2), ZMM(30), ZMM(22)) \
+\
+  /*PREFETCH_B_L1(n, 1) */ \
+  VBROADCASTSD(MEM(RBX, (8 * n + 8) * 8), ZMM(29)) \
+  VFMADD231PD(ZMM(0), ZMM(3), ZMM(23)) \
+  VFMADD231PD(ZMM(1), ZMM(3), ZMM(24)) \
+  VFMADD231PD(ZMM(2), ZMM(3), ZMM(25)) \
+  VBROADCASTSD(MEM(RBX, (8 * n + 9) * 8), ZMM(30)) \
+  VFMADD231PD(ZMM(0), ZMM(4), ZMM(26)) \
+  VMOVAPD(MEM(RAX, (12 * n + 0) * 16), ZMM(0)) \
+  VFMADD231PD(ZMM(1), ZMM(4), ZMM(27)) \
+  VMOVAPD(MEM(RAX, (12 * n + 4) * 16), ZMM(1)) \
+  VFMADD231PD(ZMM(2), ZMM(4), ZMM(28)) \
+  VMOVAPD(MEM(RAX, (12 * n + 8) * 16), ZMM(2))
+
+  /**********************************************************/
+  /* 4x4 transpose of the 128-bit lanes of R0, R1, R2, R3,  */
+  /* done in place; ZMM26-29 are overwritten as temporaries */
+  /**********************************************************/
+#define TRANSPOSE(R0, R1, R2, R3) \
+  VSHUFF64X2(IMM(0x88), ZMM(R1), ZMM(R0), ZMM(26)) \
+  VSHUFF64X2(IMM(0xDD), ZMM(R1), ZMM(R0), ZMM(27)) \
+  VSHUFF64X2(IMM(0x88), ZMM(R3), ZMM(R2), ZMM(28)) \
+  VSHUFF64X2(IMM(0xDD), ZMM(R3), ZMM(R2), ZMM(29)) \
+  VSHUFF64X2(IMM(0x88), ZMM(28), ZMM(26), ZMM(R0)) \
+  VSHUFF64X2(IMM(0xDD), ZMM(28), ZMM(26), ZMM(R2)) \
+  VSHUFF64X2(IMM(0x88), ZMM(29), ZMM(27), ZMM(R1)) \
+  VSHUFF64X2(IMM(0xDD), ZMM(29), ZMM(27), ZMM(R3))
+
+
+// Eight 1.0 values, loaded into a ZMM register as the multiplier operand of VFMADDSUB132PD.
+static double offsets[8] __attribute__((aligned(64)))
+                                 = {1, 1, 1, 1, 1, 1, 1, 1};
+
+/**********************************************************/
+/* Kernel : bli_zgemm_zen4_asm_12x4                       */
+/* It performs  C = C * beta + alpha * A * B              */
+/* It is col preferred kernel, A and B are packed         */
+/* C could be Row/Col/Gen Stored Matrix                   */
+/* Registers are allocated as below                       */
+/* Load A :  ZMM(0-2)                                     */
+/* Pre Broadcast B :  ZMM(29,30)                          */
+/* Broadcast B :  ZMM(3,4)                                */
+/* Accumulation of A(real,imag)*Breal :                   */
+/*       ZMM(5-7,11-13,17-19,23-25)                       */
+/* Accumulation of A(real,imag)*Bimag :                   */
+/*       ZMM(8-10,14-16,20-22,26-28)                      */
+/* Computation of A(real,imag)*B(real,imag):              */
+/*       ZMM(5-7,11-13,17-19,23-25)                       */
+/* Registers used for load and broadcast could be         */
+/* used for alpha, beta scaling                           */
+/* alphar : ZMM0, alphai : ZMM1                           */
+/* betar  : ZMM2, betai  : ZMM3                           */
+/* Techniques used in kernel                              */
+/* 1. k loop is sub divided in to 4 loops                 */
+/*    a. iter = k/4-TAIL_NITER-4,  ZMM = A*B              */
+/*    b. iter = 4, ZMM = A*B,                             */
+/*       Prefetch C mem in anticipation of a write.       */
+/*    c. iter = TAIL_NITER, ZMM = A*B                     */
+/*    All above loops are unrolled 4 times                */
+/*    d. iter = k%4, ZMM = A*B, k remainder is executed   */
+/* 2. If alpha/beta imag = 0 and alpha/beta real = 0/1/-1 */
+/*    Scale with real value(Should not be 0/1/-1)         */
+/**********************************************************/
+void bli_zgemm_zen4_asm_12x4(
+    dim_t k0,
+    dcomplex *restrict alpha,
+    dcomplex *restrict a,
+    dcomplex *restrict b,
+    dcomplex *restrict beta,
+    dcomplex *restrict c, inc_t rs_c0, inc_t cs_c0,
+    auxinfo_t *data,
+    cntx_t *restrict cntx)
+{
+  AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+  const uint64_t k = k0;
+  /*rowstride * size of one dcomplex element*/
+  uint64_t rs_c = rs_c0 * 16;
+  /*colstride * size of one dcomplex element*/
+  uint64_t cs_c = cs_c0 * 16;
+  const double *offsetPtr = &offsets[0];
+
+  uint64_t alpha_mul_type = BLIS_MUL_DEFAULT;
+  uint64_t beta_mul_type = BLIS_MUL_DEFAULT;
+
+  if (alpha->imag == 0.0)
+  {
+    if (alpha->real == 1.0)
+      alpha_mul_type = BLIS_MUL_ONE;
+    else if (alpha->real == -1.0)
+      alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    else if (alpha->real == 0.0)
+      alpha_mul_type = BLIS_MUL_ZERO;
+  }
+
+  if (beta->imag == 0.0)
+  {
+    if (beta->real == 1.0)
+      beta_mul_type = BLIS_MUL_ONE ;
+    else if (beta->real == -1.0)
+      beta_mul_type = BLIS_MUL_MINUS_ONE;
+    else if (beta->real == 0.0)
+      beta_mul_type = BLIS_MUL_ZERO;
+  }
+
+  BEGIN_ASM()
+
+  // Initialise accumulation registers to zero
+  VXORPD(ZMM(5), ZMM(5), ZMM(5))
+  VXORPD(ZMM(6), ZMM(6), ZMM(6))
+  VXORPD(ZMM(7), ZMM(7), ZMM(7))
+  VXORPD(ZMM(8), ZMM(8), ZMM(8))
+  VXORPD(ZMM(9), ZMM(9), ZMM(9))
+  VXORPD(ZMM(10), ZMM(10), ZMM(10))
+  VXORPD(ZMM(11), ZMM(11), ZMM(11))
+  VXORPD(ZMM(12), ZMM(12), ZMM(12))
+  VXORPD(ZMM(13), ZMM(13), ZMM(13))
+  VXORPD(ZMM(14), ZMM(14), ZMM(14))
+  VXORPD(ZMM(15), ZMM(15), ZMM(15))
+  VXORPD(ZMM(16), ZMM(16), ZMM(16))
+  VXORPD(ZMM(17), ZMM(17), ZMM(17))
+  VXORPD(ZMM(18), ZMM(18), ZMM(18))
+  VXORPD(ZMM(19), ZMM(19), ZMM(19))
+  VXORPD(ZMM(20), ZMM(20), ZMM(20))
+  VXORPD(ZMM(21), ZMM(21), ZMM(21))
+  VXORPD(ZMM(22), ZMM(22), ZMM(22))
+  VXORPD(ZMM(23), ZMM(23), ZMM(23))
+  VXORPD(ZMM(24), ZMM(24), ZMM(24))
+  VXORPD(ZMM(25), ZMM(25), ZMM(25))
+  VXORPD(ZMM(26), ZMM(26), ZMM(26))
+  VXORPD(ZMM(27), ZMM(27), ZMM(27))
+  VXORPD(ZMM(28), ZMM(28), ZMM(28))
+
+  MOV(VAR(k), RSI)
+
+  // load address of buff to reg
+  MOV(VAR(a), RAX)
+  MOV(VAR(b), RBX)
+  MOV(VAR(c), RCX)
+
+  // load R9 with address of C buff to be used during prefetch
+  MOV(RCX, R9)
+  ADD(IMM(63), R9)
+
+  // pre-load first 12 elements of a to ZMM(0-2)
+  VMOVAPD(MEM(RAX, 0 * 16), ZMM(0))
+  VMOVAPD(MEM(RAX, 4 * 16), ZMM(1))
+  VMOVAPD(MEM(RAX, 8 * 16), ZMM(2))
+  // broadcast breal to ZMM29 and bimag to ZMM30
+  VBROADCASTSD(MEM(RBX, 0), ZMM(29))
+  VBROADCASTSD(MEM(RBX, 8), ZMM(30))
+  LEA(MEM(RAX, 12 * 16), RAX) // adjust a after pre-load
+
+  MOV(VAR(cs_c), R10)
+
+  MOV(RSI, RDI)
+  AND(IMM(3), RSI)
+  SAR(IMM(2), RDI)
+
+  /******************************************************************/
+  /* Operation:                                                     */
+  /* SUBITER = (Ar, Ai)*(Br, Bi) = (Ar, Ai)*Br , (Ar, Ai)*Bi        */
+  /* ZMMR1 = (Ar*Br, Ai*Br), ZMMR2 = (Ar*Bi, Ai*Bi)                 */
+  /* ITER_K_LOOP: Loop count depends on k and TAIL_NITER            */
+  /*              iter = k/4 - 4 - TAIL_NITER                       */
+  /* ITER_4: Fixed loop executed 4 times hence iter = 4             */
+  /* TAILNITER: Fixed loop executed TAIL_NITER times hence          */
+  /*            iter = TAIL_NITER                                   */
+  /* Tail: Leftover k values are executed here, iter = k%4          */
+  /* k loop is divided in above way to have a fixed distance to     */
+  /* prefetch C.                                                    */
+  /******************************************************************/
+  SUB(IMM(4 + TAIL_NITER), RDI)
+  JLE(K_REMAINDER)
+
+  LOOP_ALIGN
+  /*******************************************************/
+  /* ITER_K_LOOP: iter = k/4 - 4 - TAIL_NITER            */
+  /* (Ar, Ai)*(Br, Bi) is executed                       */
+  /* Loop is unrolled 4 times                            */
+  /*******************************************************/
+  LABEL(ITER_K_LOOP)
+
+  SUBITER(0)
+  SUBITER(1)
+  SUB(IMM(1), RDI)
+  SUBITER(2)
+  SUBITER(3)
+
+  LEA(MEM(RAX, 4 * 12 * 16), RAX)
+  LEA(MEM(RBX, 4 * 4 * 16), RBX)
+
+  JNZ(ITER_K_LOOP)
+
+  LABEL(K_REMAINDER)
+
+  ADD(IMM(4), RDI)
+  JLE(TAILNITER)
+
+  LOOP_ALIGN
+  /*******************************************************/
+  /* ITER_4: iter = 4                                    */
+  /* (Ar, Ai)*(Br, Bi) is executed                       */
+  /* C is prefetched to L1/L2 cache line with            */
+  /* anticipation of write                               */
+  /* Loop is unrolled 4 times                            */
+  /*******************************************************/
+  LABEL(ITER_4)
+
+  PREFETCHW0(MEM(R9))
+  SUBITER(0)
+
+  SUBITER(1)
+  PREFETCHW0(MEM(R9, 64))
+
+  SUB(IMM(1), RDI)
+  SUBITER(2)
+  PREFETCHW0(MEM(R9, 128))
+  SUBITER(3)
+
+  LEA(MEM(RAX, 4 * 12 * 16), RAX)
+  LEA(MEM(RBX, 4 * 4 * 16), RBX)
+  LEA(MEM(R9, R10, 1), R9)
+
+  JNZ(ITER_4)
+
+  /*******************************************************/
+  /* TAILNITER: iter = TAILNITER                         */
+  /* (Ar, Ai)*(Br, Bi) is executed                       */
+  /* Loop is unrolled 4 times                            */
+  /*******************************************************/
+  LABEL(TAILNITER)
+
+  ADD(IMM(0 + TAIL_NITER), RDI)
+  JLE(TAIL)
+
+  LOOP_ALIGN
+  LABEL(TAILNITER_LOOP)
+
+  SUBITER(0)
+  SUBITER(1)
+  SUB(IMM(1), RDI)
+  SUBITER(2)
+  SUBITER(3)
+
+  LEA(MEM(RAX, 4 * 12 * 16), RAX)
+  LEA(MEM(RBX, 4 * 4 * 16), RBX)
+
+  JNZ(TAILNITER_LOOP)
+
+  LABEL(TAIL)
+
+  TEST(RSI, RSI)
+  JZ(POSTACCUM)
+
+  LOOP_ALIGN
+  /*******************************************************/
+  /* TAILNITER: iter = k%4                               */
+  /* (Ar, Ai)*(Br, Bi) is executed                       */
+  /*******************************************************/
+  LABEL(TAIL_LOOP)
+
+  SUB(IMM(1), RSI)
+  SUBITER(0)
+  LEA(MEM(RAX, 12 * 16), RAX)
+  LEA(MEM(RBX, 4 * 16), RBX)
+
+  JNZ(TAIL_LOOP)
+
+  LABEL(POSTACCUM)
+
+  /**************************************************/
+  /* Permute imag component register. Shuffle even  */
+  /* and odd components                             */
+  /* SRC: ZMM8 =(Ar0*Bi0, Ai0*Bi0, Ar1*Bi0, Ai1*Bi0)*/
+  /* DST: ZMM8 =(Ai0*Bi0, Ar0*Bi0, Ai1*Bi0, Ar1*Bi0)*/
+  /**************************************************/
+  VPERMILPD(IMM(0x55), ZMM8, ZMM8)
+  VPERMILPD(IMM(0x55), ZMM9, ZMM9)
+  VPERMILPD(IMM(0x55), ZMM10, ZMM10)
+  VPERMILPD(IMM(0x55), ZMM14, ZMM14)
+  VPERMILPD(IMM(0x55), ZMM15, ZMM15)
+  VPERMILPD(IMM(0x55), ZMM16, ZMM16)
+  VPERMILPD(IMM(0x55), ZMM20, ZMM20)
+  VPERMILPD(IMM(0x55), ZMM21, ZMM21)
+  VPERMILPD(IMM(0x55), ZMM22, ZMM22)
+  VPERMILPD(IMM(0x55), ZMM26, ZMM26)
+  VPERMILPD(IMM(0x55), ZMM27, ZMM27)
+  VPERMILPD(IMM(0x55), ZMM28, ZMM28)
+
+  MOV(VAR(offsetPtr), R14)
+  VMOVAPD(MEM(R14), ZMM(0))
+  /***************************************************/
+  /* SRC: ZMM5 = (Ar0*Br0, Ai0*Br0, Ar1*Br0, Ai1*Br0)*/
+  /* SRC: ZMM8 = (Ai0*Bi0, Ar0*Bi0, Ai1*Bi0, Ar1*Bi0)*/
+  /* DST: ZMM5 =(Ar0*Br0-Ai0*Bi0, Ai0*Br0+Ar0*Bi0,   */
+  /*             Ar1*Br0-Ai1*Bi0, Ai1*Br0+Ar1*Bi0)   */
+  /***************************************************/
+  VFMADDSUB132PD(ZMM(0), ZMM(8), ZMM(5))
+  VFMADDSUB132PD(ZMM(0), ZMM(9), ZMM(6))
+  VFMADDSUB132PD(ZMM(0), ZMM(10), ZMM(7))
+  VFMADDSUB132PD(ZMM(0), ZMM(14), ZMM(11))
+  VFMADDSUB132PD(ZMM(0), ZMM(15), ZMM(12))
+  VFMADDSUB132PD(ZMM(0), ZMM(16), ZMM(13))
+  VFMADDSUB132PD(ZMM(0), ZMM(20), ZMM(17))
+  VFMADDSUB132PD(ZMM(0), ZMM(21), ZMM(18))
+  VFMADDSUB132PD(ZMM(0), ZMM(22), ZMM(19))
+  VFMADDSUB132PD(ZMM(0), ZMM(26), ZMM(23))
+  VFMADDSUB132PD(ZMM(0), ZMM(27), ZMM(24))
+  VFMADDSUB132PD(ZMM(0), ZMM(28), ZMM(25))
+
+  LABEL(STORE)
+  MOV(VAR(offsetPtr), RDI)
+  VMOVAPD(MEM(RDI), ZMM(4))
+  /*Load alpha and beta values*/
+  MOV(VAR(alpha), RAX)
+  VBROADCASTSD(MEM(RAX, 0), ZMM(0))
+  VBROADCASTSD(MEM(RAX, 8), ZMM(1))
+  MOV(VAR(beta), RBX)
+  VBROADCASTSD(MEM(RBX, 0), ZMM(2))
+  VBROADCASTSD(MEM(RBX, 8), ZMM(3))
+   /************************************************/
+  /* C = (betaR, betaI)*(C)+(alphaR, alphaI)*(A*B) */
+  /* ALPHA_SCALE: C = CInter1 + CInter2            */
+  /* When alphaI=0                                 */
+  /* ALPHA_ZERO:     alphaR=0  => CInter2 = 0      */
+  /* ALPHA_REAL_ONE: alphaR=1  => CInter2 = A*B    */
+  /* ALPHA_MINUS_ONE:alphaR=-1 => CInter2 = -A*B   */
+  /*************************************************/
+  MOV(VAR(alpha_mul_type), R14)
+
+  CMP(IMM(1), R14) // Check if alpha = 1.0
+  JE(ALPHA_SCALE_DONE)
+
+  CMP(IMM(0), R14) // Check if alpha = 0.0
+  JE(ALPHA_ZERO)
+
+  LABEL(ALPHA_SCALE)
+  CMP(IMM(2), R14) // Check for BLIS_MUL_DEFAULT
+
+  JNE(ALPHA_MINUS_ONE)
+  SCALE3R_BY_ALPHA(5, 6, 7)
+  SCALE3R_BY_ALPHA(11, 12, 13)
+  SCALE3R_BY_ALPHA(17, 18, 19)
+  SCALE3R_BY_ALPHA(23, 24, 25)
+  JMP(ALPHA_SCALE_DONE)
+
+  LABEL(ALPHA_MINUS_ONE)
+  VXORPD(ZMM8, ZMM8, ZMM8)
+  VSUBPD(ZMM(5), ZMM(8), ZMM(5))
+  VSUBPD(ZMM(6), ZMM(8), ZMM(6))
+  VSUBPD(ZMM(7), ZMM(8), ZMM(7))
+  VSUBPD(ZMM(11), ZMM(8), ZMM(11))
+  VSUBPD(ZMM(12), ZMM(8), ZMM(12))
+  VSUBPD(ZMM(13), ZMM(8), ZMM(13))
+  VSUBPD(ZMM(17), ZMM(8), ZMM(17))
+  VSUBPD(ZMM(18), ZMM(8), ZMM(18))
+  VSUBPD(ZMM(19), ZMM(8), ZMM(19))
+  VSUBPD(ZMM(23), ZMM(8), ZMM(23))
+  VSUBPD(ZMM(24), ZMM(8), ZMM(24))
+  VSUBPD(ZMM(25), ZMM(8), ZMM(25))
+  JMP(ALPHA_SCALE_DONE)
+
+  LABEL(ALPHA_ZERO)
+  SET_REG_TO_ZERO(5, 6, 7)
+  SET_REG_TO_ZERO(11, 12, 13)
+  SET_REG_TO_ZERO(17, 18, 19)
+  SET_REG_TO_ZERO(23, 24, 25)
+
+  LABEL(ALPHA_SCALE_DONE)
+  MOV(VAR(rs_c), RDI)
+  LEA(MEM(RDI, RDI, 2), R12)
+  MOV(VAR(cs_c), RSI)
+
+  CMP(IMM(16), RDI) // Check if C is column stored
+
+  JNZ(ROWSTORED) // Jump to row stored
+  /************************************************/
+  /* C = (betaR, betaI)*(C)+(alphaR, alphaI)*(A*B)*/
+  /* BETA_SCALE : C = CInter1 + CInter2           */
+  /* When betaI = 0                               */
+  /* BETAZERO:    betaR=0  => CInter1 = 0         */
+  /* BETA_ONE:    betaR=1  => CInter1 = C         */
+  /* BETA_MINUS1: betaR=-1 => CInter1 = -C        */
+  /************************************************/
+  MOV(VAR(beta_mul_type), R14)
+  CMP(IMM(0), R14) // Check if betaR = 0.0
+  JE(BETAZERO)
+
+  CMP(IMM(1), R14)
+  JE(BETA_ONE) // Check if betaR = 1.0
+
+  CMP(IMM(2), R14) // Check for betaR = AnyValue(It should not be 0,1,-1)
+  JE(BETA_SCALE)
+
+  LABEL(BETA_MINUS1)
+  UPDATE_C_BETAMINUS1(5, 6, 7)
+  UPDATE_C_BETAMINUS1(11, 12, 13)
+  UPDATE_C_BETAMINUS1(17, 18, 19)
+  UPDATE_C_BETAMINUS1(23, 24, 25)
+  JMP(END)
+
+  LABEL(BETA_ONE)
+  UPDATE_C_BETA1(5, 6, 7)
+  UPDATE_C_BETA1(11, 12, 13)
+  UPDATE_C_BETA1(17, 18, 19)
+  UPDATE_C_BETA1(23, 24, 25)
+  JMP(END)
+
+  LABEL(BETA_SCALE)
+  UPDATE_C_BETASCALE(5, 6, 7)
+  UPDATE_C_BETASCALE(11, 12, 13)
+  UPDATE_C_BETASCALE(17, 18, 19)
+  UPDATE_C_BETASCALE(23, 24, 25)
+  JMP(END)
+
+  LABEL(BETAZERO)
+  STORE_C(5, 6, 7)
+  STORE_C(11, 12, 13)
+  STORE_C(17, 18, 19)
+  STORE_C(23, 24, 25)
+  JMP(END)
+
+  LABEL(ROWSTORED)
+  CMP(IMM(16), RSI) // Check if C is row stored
+  JNZ(GENSTORED) // Jump to gen stored
+  MOV(VAR(cs_c), RDI)
+  MOV(VAR(rs_c), RSI)
+  LEA(MEM(RDI, RDI, 2), R12)    // r12 =  3*rs_c;
+
+  TRANSPOSE(5, 11, 17, 23)
+  TRANSPOSE(6, 12, 18, 24)
+  TRANSPOSE(7, 13, 19, 25)
+
+  /************************************************/
+  /* C = (betaR, betaI)*(C)+(alphaR, alphaI)*(A*B)*/
+  /* BETA_SCALE : C = CInter1 + CInter2           */
+  /* When betaI = 0                               */
+  /* BETAZERO:    betaR=0  => CInter1 = 0         */
+  /* BETA_ONE:    betaR=1  => CInter1 = C         */
+  /* BETA_MINUS1: betaR=-1 => CInter1 = -C        */
+  /************************************************/
+  MOV(VAR(beta_mul_type), R14)
+  CMP(IMM(0), R14) // Check if betaR = 0.0
+  JE(BETAZERO_ROW)
+
+  CMP(IMM(1), R14)
+  JE(BETA_ONE_ROW) // Check if betaR = 1.0
+
+  CMP(IMM(2), R14) // Check for betaR = AnyValue(It should not be 0,1,-1)
+  JE(BETA_SCALE_ROW)
+
+  LABEL(BETA_MINUS1_ROW)
+  UPDATE_C_BETAMINUS1_ROW(5, 11, 17, 23)
+  UPDATE_C_BETAMINUS1_ROW(6, 12, 18, 24)
+  UPDATE_C_BETAMINUS1_ROW(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(BETA_ONE_ROW)
+  UPDATE_C_BETA1_ROW(5, 11, 17, 23)
+  UPDATE_C_BETA1_ROW(6, 12, 18, 24)
+  UPDATE_C_BETA1_ROW(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(BETA_SCALE_ROW)
+  UPDATE_C_BETASCALE_ROW(5, 11, 17, 23)
+  UPDATE_C_BETASCALE_ROW(6, 12, 18, 24)
+  UPDATE_C_BETASCALE_ROW(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(BETAZERO_ROW)
+  STORE_C_ROW(5, 11, 17, 23)
+  STORE_C_ROW(6, 12, 18, 24)
+  STORE_C_ROW(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(GENSTORED)
+  MOV(VAR(rs_c), RSI)
+  MOV(VAR(cs_c), RDI)
+  LEA(MEM(RDI, RDI, 2), R12)
+
+  TRANSPOSE(5, 11, 17, 23)
+  TRANSPOSE(6, 12, 18, 24)
+  TRANSPOSE(7, 13, 19, 25)
+
+  /************************************************/
+  /* C = (betaR, betaI)*(C)+(alphaR, alphaI)*(A*B)*/
+  /* BETA_SCALE : C = CInter1 + CInter2           */
+  /* When betaI = 0                               */
+  /* BETAZERO:    betaR=0  => CInter1 = 0         */
+  /* BETA_ONE:    betaR=1  => CInter1 = C         */
+  /* BETA_MINUS1: betaR=-1 => CInter1 = -C        */
+  /************************************************/
+  MOV(VAR(beta_mul_type), R14)
+  CMP(IMM(0), R14) // Check if betaR = 0.0
+  JE(BETAZERO_GEN)
+  CMP(IMM(2), R14) // Check for betaR = AnyValue(It should not be 0,1,-1)
+  JE(BETA_SCALE_GEN)
+  CMP(IMM(1), R14)
+  JE(BETA_ONE_GEN) // Check if betaR = 1.0
+
+  LABEL(BETA_MINUS1_GEN)
+  UPDATE_C_BETAMINUS1_GEN(5, 11, 17, 23)
+  UPDATE_C_BETAMINUS1_GEN(6, 12, 18, 24)
+  UPDATE_C_BETAMINUS1_GEN(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(BETA_ONE_GEN)
+  UPDATE_C_BETA1_GEN(5, 11, 17, 23)
+  UPDATE_C_BETA1_GEN(6, 12, 18, 24)
+  UPDATE_C_BETA1_GEN(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(BETA_SCALE_GEN)
+  UPDATE_C_BETASCALE_GEN(5, 11, 17, 23)
+  UPDATE_C_BETASCALE_GEN(6, 12, 18, 24)
+  UPDATE_C_BETASCALE_GEN(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(BETAZERO_GEN)
+  EXTRACT_STORE_C_GEN(5, 11, 17, 23)
+  EXTRACT_STORE_C_GEN(6, 12, 18, 24)
+  EXTRACT_STORE_C_GEN(7, 13, 19, 25)
+  JMP(END)
+
+  LABEL(END)
+  VZEROUPPER()
+
+  end_asm(
+    :                 // output operands (none)
+    :                 // input operands
+    [a] "m"(a),       // 1
+    [k] "m"(k),       // 2
+    [b] "m"(b),       // 3
+    [c] "m"(c),       // 8
+    [rs_c] "m"(rs_c), // 9
+    [cs_c] "m"(cs_c), // 10,
+    [alpha] "m"(alpha),
+    [beta] "m"(beta),
+    [offsetPtr] "m"(offsetPtr),
+    [alpha_mul_type] "m"(alpha_mul_type),
+    [beta_mul_type] "m"(beta_mul_type)
+    : // register clobber list
+    "rax", "rbx", "rcx", "rdi", "rsi", "r9", "r10", "r12", "r14",
+    "xmm8", "xmm9", "xmm10",
+    "ymm8", "ymm9",
+    "zmm0", "zmm1", "zmm2",
+    "zmm3", "zmm4", "zmm5", "zmm6",  "zmm7", "zmm8",
+    "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14",
+    "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20",
+    "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+    "zmm27", "zmm28", "zmm29", "zmm30", "zmm31", "memory"
+  )
+
+  AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/CMakeLists.txt b/kernels/zen4/3/sup/CMakeLists.txt
new file mode 100644
index 0000000000..81e194ef64
--- /dev/null
+++ b/kernels/zen4/3/sup/CMakeLists.txt
@@ -0,0 +1,21 @@
+##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
+
+# Object library built from the zen4 gemmsup kernel sources of this directory.
+add_library(zen4_3sup OBJECT
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64.h"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64m.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rd_zen_s6x64n.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64.h"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64m.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_rv_zen_s6x64n.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_24x8m.c"
+  "${CMAKE_CURRENT_SOURCE_DIR}/bli_gemmsup_cv_zen4_z12x4m.c"
+)
+target_compile_options(zen4_3sup PRIVATE /arch:AVX2 /arch:AVX512)  # MSVC-style ISA switches
+if(BUILD_SHARED_LIBS)
+  target_compile_definitions(zen4_3sup PUBLIC BLIS_IS_BUILDING_LIBRARY)  # leading -D is implied
+endif()
+
+add_subdirectory(d24x8)
diff --git a/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
new file mode 100644
index 0000000000..97ac0985dc
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_dgemmsup_rv_zen4_asm_24x8m.c
@@ -0,0 +1,9692 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/** SHUFFLE_DATA: four VSHUFF64X2 shuffles of whole 128-bit lanes (2 doubles per lane).
+ * imm8 0x88 picks lanes 0,2 and 0xDD lanes 1,3 of each source: (S1,S2)->D1,D2 and (S3,S4)->D3,D4.
+ * NOTE: the last macro line ends in a continuation, so the line after the macro must stay blank.
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * UNPACK_LO_HIGH: within every 128-bit lane, interleaves the low doubles of S1 and S2 into D1
+ * (vunpcklpd) and the high doubles into D2 (vunpckhpd); S3 and S4 are treated the same way,
+ * giving D3 and D4. Example for one source pair (element order as given by the original author):
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * UPDATE_C: for 8 vectors of C, zmmN += C * zmm31 (vfmadd231pd with C as memory operand; zmm31 is
+ * presumably broadcast beta), then zmmN is stored back. Addresses: rcx + {0, rsi, 2*rsi, r12, 4*rsi,
+ * r13, 2*r12, rdx}; finally rcx += r14. NOTE(review): r12/r13/rdx/r14 are set by the caller - confirm.
+*/
+#define UPDATE_C \
+\
+    vfmadd231pd( mem(rcx),zmm31,zmm0 )   /*zmm0 += C * zmm31 (presumably the beta scaling)*/ \
+    vmovupd( zmm0, (rcx) )            /*Stores the updated vector back to C*/\
+\
+    vfmadd231pd( mem(rcx, rsi, 1),zmm31,zmm4 ) \
+    vmovupd( zmm4, (rcx, rsi, 1) )\
+\
+    vfmadd231pd( mem(rcx, rsi, 2),zmm31,zmm2 ) \
+    vmovupd( zmm2, (rcx, rsi, 2) )\
+\
+    vfmadd231pd( mem(rcx, r12, 1),zmm31,zmm6 ) \
+    vmovupd( zmm6, (rcx, r12, 1) )\
+\
+    vfmadd231pd( mem(rcx, rsi, 4),zmm31,zmm1 ) \
+    vmovupd( zmm1, (rcx, rsi, 4) )\
+\
+    vfmadd231pd( mem(rcx, r13, 1),zmm31,zmm5 ) \
+    vmovupd( zmm5, (rcx, r13, 1) )\
+\
+    vfmadd231pd( mem(rcx, r12, 2),zmm31,zmm3 ) \
+    vmovupd( zmm3, (rcx, r12, 2) )\
+\
+    vfmadd231pd( mem(rcx, rdx, 1),zmm31,zmm8 ) \
+    vmovupd( zmm8, (rcx, rdx, 1) )\
+    add(r14, rcx)
+
+
+/** UPDATE_C_BZ: store-only path, C is never read (presumably the beta == 0 case).
+ * Writes zmm0,4,2,6,1,5,3,8 to rcx + {0, rsi, 2*rsi, r12, 4*rsi, r13, 2*r12, rdx},
+ * then advances rcx by r14. */
+#define UPDATE_C_BZ \
+\
+    vmovupd( zmm0, (rcx) )            /*Stores back to C*/ \
+\
+    vmovupd( zmm4, (rcx, rsi, 1) ) \
+\
+    vmovupd( zmm2, (rcx, rsi, 2) ) \
+\
+    vmovupd( zmm6, (rcx, r12, 1) ) \
+\
+    vmovupd( zmm1, (rcx, rsi, 4) ) \
+\
+    vmovupd( zmm5, (rcx, r13, 1) ) \
+\
+    vmovupd( zmm3, (rcx, r12, 2) ) \
+\
+    vmovupd( zmm8, (rcx, rdx, 1) ) \
+    add(r14, rcx)
+
+/**
+ * UPDATE_MASKED_C: each of 8 C vectors is loaded under mask k2 (MASK_KZ(2), presumably zero-masking)
+ * into a scratch register (zmm30/10/12/16/14/18; zmm10 and zmm12 are reused) and acc += zmm31 * scratch.
+ * After all 8 updates the accumulators are stored under the same mask (MASK_(k(2))); then rcx += r14.
+*/
+#define UPDATE_MASKED_C \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(2) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(2)))            /*Masked stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(2)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(2)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(2)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(2)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(2)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(2)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(2)))\
+    add(r14, rcx)
+
+/** UPDATE_MASKED_C_BZ: store-only path, C is never read (presumably the beta == 0 case).
+ * Writes zmm0,4,2,6,1,5,3,8 to rcx + {0, rsi, 2*rsi, r12, 4*rsi, r13, 2*r12, rdx} under mask k2
+ * (elements whose mask bit is clear are presumably left untouched); then advances rcx by r14. */
+#define UPDATE_MASKED_C_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(2))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(2))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(2)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(2)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(2))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(2))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(2))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(2))) \
+    add(r14, rcx)
+
+/* These kernels assume that the A matrix is stored in col-major order.
+ * B matrix can be col/row-major
+ * C matrix can be col/row-major
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x8m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // n0 is actually n_left which is calculated at JR loop.
+    uint64_t n_left = (uint64_t)n0 % 8;
+
+    // First check whether this is a edge case in the n dimension. If so,
+    // dispatch other nx? kernels, as needed
+    if( n_left )
+    {
+        dgemmsup_ker_ft ker_fps[8] =
+        {
+          NULL,
+          bli_dgemmsup_rv_zen4_asm_24x1m,
+          bli_dgemmsup_rv_zen4_asm_24x2m,
+          bli_dgemmsup_rv_zen4_asm_24x3m,
+          bli_dgemmsup_rv_zen4_asm_24x4m,
+          bli_dgemmsup_rv_zen4_asm_24x5m,
+          bli_dgemmsup_rv_zen4_asm_24x6m,
+          bli_dgemmsup_rv_zen4_asm_24x7m,
+        };
+
+        dgemmsup_ker_ft ker_fp = ker_fps[ n_left ];
+
+        ker_fp
+        (
+          conja, conjb, m0, n_left, k0,
+          alpha, abuf, rs_a0, cs_a0, bbuf, rs_b0, cs_b0,
+          beta, cbuf, rs_c0, cs_c0, data, cntx
+        );
+
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+        // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+        //is also used to traverse B matrix
+        lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+        // if n > 4, a second pointer which point to r11 + 4*cs_b
+        //is also used to prefetch from B matrix
+        lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-29 are used for accumulation
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+        vxorpd(zmm10, zmm10, zmm10)
+        vxorpd(zmm11, zmm11, zmm11)
+        vxorpd(zmm26, zmm26, zmm26)
+        vxorpd(zmm12, zmm12, zmm12)
+        vxorpd(zmm13, zmm13, zmm13)
+        vxorpd(zmm27,zmm27, zmm27)
+        vxorpd(zmm14, zmm14, zmm14)
+        vxorpd(zmm15, zmm15, zmm15)
+        vxorpd(zmm24, zmm24, zmm24)
+        vxorpd(zmm16, zmm16, zmm16)
+        vxorpd(zmm17, zmm17, zmm17)
+        vxorpd(zmm25, zmm25, zmm25)
+        vxorpd(zmm18, zmm18, zmm18)
+        vxorpd(zmm19, zmm19, zmm19)
+        vxorpd(zmm22, zmm22, zmm22)
+        vxorpd(zmm20, zmm20, zmm20)
+        vxorpd(zmm21,zmm21, zmm21)
+        vxorpd(zmm23, zmm23, zmm23)
+
+        // K is unrolled by 8 to facilitate prefetch of B
+        // Assuming B to be col-stored, for each iteration of K,
+        //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 8+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)                                   // entry to LOOP2, the k-loop phase that also prefetches C
+        add(imm(8), rsi)                                       // i += NR; NOTE(review): presumably undoes part of a bias applied to i before LOOP1 - confirm
+        jle(.TAILITER)                                         // skip LOOP2 if i <= 0.
+
+        label(.LOOP2)                                          // each pass: 8 unrolled k-iterations + prefetch of 3 cache lines of C at rdx
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C, 1st cache line (offset 0)
+            vmovupd( mem(rax),zmm0 )                           // load A for iteration 1 (24 doubles -> zmm0-2)
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // preload A for iteration 2 (zmm3-5)
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B via b_next (r11 in iterations 1-4, r15 in 5-8)
+            vbroadcastsd( mem(rbx),zmm30 )  // B elements are broadcast one at a time; zmm30 and zmm31 alternate
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )  // zmm6 += zmm0 * zmm30; each B element feeds 3 accumulators
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )  // remaining 4 B elements are read through the second B pointer (r12)
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C, 2nd cache line (offset 64)
+            vmovupd( mem(rax),zmm0 )                           // preload A for iteration 3 (zmm0-2); this iteration computes with zmm3-5
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C, 3rd cache line (offset 128)
+            vmovupd( mem(rax),zmm3 )                           // preload A for iteration 4 (zmm3-5); this iteration computes with zmm0-2
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // preload A for iteration 5 (zmm0-2); this iteration computes with zmm3-5
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // preload A for iteration 6 (zmm3-5); this iteration computes with zmm0-2
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B via the second b_next pointer (r15)
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // preload A for iteration 7 (zmm0-2); this iteration computes with zmm3-5
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // preload A for iteration 8 (zmm3-5); this iteration computes with zmm0-2
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 8 (no A load: computes with zmm3-5 loaded in iteration 7)
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c (next pass prefetches the next column of C)
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm20 )
+            vfmadd231pd( zmm4,zmm31,zmm21 )
+            vfmadd231pd( zmm5,zmm31,zmm23 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            vbroadcastsd( mem(r12,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm20 )
+            vfmadd231pd( zmm1,zmm31,zmm21 )
+            vfmadd231pd( zmm2,zmm31,zmm23 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+        vmulpd( zmm30,zmm14,zmm14 )
+        vmulpd( zmm30,zmm15,zmm15 )
+        vmulpd( zmm30,zmm24,zmm24 )
+        vmulpd( zmm30,zmm16,zmm16 )
+        vmulpd( zmm30,zmm17,zmm17 )
+        vmulpd( zmm30,zmm25,zmm25 )
+        vmulpd( zmm30,zmm18,zmm18 )
+        vmulpd( zmm30,zmm19,zmm19 )
+        vmulpd( zmm30,zmm22,zmm22 )
+        vmulpd( zmm30,zmm20,zmm20 )
+        vmulpd( zmm30,zmm21,zmm21 )
+        vmulpd( zmm30,zmm23,zmm23 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+        lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vfmadd231pd( mem(rdx),zmm31,zmm14)
+        vmovupd( zmm14,(rdx))
+        vfmadd231pd( 0x40(rdx),zmm31,zmm15)
+        vmovupd( zmm15,0x40(rdx))
+        vfmadd231pd( 0x80(rdx),zmm31,zmm24)
+        vmovupd( zmm24,0x80(rdx))
+        vfmadd231pd( mem(rdx,rdi,1),zmm31,zmm16)
+        vmovupd( zmm16,(rdx,rdi,1))
+        vfmadd231pd( 0x40(rdx,rdi,1),zmm31,zmm17)
+        vmovupd( zmm17,0x40(rdx,rdi,1))
+        vfmadd231pd( 0x80(rdx,rdi,1),zmm31,zmm25)
+        vmovupd( zmm25,0x80(rdx,rdi,1))
+        vfmadd231pd( mem(rdx,rdi,2),zmm31,zmm18)
+        vmovupd( zmm18,(rdx,rdi,2))
+        vfmadd231pd( 0x40(rdx,rdi,2),zmm31,zmm19)
+        vmovupd( zmm19,0x40(rdx,rdi,2))
+        vfmadd231pd( 0x80(rdx,rdi,2),zmm31,zmm22)
+        vmovupd( zmm22,0x80(rdx,rdi,2))
+        vfmadd231pd( mem(rdx,r13,1),zmm31,zmm20)
+        vmovupd( zmm20,(rdx,r13,1))
+        vfmadd231pd( 0x40(rdx,r13,1),zmm31,zmm21)
+        vmovupd( zmm21,0x40(rdx,r13,1))
+        vfmadd231pd( 0x80(rdx,r13,1),zmm31,zmm23)
+        vmovupd( zmm23,0x80(rdx,r13,1))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_C
+        //First 8x8 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_C
+        //Second 8x8 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_C
+        //Third 8x8 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vmovupd( zmm14,(rdx))
+        vmovupd( zmm15,0x40(rdx))
+        vmovupd( zmm24,0x80(rdx))
+        vmovupd( zmm16,(rdx,rdi,1))
+        vmovupd( zmm17,0x40(rdx,rdi,1))
+        vmovupd( zmm25,0x80(rdx,rdi,1))
+        vmovupd( zmm18,(rdx,rdi,2))
+        vmovupd( zmm19,0x40(rdx,rdi,2))
+        vmovupd( zmm22,0x80(rdx,rdi,2))
+        vmovupd( zmm20,(rdx,r13,1))
+        vmovupd( zmm21,0x40(rdx,r13,1))
+        vmovupd( zmm23,0x80(rdx,r13,1))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        UPDATE_C_BZ
+        //First 8x8 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_C_BZ
+        //Second 8x8 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_C_BZ
+        //Third 8x8 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 8;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x8(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x8(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x8(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x7m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+        // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+        // is also used to traverse the B matrix
+        lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+        // if n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+        // is also used to prefetch from the B matrix
+        lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-29 are used for accumulation
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+        vxorpd(zmm10, zmm10, zmm10)
+        vxorpd(zmm11, zmm11, zmm11)
+        vxorpd(zmm26, zmm26, zmm26)
+        vxorpd(zmm12, zmm12, zmm12)
+        vxorpd(zmm13, zmm13, zmm13)
+        vxorpd(zmm27,zmm27, zmm27)
+        vxorpd(zmm14, zmm14, zmm14)
+        vxorpd(zmm15, zmm15, zmm15)
+        vxorpd(zmm24, zmm24, zmm24)
+        vxorpd(zmm16, zmm16, zmm16)
+        vxorpd(zmm17, zmm17, zmm17)
+        vxorpd(zmm25, zmm25, zmm25)
+        vxorpd(zmm18, zmm18, zmm18)
+        vxorpd(zmm19, zmm19, zmm19)
+        vxorpd(zmm22, zmm22, zmm22)
+
+        // K is unrolled by 8 to facilitate prefetching of B.
+        // Assuming B to be col-stored, for each iteration of K,
+        // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 7+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(7), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm18 )
+            vfmadd231pd( zmm4,zmm30,zmm19 )
+            vfmadd231pd( zmm5,zmm30,zmm22 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            vbroadcastsd( mem(r12,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm18 )
+            vfmadd231pd( zmm1,zmm30,zmm19 )
+            vfmadd231pd( zmm2,zmm30,zmm22 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+        vmulpd( zmm30,zmm14,zmm14 )
+        vmulpd( zmm30,zmm15,zmm15 )
+        vmulpd( zmm30,zmm24,zmm24 )
+        vmulpd( zmm30,zmm16,zmm16 )
+        vmulpd( zmm30,zmm17,zmm17 )
+        vmulpd( zmm30,zmm25,zmm25 )
+        vmulpd( zmm30,zmm18,zmm18 )
+        vmulpd( zmm30,zmm19,zmm19 )
+        vmulpd( zmm30,zmm22,zmm22 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+        lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vfmadd231pd( mem(rdx),zmm31,zmm14)
+        vmovupd( zmm14,(rdx))
+        vfmadd231pd( 0x40(rdx),zmm31,zmm15)
+        vmovupd( zmm15,0x40(rdx))
+        vfmadd231pd( 0x80(rdx),zmm31,zmm24)
+        vmovupd( zmm24,0x80(rdx))
+        vfmadd231pd( mem(rdx,rdi,1),zmm31,zmm16)
+        vmovupd( zmm16,(rdx,rdi,1))
+        vfmadd231pd( 0x40(rdx,rdi,1),zmm31,zmm17)
+        vmovupd( zmm17,0x40(rdx,rdi,1))
+        vfmadd231pd( 0x80(rdx,rdi,1),zmm31,zmm25)
+        vmovupd( zmm25,0x80(rdx,rdi,1))
+        vfmadd231pd( mem(rdx,rdi,2),zmm31,zmm18)
+        vmovupd( zmm18,(rdx,rdi,2))
+        vfmadd231pd( 0x40(rdx,rdi,2),zmm31,zmm19)
+        vmovupd( zmm19,0x40(rdx,rdi,2))
+        vfmadd231pd( 0x80(rdx,rdi,2),zmm31,zmm22)
+        vmovupd( zmm22,0x80(rdx,rdi,2))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x7 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x7 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x7 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vmovupd( zmm14,(rdx))
+        vmovupd( zmm15,0x40(rdx))
+        vmovupd( zmm24,0x80(rdx))
+        vmovupd( zmm16,(rdx,rdi,1))
+        vmovupd( zmm17,0x40(rdx,rdi,1))
+        vmovupd( zmm25,0x80(rdx,rdi,1))
+        vmovupd( zmm18,(rdx,rdi,2))
+        vmovupd( zmm19,0x40(rdx,rdi,2))
+        vmovupd( zmm22,0x80(rdx,rdi,2))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi, 2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi,  8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //First 8x7 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x7 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x7 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 7;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x7(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x7(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x7(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x6m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+        // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+        //is also used to traverse B matrix
+        lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+        // if n > 4, a second pointer which point to r11 + 4*cs_b
+        //is also used to prefetch from B matrix
+        lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-17 and zmm24-29 are used for accumulation
+         *                 (3 registers per column of B, 6 columns)
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+        vxorpd(zmm10, zmm10, zmm10)
+        vxorpd(zmm11, zmm11, zmm11)
+        vxorpd(zmm26, zmm26, zmm26)
+        vxorpd(zmm12, zmm12, zmm12)
+        vxorpd(zmm13, zmm13, zmm13)
+        vxorpd(zmm27,zmm27, zmm27)
+        vxorpd(zmm14, zmm14, zmm14)
+        vxorpd(zmm15, zmm15, zmm15)
+        vxorpd(zmm24, zmm24, zmm24)
+        vxorpd(zmm16, zmm16, zmm16)
+        vxorpd(zmm17, zmm17, zmm17)
+        vxorpd(zmm25, zmm25, zmm25)
+
+        // K is unrolled by 8 to facilitate prefetch of B
+        // Assuming B to be col-stored, for each iteration of K,
+        //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 6+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(6), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm16 )
+            vfmadd231pd( zmm4,zmm31,zmm17 )
+            vfmadd231pd( zmm5,zmm31,zmm25 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            vbroadcastsd( mem(r12,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm16 )
+            vfmadd231pd( zmm1,zmm31,zmm17 )
+            vfmadd231pd( zmm2,zmm31,zmm25 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+        vmulpd( zmm30,zmm14,zmm14 )
+        vmulpd( zmm30,zmm15,zmm15 )
+        vmulpd( zmm30,zmm24,zmm24 )
+        vmulpd( zmm30,zmm16,zmm16 )
+        vmulpd( zmm30,zmm17,zmm17 )
+        vmulpd( zmm30,zmm25,zmm25 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+        lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vfmadd231pd( mem(rdx),zmm31,zmm14)
+        vmovupd( zmm14,(rdx))
+        vfmadd231pd( 0x40(rdx),zmm31,zmm15)
+        vmovupd( zmm15,0x40(rdx))
+        vfmadd231pd( 0x80(rdx),zmm31,zmm24)
+        vmovupd( zmm24,0x80(rdx))
+        vfmadd231pd( mem(rdx,rdi,1),zmm31,zmm16)
+        vmovupd( zmm16,(rdx,rdi,1))
+        vfmadd231pd( 0x40(rdx,rdi,1),zmm31,zmm17)
+        vmovupd( zmm17,0x40(rdx,rdi,1))
+        vfmadd231pd( 0x80(rdx,rdi,1),zmm31,zmm25)
+        vmovupd( zmm25,0x80(rdx,rdi,1))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        vunpcklpd(zmm16, zmm14, zmm0)
+        vunpckhpd(zmm16, zmm14, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x6 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm17, zmm15, zmm0)
+        vunpckhpd(zmm17, zmm15, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x6 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm25, zmm24, zmm0)
+        vunpckhpd(zmm25, zmm24, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x6 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vmovupd( zmm14,(rdx))
+        vmovupd( zmm15,0x40(rdx))
+        vmovupd( zmm24,0x80(rdx))
+        vmovupd( zmm16,(rdx,rdi,1))
+        vmovupd( zmm17,0x40(rdx,rdi,1))
+        vmovupd( zmm25,0x80(rdx,rdi,1))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        vunpcklpd(zmm16, zmm14, zmm0)
+        vunpckhpd(zmm16, zmm14, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //First 8x6 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm17, zmm15, zmm0)
+        vunpckhpd(zmm17, zmm15, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x6 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm25, zmm24, zmm0)
+        vunpckhpd(zmm25, zmm24, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x6 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 6;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x6(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x6(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x6(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x5m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* Each iteration of this loop computes one MRxNR block of C.
+     * The loop moves along the m-dimension of C in steps of MR*rs_c (MR = 24 here).
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+        // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+        // is also used to traverse the B matrix
+        lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+        // if n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+        // is also used to prefetch from the B matrix
+        lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-29 are used for accumulation
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+        vxorpd(zmm10, zmm10, zmm10)
+        vxorpd(zmm11, zmm11, zmm11)
+        vxorpd(zmm26, zmm26, zmm26)
+        vxorpd(zmm12, zmm12, zmm12)
+        vxorpd(zmm13, zmm13, zmm13)
+        vxorpd(zmm27,zmm27, zmm27)
+        vxorpd(zmm14, zmm14, zmm14)
+        vxorpd(zmm15, zmm15, zmm15)
+        vxorpd(zmm24, zmm24, zmm24)
+
+        // K is unrolled by 8 to facilitate prefetch of B.
+        // Assuming B is column-stored, each of the first 5 unrolled iterations prefetches
+        // one cacheline from one column of B_next, where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 5+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(5), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r15) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm14 )
+            vfmadd231pd( zmm4,zmm30,zmm15 )
+            vfmadd231pd( zmm5,zmm30,zmm24 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            vbroadcastsd( mem(r12),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            add( r8,r12 )                                     // second pointer of b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm14 )
+            vfmadd231pd( zmm1,zmm30,zmm15 )
+            vfmadd231pd( zmm2,zmm30,zmm24 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+        vmulpd( zmm30,zmm14,zmm14 )
+        vmulpd( zmm30,zmm15,zmm15 )
+        vmulpd( zmm30,zmm24,zmm24 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+        lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vfmadd231pd( mem(rdx),zmm31,zmm14)
+        vmovupd( zmm14,(rdx))
+        vfmadd231pd( 0x40(rdx),zmm31,zmm15)
+        vmovupd( zmm15,0x40(rdx))
+        vfmadd231pd( 0x80(rdx),zmm31,zmm24)
+        vmovupd( zmm24,0x80(rdx))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        vunpcklpd(zmm16, zmm14, zmm0)
+        vunpckhpd(zmm16, zmm14, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x5 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm17, zmm15, zmm0)
+        vunpckhpd(zmm17, zmm15, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x5 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm25, zmm24, zmm0)
+        vunpckhpd(zmm25, zmm24, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x5 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+        vmovupd( zmm14,(rdx))
+        vmovupd( zmm15,0x40(rdx))
+        vmovupd( zmm24,0x80(rdx))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        vunpcklpd(zmm16, zmm14, zmm0)
+        vunpckhpd(zmm16, zmm14, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C_BZ
+        //First 8x5 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm17, zmm15, zmm0)
+        vunpckhpd(zmm17, zmm15, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x5 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        vunpcklpd(zmm25, zmm24, zmm0)
+        vunpckhpd(zmm25, zmm24, zmm1)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x5 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 5;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x5(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x5(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x5(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x4m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // r14 = a + ps_a8 + 7*8 (next panel of A, for prefetching)
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b + 7*8 (B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-29 are used for accumulation
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+        vxorpd(zmm10, zmm10, zmm10)
+        vxorpd(zmm11, zmm11, zmm11)
+        vxorpd(zmm26, zmm26, zmm26)
+        vxorpd(zmm12, zmm12, zmm12)
+        vxorpd(zmm13, zmm13, zmm13)
+        vxorpd(zmm27,zmm27, zmm27)
+
+        // K is unrolled by 8 to facilitate prefetch of B.
+        // Assuming B to be col-stored, for each iteration of K,
+        // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 4+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(4), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm12 )
+            vfmadd231pd( zmm4,zmm31,zmm13 )
+            vfmadd231pd( zmm5,zmm31,zmm27 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            vbroadcastsd( mem(rbx,r13,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm12 )
+            vfmadd231pd( zmm1,zmm31,zmm13 )
+            vfmadd231pd( zmm2,zmm31,zmm27 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+        vmulpd( zmm30,zmm12,zmm12 )
+        vmulpd( zmm30,zmm13,zmm13 )
+        vmulpd( zmm30,zmm27,zmm27 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vfmadd231pd( mem(rcx,r13,1),zmm31,zmm12)
+        vmovupd( zmm12,(rcx,r13,1))
+        vfmadd231pd( 0x40(rcx,r13,1),zmm31,zmm13)
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vfmadd231pd( 0x80(rcx,r13,1),zmm31,zmm27)
+        vmovupd( zmm27,0x80(rcx,r13,1))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x4 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x4 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x4 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+        vmovupd( zmm12,(rcx,r13,1))
+        vmovupd( zmm13,0x40(rcx,r13,1))
+        vmovupd( zmm27,0x80(rcx,r13,1))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //First 8x4 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x4 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x4 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 4;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x4(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x4(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x4(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x3m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-29 are used for accumulation
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+        vxorpd(zmm10, zmm10, zmm10)
+        vxorpd(zmm11, zmm11, zmm11)
+        vxorpd(zmm26, zmm26, zmm26)
+
+        // K is unrolled by 8 to facilitate prefetch of B
+        // Assuming B to be col-stored, for each iteration of K,
+        //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 3+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(3), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm10 )
+            vfmadd231pd( zmm4,zmm30,zmm11 )
+            vfmadd231pd( zmm5,zmm30,zmm26 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            vbroadcastsd( mem(rbx,r9,2),zmm30 )
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm10 )
+            vfmadd231pd( zmm1,zmm30,zmm11 )
+            vfmadd231pd( zmm2,zmm30,zmm26 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+        vmulpd( zmm30,zmm10,zmm10 )
+        vmulpd( zmm30,zmm11,zmm11 )
+        vmulpd( zmm30,zmm26,zmm26 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vfmadd231pd( mem(rcx,rdi,2),zmm31,zmm10)
+        vmovupd( zmm10,(rcx,rdi,2))
+        vfmadd231pd( 0x40(rcx,rdi,2),zmm31,zmm11)
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vfmadd231pd( 0x80(rcx,rdi,2),zmm31,zmm26)
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x3 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x3 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x3 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+        vmovupd( zmm10,(rcx,rdi,2))
+        vmovupd( zmm11,0x40(rcx,rdi,2))
+        vmovupd( zmm26,0x80(rcx,rdi,2))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C_BZ
+        //First 8x3 tile updated
+
+        UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x3 tile updated
+
+        UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x3 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 3;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x3(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x3(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x3(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x2m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* Each iteration of this loop computes one MRxNR block of C.
+     * The loop advances along the m-dimension of C in steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6-9 and zmm28-29 are used for accumulation
+         *                 zmm30-31 are used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+        vxorpd(zmm8, zmm8, zmm8)
+        vxorpd(zmm9, zmm9, zmm9)
+        vxorpd(zmm29, zmm29, zmm29)
+
+        // K is unrolled by 8 to facilitate prefetch of B.
+        // Assuming B to be col-stored, each of the first NR unrolled iterations
+        // prefetches one cacheline of B_next, where b_next = b + (unroll)*rs_b.
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 2+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(2), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm31,zmm8 )
+            vfmadd231pd( zmm4,zmm31,zmm9 )
+            vfmadd231pd( zmm5,zmm31,zmm29 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            vbroadcastsd( mem(rbx,r9,1),zmm31 )
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm31,zmm8 )
+            vfmadd231pd( zmm1,zmm31,zmm9 )
+            vfmadd231pd( zmm2,zmm31,zmm29 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+        vmulpd( zmm30,zmm8,zmm8 )
+        vmulpd( zmm30,zmm9,zmm9 )
+        vmulpd( zmm30,zmm29,zmm29 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+        vfmadd231pd( mem(rcx,rdi,1),zmm31,zmm8)
+        vmovupd( zmm8,(rcx,rdi,1))
+        vfmadd231pd( 0x40(rcx,rdi,1),zmm31,zmm9)
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vfmadd231pd( 0x80(rcx,rdi,1),zmm31,zmm29)
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        vunpcklpd( zmm8,  zmm6,  zmm0)
+        vunpckhpd( zmm8,  zmm6,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x2 tile updated
+
+        vunpcklpd( zmm9,  zmm7,  zmm0)
+        vunpckhpd( zmm9,  zmm7,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x2 tile updated
+
+        vunpcklpd( zmm29,  zmm28,  zmm0)
+        vunpckhpd( zmm29,  zmm28,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x2 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+        vmovupd( zmm8,(rcx,rdi,1))
+        vmovupd( zmm9,0x40(rcx,rdi,1))
+        vmovupd( zmm29,0x80(rcx,rdi,1))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        vunpcklpd( zmm8,  zmm6,  zmm0)
+        vunpckhpd( zmm8,  zmm6,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //First 8x2 tile updated
+
+        vunpcklpd( zmm9,  zmm7,  zmm0)
+        vunpckhpd( zmm9,  zmm7,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x2 tile updated
+
+        vunpcklpd( zmm29,  zmm28,  zmm0)
+        vunpckhpd( zmm29,  zmm28,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x2 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 2;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x2(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x2(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x2(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+void bli_dgemmsup_rv_zen4_asm_24x1m
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    double *abuf = a;
+    double *bbuf = b;
+    double *cbuf = c;
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t m_iter = (uint64_t)m0 / 24;
+    uint64_t m_left = (uint64_t)m0 % 24;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    uint8_t mask = (0xff >> (0x8 - (n0 & 7))); // calculate mask based on n_left
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* For one iteration of this loop, a block of MRxNR is computed
+     * This loop moves along m-dimension of c matrix with steps of MR*rs_c.
+     */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+
+        a = abuf + m * ps_a ; // Move to next MRXKC in MCXKC (where MC>=MR)
+        b = bbuf;  //Same KCXNR is used across different MRXKC in MCXKC
+        c = cbuf + m * rs_c * 24; // Move to next MRxNR in MCxNR (where MC >= MR)
+
+        // -------------------------------------------------------------------------
+        begin_asm()
+
+        mov(var(mask), rdx)             // load mask
+        kmovw(edx, k(2))                // move mask to k2 register
+        mov(var(a), rax)                // load address of a
+        mov(var(cs_a), r10)             // load cs_a
+        mov(var(b), rbx)                // load address of b
+        mov(var(rs_b), r8)              // load rs_b
+        mov(var(cs_b), r9)              // load cs_b
+        mov(var(c), rcx)                // load address of c
+        mov(var(cs_c), rdi)             // load cs_c
+        lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+        lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+        lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+        lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+        lea(mem(rcx, 7*8), rdx)         // C for prefetching
+        mov(var(ps_a8), r14)            // panel stride of A
+        lea(mem(rax, r14, 1, 7*8), r14) // prefetch next panel of A
+        lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+        /* Register usage: zmm0-5 are used to load A matrix
+         *                 zmm6, zmm7 and zmm28 are used for accumulation
+         *                 zmm30 is used for broadcasting B matrix
+         */
+
+        // zero out all accumulation registers
+        vxorpd(zmm6, zmm6, zmm6)
+        vxorpd(zmm7, zmm7, zmm7)
+        vxorpd(zmm28, zmm28, zmm28)
+
+        // K is unrolled by 8 to facilitate prefetch of B
+        // Assuming B to be col-stored, for each iteration of K,
+        //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+        label(.DLOOPKITER)                                     // main loop
+        mov(var(k_iter), rsi)                                  // i = k_iter
+        sub(imm( 1+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+        jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+        label(.LOOP1)
+
+            // ---------------------------------- iteration 1
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 2
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 3
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 4
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 5
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 6
+
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 7
+
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 8
+
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP1)                                            // iterate again if i != 0.
+
+        label(.PREFETCHLOOP)
+        add(imm(1), rsi)                                       // i += NR
+        jle(.TAILITER)                                         // jump if i <= 0.
+
+        label(.LOOP2)
+
+            // ---------------------------------- iteration 1
+            prefetchw0( mem(rdx))                              // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 2
+            prefetchw0( mem(rdx, 64))                          // prefetch C
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 3
+            prefetchw0( mem(rdx, 128))                        // prefetch C
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            sub(imm(1), rsi)                                   // i -= 1
+        jnz(.LOOP2)                                            // iterate again if i != 0.
+        label(.TAILITER)
+        add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+        jle(.TAIL)                                             // jump if i <= 0
+
+        label(.LOOP3)
+
+            // ---------------------------------- iteration 1
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            prefetch( 0,mem(r11) )                             // prefetch B
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 2
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 3
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 4
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 5
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 6
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 7
+            vmovupd( mem(rax),zmm3 )                           // load A
+            vmovupd( 0x40(rax),zmm4 )
+            vmovupd( 0x80(rax),zmm5 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+
+            // ---------------------------------- iteration 8
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm3,zmm30,zmm6 )
+            vfmadd231pd( zmm4,zmm30,zmm7 )
+            vfmadd231pd( zmm5,zmm30,zmm28 )
+            lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+            dec(rsi)                                           // i -= 1
+        jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+        label(.TAIL)
+        mov(var(k_left), rsi)                                  // i = k_left
+        test(rsi, rsi)                                         // check i via logical AND
+        je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+        label(.DLOOPKLEFT)                                     // k_left loop
+            vmovupd( mem(rax),zmm0 )                           // load A
+            vmovupd( 0x40(rax),zmm1 )
+            vmovupd( 0x80(rax),zmm2 )
+            add( r10,rax )                                     // a += cs_a
+            //prefetch 24 elements(3 cachelines) of the corresponding column in next panel of A
+            prefetch( 1,mem(r14) )
+            prefetch( 1,0x40(r14) )
+            prefetch( 1,0x80(r14) )
+            add( r10,r14 )                                     // a_next += cs_a
+            vbroadcastsd( mem(rbx),zmm30 )
+            add( r8,rbx )                                     // b += rs_b
+            vfmadd231pd( zmm0,zmm30,zmm6 )
+            vfmadd231pd( zmm1,zmm30,zmm7 )
+            vfmadd231pd( zmm2,zmm30,zmm28 )
+            dec(rsi)                                           // i -= 1
+        jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+        label(.DPOSTACCUM)
+        mov(var(alpha), rdx)                                   // load address of alpha
+        vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+        mov(var(beta), rax)                                    // load address of beta
+        vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+        // scale by alpha
+        vmulpd( zmm30,zmm6,zmm6 )
+        vmulpd( zmm30,zmm7,zmm7 )
+        vmulpd( zmm30,zmm28,zmm28 )
+
+
+        mov(var(rs_c), rsi)                                    // load rs_c
+        lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+        vxorpd(ymm2, ymm2, ymm2)
+        vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+        je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+        cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+        jz(.DROWSTORED)                                        // jump to row storage case
+
+        label(.DCOLSTORED)
+        vfmadd231pd( mem(rcx),zmm31,zmm6)
+        vmovupd( zmm6,(rcx))
+        vfmadd231pd( 0x40(rcx),zmm31,zmm7)
+        vmovupd( zmm7,0x40(rcx))
+        vfmadd231pd( 0x80(rcx),zmm31,zmm28)
+        vmovupd( zmm28,0x80(rcx))
+
+        jmp(.DDONE)                                           // jump to end.
+
+        label(.DROWSTORED)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        vunpcklpd( zmm8,  zmm6,  zmm0)
+        vunpckhpd( zmm8,  zmm6,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        vbroadcastsd(mem(rax), zmm31)
+        UPDATE_MASKED_C
+        //First 8x1 tile updated
+
+        vunpcklpd( zmm9,  zmm7,  zmm0)
+        vunpckhpd( zmm9,  zmm7,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Second 8x1 tile updated
+
+        vunpcklpd( zmm29,  zmm28,  zmm0)
+        vunpckhpd( zmm29,  zmm28,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C
+        //Third 8x1 tile updated
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DBETAZERO)
+        cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+        jz(.DROWSTORBZ)                                      // jump to row storage case
+        label(.DCOLSTORBZ)
+        vmovupd( zmm6,(rcx))
+        vmovupd( zmm7,0x40(rcx))
+        vmovupd( zmm28,0x80(rcx))
+
+        jmp(.DDONE)                                          // jump to end.
+
+
+        label(.DROWSTORBZ)
+        // r12 = 3*rs_c
+        lea(mem(rsi,  rsi,  2), r12)
+        // r13 = 5*rs_c
+        lea(mem(r12, rsi,  2), r13)
+        // rdx = 7*rs_c
+        lea(mem(r12, rsi,  4), rdx)
+        lea(mem(   , rsi, 8), r14)
+        vunpcklpd( zmm8,  zmm6,  zmm0)
+        vunpckhpd( zmm8,  zmm6,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //First 8x1 tile updated
+
+        vunpcklpd( zmm9,  zmm7,  zmm0)
+        vunpckhpd( zmm9,  zmm7,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Second 8x1 tile updated
+
+        vunpcklpd( zmm29,  zmm28,  zmm0)
+        vunpckhpd( zmm29,  zmm28,  zmm1)
+        SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+        SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+        SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+        SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+        UPDATE_MASKED_C_BZ
+        //Third 8x1 tile updated
+        label(.DDONE)
+
+
+        vzeroupper()
+
+        end_asm(
+          : // output operands (none)
+          : // input operands
+            [k_iter] "m" (k_iter),
+            [k_left] "m" (k_left),
+            [a]      "m" (a),
+            [rs_a]   "m" (rs_a),
+            [cs_a]   "m" (cs_a),
+            [ps_a8]  "m" (ps_a8),
+            [b]      "m" (b),
+            [rs_b]   "m" (rs_b),
+            [cs_b]   "m" (cs_b),
+            [alpha]  "m" (alpha),
+            [beta]   "m" (beta),
+            [c]      "m" (c),
+            [rs_c]   "m" (rs_c),
+            [cs_c]   "m" (cs_c),
+            [n0]     "m" (n0),
+            [m0]     "m" (m0),
+            [mask]   "m" (mask)
+          : // register clobber list
+            "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+            "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+            "xmm2", "xmm31",
+            "ymm2",
+            "zmm0", "zmm1", "zmm2", "zmm3",
+            "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+            "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+            "zmm16", "zmm17", "zmm18", "zmm19",
+            "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+            "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+            "k2", "memory"
+        )
+    } //mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        const dim_t nr_cur = 1;
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+        double *restrict cij = cbuf + i_edge * rs_c;
+        double *restrict ai  = abuf + m_iter * ps_a;
+        double *restrict bj  = bbuf;
+        // covers the range 16 < m_left <= 24 by using masked load/store instructions
+        if( 16 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_24x1(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 8 < m_left <= 16 by using masked load/store instructions
+        else if( 8 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_16x1(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+        // covers the range 0 < m_left <= 8 by using masked load/store instructions
+        else if( 0 < m_left )
+        {
+            bli_dgemmsup_rv_zen4_asm_8x1(
+              conja, conjb, m_left, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx);
+        }
+    }
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c b/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c
new file mode 100644
index 0000000000..a0db7fd504
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_cv_zen4_z12x4m.c
@@ -0,0 +1,5688 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+// NOTE(review): presumably the look-ahead distance used when prefetching C;
+// neither its unit nor its use is visible in this part of the file -- confirm
+// at the use site.
+#define PREFETCH_DIST_C 4
+// Register-block dimensions. They match the "12x4" in this file's name and in
+// MICRO_TILE_12x4 (presumably MR = rows, NR = columns -- TODO confirm).
+#define MR 12
+#define NR 4
+
+// Zeroes the registers used for accumulation: ZMM5..ZMM28 plus ZMM30 and
+// ZMM31, each by XOR-ing the register with itself. ZMM0..ZMM4 and ZMM29 are
+// deliberately not part of the list and keep their contents.
+#define RESET_REG_ZMM_(N)  VXORPD(ZMM(N), ZMM(N), ZMM(N))
+#define RESET_REGISTERS \
+    RESET_REG_ZMM_(5)   RESET_REG_ZMM_(6)   RESET_REG_ZMM_(7)   RESET_REG_ZMM_(8)   \
+    RESET_REG_ZMM_(9)   RESET_REG_ZMM_(10)  RESET_REG_ZMM_(11)  RESET_REG_ZMM_(12)  \
+    RESET_REG_ZMM_(13)  RESET_REG_ZMM_(14)  RESET_REG_ZMM_(15)  RESET_REG_ZMM_(16)  \
+    RESET_REG_ZMM_(17)  RESET_REG_ZMM_(18)  RESET_REG_ZMM_(19)  RESET_REG_ZMM_(20)  \
+    RESET_REG_ZMM_(21)  RESET_REG_ZMM_(22)  RESET_REG_ZMM_(23)  RESET_REG_ZMM_(24)  \
+    RESET_REG_ZMM_(25)  RESET_REG_ZMM_(26)  RESET_REG_ZMM_(27)  RESET_REG_ZMM_(28)  \
+    RESET_REG_ZMM_(30)  RESET_REG_ZMM_(31)
+
+// Three-register form of PERMUTE (12x? tiles). VPERMILPD with immediate 0x55
+// exchanges the two doubles inside every 128-bit pair; each register is
+// permuted in place.
+#define PERMUTE_12Z(V1, V2, V3)  \
+    VPERMILPD(IMM(0x55), ZMM(V1), ZMM(V1))  \
+    VPERMILPD(IMM(0x55), ZMM(V2), ZMM(V2))  \
+    VPERMILPD(IMM(0x55), ZMM(V3), ZMM(V3))
+
+// Two-register form of PERMUTE (8x? tiles): swaps the two doubles of every
+// 128-bit pair of V1 and V2, in place.
+#define PERMUTE_8Z(V1, V2)  \
+    VPERMILPD(IMM(0x55), ZMM(V1), ZMM(V1))  \
+    VPERMILPD(IMM(0x55), ZMM(V2), ZMM(V2))
+
+// One-register form of PERMUTE (4x? tiles): swaps the two doubles of every
+// 128-bit pair of V1, in place.
+#define PERMUTE_4Z(V1)  \
+    VPERMILPD(IMM(0x55), ZMM(V1), ZMM(V1))
+
+// Argument-count selector for PERMUTE: the caller's registers fill the
+// leading slots, which shifts the matching PERMUTE_?Z name into NAME.
+#define GET_PERMUTE(_1, _2, _3, NAME, ...)  NAME
+
+// Overloaded PERMUTE: 3, 2 or 1 register arguments dispatch to PERMUTE_12Z,
+// PERMUTE_8Z or PERMUTE_4Z respectively. The trailing `_0` is a discarded
+// filler that keeps the `...` of GET_PERMUTE non-empty: ISO C before C23
+// requires at least one variadic argument, and the 1-register call supplied
+// none.
+#define PERMUTE(...)  \
+    GET_PERMUTE(__VA_ARGS__,  \
+    PERMUTE_12Z, PERMUTE_8Z, PERMUTE_4Z, _0)(__VA_ARGS__)
+
+// Three-register FMA step (12x? tiles). Operands are in AT&T order
+// (BLIS_ASM_SYNTAX_ATT), so the last operand is the destination:
+//   ACC0 += ZMM0 * ZMM(BV),  ACC1 += ZMM1 * ZMM(BV),  ACC2 += ZMM2 * ZMM(BV)
+// ZMM0..ZMM2 are the registers the micro-tile macros load A into.
+#define FMA_12Z(BV, ACC0, ACC1, ACC2)  \
+    VFMADD231PD(ZMM(0), ZMM(BV), ZMM(ACC0))  \
+    VFMADD231PD(ZMM(1), ZMM(BV), ZMM(ACC1))  \
+    VFMADD231PD(ZMM(2), ZMM(BV), ZMM(ACC2))
+
+// Two-register FMA step (8x? tiles), destination last:
+//   ACC0 += ZMM0 * ZMM(BV),  ACC1 += ZMM1 * ZMM(BV)
+#define FMA_8Z(BV, ACC0, ACC1)  \
+    VFMADD231PD(ZMM(0), ZMM(BV), ZMM(ACC0))  \
+    VFMADD231PD(ZMM(1), ZMM(BV), ZMM(ACC1))
+
+// One-register FMA step (4x? tiles), destination last:
+//   ACC0 += ZMM0 * ZMM(BV)
+#define FMA_4Z(BV, ACC0)  \
+    VFMADD231PD(ZMM(0), ZMM(BV), ZMM(ACC0))
+
+// Argument-count selector for FMA: the caller's arguments fill the leading
+// slots, which shifts the matching FMA_?Z name into NAME.
+#define GET_FMA(_1, _2, _3, _4, NAME, ...)  NAME
+
+// Overloaded FMA: (B, R1, R2, R3), (B, R1, R2) or (B, R1) dispatch to
+// FMA_12Z, FMA_8Z or FMA_4Z respectively. The trailing `_0` is a discarded
+// filler that keeps the `...` of GET_FMA non-empty: ISO C before C23 requires
+// at least one variadic argument, and the 2-argument call (e.g. FMA(3, 5))
+// supplied none.
+#define FMA(...)  \
+    GET_FMA(__VA_ARGS__,  \
+    FMA_12Z, FMA_8Z, FMA_4Z, _0)(__VA_ARGS__)
+
+// Accumulation for three register pairs (12x? tiles). For each pair (S, D),
+// VFMADDSUB231PD computes D = S * ZMM29 -/+ D: the even-indexed doubles get
+// S*ZMM29 - D, the odd-indexed ones S*ZMM29 + D.
+// NOTE(review): ZMM29 is presumably preloaded with ones -- confirm at the
+// call site.
+#define ACC_COL_12Z(S1, D1, S2, D2, S3, D3)  \
+    VFMADDSUB231PD(ZMM(S1), ZMM(29), ZMM(D1))  \
+    VFMADDSUB231PD(ZMM(S2), ZMM(29), ZMM(D2))  \
+    VFMADDSUB231PD(ZMM(S3), ZMM(29), ZMM(D3))
+
+// Accumulation for two register pairs (8x? tiles): for each pair (S, D),
+// D = S * ZMM29 -/+ D (subtract in even-indexed doubles, add in odd ones).
+#define ACC_COL_8Z(S1, D1, S2, D2)  \
+    VFMADDSUB231PD(ZMM(S1), ZMM(29), ZMM(D1))  \
+    VFMADDSUB231PD(ZMM(S2), ZMM(29), ZMM(D2))
+
+// Accumulation for one register pair (4x? tiles):
+// D1 = S1 * ZMM29 -/+ D1 (subtract in even-indexed doubles, add in odd ones).
+#define ACC_COL_4Z(S1, D1)  \
+    VFMADDSUB231PD(ZMM(S1), ZMM(29), ZMM(D1))
+
+// Argument-count selector for ACC_COL: the caller's arguments fill the
+// leading slots, which shifts the matching ACC_COL_?Z name into NAME.
+#define GET_ACC_COL(_1, _2, _3, _4, _5, _6, NAME, ...)  NAME
+
+// Overloaded ACC_COL: 6, 4 or 2 arguments dispatch to ACC_COL_12Z,
+// ACC_COL_8Z or ACC_COL_4Z respectively. `_0` and `_1` pad the odd argument
+// counts, which are never used. The trailing `_2` is a discarded filler that
+// keeps the `...` of GET_ACC_COL non-empty: ISO C before C23 requires at
+// least one variadic argument, and the 2-argument call supplied none.
+#define ACC_COL(...)  \
+    GET_ACC_COL(__VA_ARGS__,  \
+    ACC_COL_12Z, _0, ACC_COL_8Z, _1, ACC_COL_4Z, _2)(__VA_ARGS__)
+
+// In-place complex scaling of three registers (12x? tiles).
+// The scalar is expected in ZMM0 and ZMM1 (presumably its broadcast real and
+// imaginary parts -- confirm at the call site). Per register X:
+//   tmp = X * ZMM0;  X = X * ZMM1;  swap doubles of X;  X = tmp*ZMM29 -/+ X
+// Scratch registers overwritten: ZMM2, ZMM30, ZMM31.
+#define ALPHA_GENERIC_12Z(X1, X2, X3) \
+    VMULPD(ZMM(0), ZMM(X1), ZMM(2))  \
+    VMULPD(ZMM(1), ZMM(X1), ZMM(X1))  \
+    VMULPD(ZMM(0), ZMM(X2), ZMM(30))  \
+    VMULPD(ZMM(1), ZMM(X2), ZMM(X2))  \
+    VMULPD(ZMM(0), ZMM(X3), ZMM(31))  \
+    VMULPD(ZMM(1), ZMM(X3), ZMM(X3))  \
+    PERMUTE_12Z(X1, X2, X3) \
+    ACC_COL_12Z(2, X1, 30, X2, 31, X3)
+
+// In-place complex scaling of two registers (8x? tiles); the scalar is
+// expected in ZMM0/ZMM1 (presumably broadcast real/imaginary parts -- confirm
+// at the call site). Scratch registers overwritten: ZMM2, ZMM30.
+#define ALPHA_GENERIC_8Z(X1, X2) \
+    VMULPD(ZMM(0), ZMM(X1), ZMM(2))  \
+    VMULPD(ZMM(1), ZMM(X1), ZMM(X1))  \
+    VMULPD(ZMM(0), ZMM(X2), ZMM(30))  \
+    VMULPD(ZMM(1), ZMM(X2), ZMM(X2))  \
+    PERMUTE_8Z(X1, X2) \
+    ACC_COL_8Z(2, X1, 30, X2)
+
+// In-place complex scaling of one register (4x? tiles); the scalar is
+// expected in ZMM0/ZMM1 (presumably broadcast real/imaginary parts -- confirm
+// at the call site). Scratch register overwritten: ZMM2.
+#define ALPHA_GENERIC_4Z(X1) \
+    VMULPD(ZMM(0), ZMM(X1), ZMM(2))  \
+    VMULPD(ZMM(1), ZMM(X1), ZMM(X1))  \
+    PERMUTE_4Z(X1) \
+    ACC_COL_4Z(2, X1)
+
+// Argument-count selector for ALPHA_GENERIC: the caller's registers fill the
+// leading slots, which shifts the matching ALPHA_GENERIC_?Z name into NAME.
+#define GET_ALPHA_GENERIC(_1, _2, _3, NAME, ...)  NAME
+
+// Overloaded ALPHA_GENERIC: 3, 2 or 1 register arguments dispatch to
+// ALPHA_GENERIC_12Z, ALPHA_GENERIC_8Z or ALPHA_GENERIC_4Z respectively. The
+// trailing `_0` is a discarded filler that keeps the `...` of
+// GET_ALPHA_GENERIC non-empty: ISO C before C23 requires at least one
+// variadic argument, and the 1-register call supplied none.
+#define ALPHA_GENERIC(...)  \
+    GET_ALPHA_GENERIC(__VA_ARGS__,  \
+    ALPHA_GENERIC_12Z, ALPHA_GENERIC_8Z, ALPHA_GENERIC_4Z, _0)(__VA_ARGS__)
+
+// Complex-scalar update of three ZMM vectors of C (12x? tiles):
+//   load CP, CP+64, CP+128 into T1..T3; scale them with ALPHA_GENERIC_12Z
+//   (scalar expected in ZMM0/ZMM1); add them onto the accumulators A1..A3;
+//   store A1..A3 back to the same three addresses.
+// Inherits the scratch usage of ALPHA_GENERIC_12Z (ZMM2, ZMM30, ZMM31).
+#define BETA_GENERIC_12Z(CP, T1, A1, T2, A2, T3, A3)\
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    VMOVUPD(MEM(CP, 64), ZMM(T2)) \
+    VMOVUPD(MEM(CP, 128), ZMM(T3))  \
+    /* scale C, then fold it into the accumulators */ \
+    ALPHA_GENERIC_12Z(T1, T2, T3) \
+    VADDPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    VADDPD(ZMM(T2), ZMM(A2), ZMM(A2))  \
+    VADDPD(ZMM(T3), ZMM(A3), ZMM(A3))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP)) \
+    VMOVUPD(ZMM(A2), MEM(CP, 64)) \
+    VMOVUPD(ZMM(A3), MEM(CP, 128))
+
+// Complex-scalar update of two ZMM vectors of C (8x? tiles):
+//   load CP, CP+64 into T1, T2; scale them with ALPHA_GENERIC_8Z (scalar
+//   expected in ZMM0/ZMM1); add them onto A1, A2; store A1, A2 back.
+// Inherits the scratch usage of ALPHA_GENERIC_8Z (ZMM2, ZMM30).
+#define BETA_GENERIC_8Z(CP, T1, A1, T2, A2)\
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    VMOVUPD(MEM(CP, 64), ZMM(T2)) \
+    /* scale C, then fold it into the accumulators */ \
+    ALPHA_GENERIC_8Z(T1, T2) \
+    VADDPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    VADDPD(ZMM(T2), ZMM(A2), ZMM(A2))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP)) \
+    VMOVUPD(ZMM(A2), MEM(CP, 64))
+
+// Complex-scalar update of one ZMM vector of C (4x? tiles):
+//   load CP into T1; scale it with ALPHA_GENERIC_4Z (scalar expected in
+//   ZMM0/ZMM1); add it onto A1; store A1 back to CP.
+// Inherits the scratch usage of ALPHA_GENERIC_4Z (ZMM2).
+#define BETA_GENERIC_4Z(CP, T1, A1)\
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    /* scale C, then fold it into the accumulator */ \
+    ALPHA_GENERIC_4Z(T1) \
+    VADDPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP))
+
+// Argument-count selector for BETA_GENERIC: the caller's arguments fill the
+// leading slots, which shifts the matching BETA_GENERIC_?Z name into NAME.
+#define GET_BETA_GENERIC(_1, _2, _3, _4, _5, _6, _7, NAME, ...)  NAME
+
+// Overloaded BETA_GENERIC: 7, 5 or 3 arguments dispatch to BETA_GENERIC_12Z,
+// BETA_GENERIC_8Z or BETA_GENERIC_4Z respectively. `_0` and `_1` pad the
+// even argument counts, which are never used. The trailing `_2` is a
+// discarded filler that keeps the `...` of GET_BETA_GENERIC non-empty: ISO C
+// before C23 requires at least one variadic argument, and the 3-argument
+// call supplied none.
+#define BETA_GENERIC(...)  \
+    GET_BETA_GENERIC(__VA_ARGS__,  \
+    BETA_GENERIC_12Z, _0, BETA_GENERIC_8Z, _1, BETA_GENERIC_4Z, _2)(__VA_ARGS__)
+
+// One k-step of the 12x4 micro-tile.
+//  - A: three ZMM vectors are loaded from RAX (+0, +64, +128) into ZMM0-ZMM2.
+//  - B: for each of four positions (RBX, RBX+R15, RBX+2*R15 held in R9, and
+//    R9+R15) the doubles at offsets 0 and 8 are broadcast (presumably the
+//    real and imaginary parts of one B element -- TODO confirm).
+//  - Accumulators: ZMM5-ZMM28; products with the offset-0 broadcast go to the
+//    odd-numbered registers, products with the offset-8 broadcast to the
+//    even-numbered ones.
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites R9, ZMM0-ZMM4, ZMM30 and ZMM31. The instruction order is part of
+// the hand scheduling; do not reorder.
+#define MICRO_TILE_12x4                             \
+    /* Macro for 12x4 micro-tile evaluation   */    \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(2) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    VMOVUPD(MEM(RAX, 128), ZMM(2))                  \
+    LEA(MEM(RBX, R15, 2), R9)                       \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7, 9)                                 \
+    FMA(4, 6, 8, 10)                                \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(R9), ZMM(3))                   \
+    VBROADCASTSD(MEM(R9, 8), ZMM(4))                \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(30, 11, 13, 15)                             \
+    FMA(31, 12, 14, 16)                             \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(R9, R15, 1), ZMM(30))          \
+    VBROADCASTSD(MEM(R9, R15, 1, 8), ZMM(31))       \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(3, 17, 19, 21)                              \
+    FMA(4, 18, 20, 22)                              \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(30, 23, 25, 27)                             \
+    FMA(31, 24, 26, 28)                             \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 8x4 micro-tile.
+//  - A: two ZMM vectors are loaded from RAX (+0, +64) into ZMM0-ZMM1.
+//  - B: for each of four positions (RBX, RBX+R15, RBX+2*R15 held in R9, and
+//    R9+R15) the doubles at offsets 0 and 8 are broadcast (presumably the
+//    real and imaginary parts of one B element -- TODO confirm).
+//  - Accumulators: ZMM5-8, ZMM11-14, ZMM17-20, ZMM23-26 (odd registers take
+//    the offset-0 broadcast, even registers the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites R9, ZMM0, ZMM1, ZMM3, ZMM4, ZMM30 and ZMM31. The instruction
+// order is part of the hand scheduling; do not reorder.
+#define MICRO_TILE_8x4                              \
+    /* Macro for 8x4 micro-tile evaluation   */     \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(1) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    LEA(MEM(RBX, R15, 2), R9)                       \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7)                                    \
+    FMA(4, 6, 8)                                    \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(R9), ZMM(3))                   \
+    VBROADCASTSD(MEM(R9, 8), ZMM(4))                \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(30, 11, 13)                                 \
+    FMA(31, 12, 14)                                 \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(R9, R15, 1), ZMM(30))          \
+    VBROADCASTSD(MEM(R9, R15, 1, 8), ZMM(31))       \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(3, 17, 19)                                  \
+    FMA(4, 18, 20)                                  \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(30, 23, 25)                                 \
+    FMA(31, 24, 26)                                 \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 4x4 micro-tile.
+//  - A: one ZMM vector is loaded from RAX into ZMM0.
+//  - B: for each of four positions (RBX, RBX+R15, RBX+2*R15 held in R9, and
+//    R9+R15) the doubles at offsets 0 and 8 are broadcast (presumably the
+//    real and imaginary parts of one B element -- TODO confirm).
+//  - Accumulators: ZMM5/6, ZMM11/12, ZMM17/18, ZMM23/24 (odd register takes
+//    the offset-0 broadcast, even register the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites R9, ZMM0, ZMM3, ZMM4, ZMM30 and ZMM31. The instruction order is
+// part of the hand scheduling; do not reorder.
+#define MICRO_TILE_4x4                              \
+    /* Macro for 4x4 micro-tile evaluation   */     \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) */                    \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    LEA(MEM(RBX, R15, 2), R9)                       \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(3, 5)                                       \
+    FMA(4, 6)                                       \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(R9), ZMM(3))                   \
+    VBROADCASTSD(MEM(R9, 8), ZMM(4))                \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(30, 11)                                     \
+    FMA(31, 12)                                     \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(R9, R15, 1), ZMM(30))          \
+    VBROADCASTSD(MEM(R9, R15, 1, 8), ZMM(31))       \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(3, 17)                                      \
+    FMA(4, 18)                                      \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(30, 23)                                     \
+    FMA(31, 24)                                     \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 12x3 micro-tile.
+//  - A: three ZMM vectors are loaded from RAX (+0, +64, +128) into ZMM0-ZMM2.
+//  - B: for each of three positions (RBX, RBX+R15, RBX+2*R15) the doubles at
+//    offsets 0 and 8 are broadcast (presumably the real and imaginary parts
+//    of one B element -- TODO confirm).
+//  - Accumulators: ZMM5-ZMM22 (odd registers take the offset-0 broadcast,
+//    even registers the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0-ZMM4, ZMM30 and ZMM31. The instruction order is part of
+// the hand scheduling; do not reorder.
+#define MICRO_TILE_12x3                             \
+    /* Macro for 12x3 micro-tile evaluation   */    \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(2) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    VMOVUPD(MEM(RAX, 128), ZMM(2))                  \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7, 9)                                 \
+    FMA(4, 6, 8, 10)                                \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX, R15, 2), ZMM(3))          \
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), ZMM(4))       \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(30, 11, 13, 15)                             \
+    FMA(31, 12, 14, 16)                             \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(3, 17, 19, 21)                              \
+    FMA(4, 18, 20, 22)                              \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 8x3 micro-tile.
+//  - A: two ZMM vectors are loaded from RAX (+0, +64) into ZMM0-ZMM1.
+//  - B: for each of three positions (RBX, RBX+R15, RBX+2*R15) the doubles at
+//    offsets 0 and 8 are broadcast (presumably the real and imaginary parts
+//    of one B element -- TODO confirm).
+//  - Accumulators: ZMM5-8, ZMM11-14, ZMM17-20 (odd registers take the
+//    offset-0 broadcast, even registers the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0, ZMM1, ZMM3, ZMM4, ZMM30 and ZMM31. The instruction order
+// is part of the hand scheduling; do not reorder.
+#define MICRO_TILE_8x3                              \
+    /* Macro for 8x3 micro-tile evaluation   */     \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(1) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7)                                    \
+    FMA(4, 6, 8)                                    \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX, R15, 2), ZMM(3))          \
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), ZMM(4))       \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(30, 11, 13)                                 \
+    FMA(31, 12, 14)                                 \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(3, 17, 19)                                  \
+    FMA(4, 18, 20)                                  \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 4x3 micro-tile.
+//  - A: one ZMM vector is loaded from RAX into ZMM0.
+//  - B: for each of three positions (RBX, RBX+R15, RBX+2*R15) the doubles at
+//    offsets 0 and 8 are broadcast (presumably the real and imaginary parts
+//    of one B element -- TODO confirm).
+//  - Accumulators: ZMM5/6, ZMM11/12, ZMM17/18 (odd register takes the
+//    offset-0 broadcast, even register the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0, ZMM3, ZMM4, ZMM30 and ZMM31. The instruction order is
+// part of the hand scheduling; do not reorder.
+#define MICRO_TILE_4x3                              \
+    /* Macro for 4x3 micro-tile evaluation   */     \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) */                    \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(3, 5)                                       \
+    FMA(4, 6)                                       \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX, R15, 2), ZMM(3))          \
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), ZMM(4))       \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(30, 11)                                     \
+    FMA(31, 12)                                     \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(3, 17)                                      \
+    FMA(4, 18)                                      \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 12x2 micro-tile.
+//  - A: three ZMM vectors are loaded from RAX (+0, +64, +128) into ZMM0-ZMM2.
+//  - B: for each of two positions (RBX, RBX+R15) the doubles at offsets 0
+//    and 8 are broadcast (presumably the real and imaginary parts of one B
+//    element -- TODO confirm).
+//  - Accumulators: ZMM5-ZMM16 (odd registers take the offset-0 broadcast,
+//    even registers the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0-ZMM4, ZMM30 and ZMM31. The instruction order is part of
+// the hand scheduling; do not reorder.
+#define MICRO_TILE_12x2                             \
+    /* Macro for 12x2 micro-tile evaluation   */    \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(2) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    VMOVUPD(MEM(RAX, 128), ZMM(2))                  \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7, 9)                                 \
+    FMA(4, 6, 8, 10)                                \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(30, 11, 13, 15)                             \
+    FMA(31, 12, 14, 16)                             \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 8x2 micro-tile.
+//  - A: two ZMM vectors are loaded from RAX (+0, +64) into ZMM0-ZMM1.
+//  - B: for each of two positions (RBX, RBX+R15) the doubles at offsets 0
+//    and 8 are broadcast (presumably the real and imaginary parts of one B
+//    element -- TODO confirm).
+//  - Accumulators: ZMM5-8 and ZMM11-14 (odd registers take the offset-0
+//    broadcast, even registers the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0, ZMM1, ZMM3, ZMM4, ZMM30 and ZMM31. The instruction order
+// is part of the hand scheduling; do not reorder.
+#define MICRO_TILE_8x2                              \
+    /* Macro for 8x2 micro-tile evaluation   */     \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(1) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7)                                    \
+    FMA(4, 6, 8)                                    \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(30, 11, 13)                                 \
+    FMA(31, 12, 14)                                 \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 4x2 micro-tile.
+//  - A: one ZMM vector is loaded from RAX into ZMM0.
+//  - B: for each of two positions (RBX, RBX+R15) the doubles at offsets 0
+//    and 8 are broadcast (presumably the real and imaginary parts of one B
+//    element -- TODO confirm).
+//  - Accumulators: ZMM5/6 and ZMM11/12 (odd register takes the offset-0
+//    broadcast, even register the offset-8 broadcast).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0, ZMM3, ZMM4, ZMM30 and ZMM31. The instruction order is
+// part of the hand scheduling; do not reorder.
+#define MICRO_TILE_4x2                              \
+    /* Macro for 4x2 micro-tile evaluation   */     \
+    /* Prebroadcasting B on ZMM(3) and ZMM(4) */    \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) */                    \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    /* Prebroadcasting B on ZMM(30) and ZMM(31) */  \
+    VBROADCASTSD(MEM(RBX, R15, 1), ZMM(30))         \
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), ZMM(31))      \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(3, 5)                                       \
+    FMA(4, 6)                                       \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(30, 11)                                     \
+    FMA(31, 12)                                     \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 12x1 micro-tile.
+//  - A: three ZMM vectors are loaded from RAX (+0, +64, +128) into ZMM0-ZMM2.
+//  - B: the doubles at RBX+0 and RBX+8 are broadcast into ZMM3 and ZMM4
+//    (presumably the real and imaginary parts of one B element -- TODO
+//    confirm).
+//  - Accumulators: ZMM5, ZMM7, ZMM9 (ZMM3 products) and ZMM6, ZMM8, ZMM10
+//    (ZMM4 products).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0-ZMM4.
+#define MICRO_TILE_12x1                             \
+    /* Macro for 12x1 micro-tile evaluation   */    \
+    /* Broadcasting B on ZMM(3) and ZMM(4) */       \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(2) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    VMOVUPD(MEM(RAX, 128), ZMM(2))                  \
+    /* 6 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7, 9)                                 \
+    FMA(4, 6, 8, 10)                                \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 8x1 micro-tile.
+//  - A: two ZMM vectors are loaded from RAX (+0, +64) into ZMM0-ZMM1.
+//  - B: the doubles at RBX+0 and RBX+8 are broadcast into ZMM3 and ZMM4
+//    (presumably the real and imaginary parts of one B element -- TODO
+//    confirm).
+//  - Accumulators: ZMM5, ZMM7 (ZMM3 products) and ZMM6, ZMM8 (ZMM4 products).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0, ZMM1, ZMM3 and ZMM4.
+#define MICRO_TILE_8x1                              \
+    /* Macro for 8x1 micro-tile evaluation   */     \
+    /* Broadcasting B on ZMM(3) and ZMM(4) */       \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) - ZMM(1) */           \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    VMOVUPD(MEM(RAX, 64), ZMM(1))                   \
+    /* 4 FMAs over 2 broadcasts */                  \
+    FMA(3, 5, 7)                                    \
+    FMA(4, 6, 8)                                    \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// One k-step of the 4x1 micro-tile.
+//  - A: one ZMM vector is loaded from RAX into ZMM0.
+//  - B: the doubles at RBX+0 and RBX+8 are broadcast into ZMM3 and ZMM4
+//    (presumably the real and imaginary parts of one B element -- TODO
+//    confirm).
+//  - Accumulators: ZMM5 (ZMM3 product) and ZMM6 (ZMM4 product).
+//  - Finally RBX is advanced by R14 and RAX by R13.
+// Overwrites ZMM0, ZMM3 and ZMM4.
+#define MICRO_TILE_4x1                              \
+    /* Macro for 4x1 micro-tile evaluation   */     \
+    /* Broadcasting B on ZMM(3) and ZMM(4) */       \
+    VBROADCASTSD(MEM(RBX), ZMM(3))                  \
+    VBROADCASTSD(MEM(RBX, 8), ZMM(4))               \
+    /* Loading A using ZMM(0) */                    \
+    VMOVUPD(MEM(RAX), ZMM(0))                       \
+    /* 2 FMAs over 2 broadcasts */                  \
+    FMA(3, 5)                                       \
+    FMA(4, 6)                                       \
+    /* Adjusting addresses for next micro tiles */  \
+    ADD(R14, RBX)                                   \
+    ADD(R13, RAX)                                   \
+
+// alpha == -1 scaling for three registers (12x? tiles): each X becomes
+// ZMM2 - X, in place. This is a negation only when ZMM2 holds zeros
+// (presumably cleared by the caller -- TODO confirm).
+#define ALPHA_MINUS_ONE_12Z(X1, X2, X3) \
+    VSUBPD(ZMM(X1), ZMM(2), ZMM(X1)) \
+    VSUBPD(ZMM(X2), ZMM(2), ZMM(X2)) \
+    VSUBPD(ZMM(X3), ZMM(2), ZMM(X3))
+
+// alpha == -1 scaling for two registers (8x? tiles): each X becomes
+// ZMM2 - X, in place (a negation only when ZMM2 holds zeros -- TODO confirm
+// at the call site).
+#define ALPHA_MINUS_ONE_8Z(X1, X2) \
+    VSUBPD(ZMM(X1), ZMM(2), ZMM(X1)) \
+    VSUBPD(ZMM(X2), ZMM(2), ZMM(X2))
+
+// alpha == -1 scaling for one register (4x? tiles): X1 becomes ZMM2 - X1,
+// in place (a negation only when ZMM2 holds zeros -- TODO confirm at the
+// call site).
+#define ALPHA_MINUS_ONE_4Z(X1) \
+    VSUBPD(ZMM(X1), ZMM(2), ZMM(X1))
+
+// Argument-count selector for ALPHA_MINUS_ONE: the caller's registers fill
+// the leading slots, which shifts the matching ALPHA_MINUS_ONE_?Z name into
+// NAME.
+#define GET_ALPHA_MINUS_ONE(_1, _2, _3, NAME, ...)  NAME
+
+// Overloaded ALPHA_MINUS_ONE: 3, 2 or 1 register arguments dispatch to
+// ALPHA_MINUS_ONE_12Z, ALPHA_MINUS_ONE_8Z or ALPHA_MINUS_ONE_4Z
+// respectively. The trailing `_0` is a discarded filler that keeps the `...`
+// of GET_ALPHA_MINUS_ONE non-empty: ISO C before C23 requires at least one
+// variadic argument, and the 1-register call supplied none.
+#define ALPHA_MINUS_ONE(...)  \
+    GET_ALPHA_MINUS_ONE(__VA_ARGS__,  \
+    ALPHA_MINUS_ONE_12Z, ALPHA_MINUS_ONE_8Z, ALPHA_MINUS_ONE_4Z, _0)(__VA_ARGS__)
+
+// beta == -1 update of three ZMM vectors of C (12x? tiles):
+//   load CP, CP+64, CP+128 into T1..T3; A := A - T for each pair;
+//   store A1..A3 back to the same three addresses.
+#define BETA_MINUS_ONE_12Z(CP, T1, A1, T2, A2, T3, A3)  \
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    VMOVUPD(MEM(CP, 64), ZMM(T2)) \
+    VMOVUPD(MEM(CP, 128), ZMM(T3))  \
+    /* accumulator minus C */ \
+    VSUBPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    VSUBPD(ZMM(T2), ZMM(A2), ZMM(A2))  \
+    VSUBPD(ZMM(T3), ZMM(A3), ZMM(A3))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP)) \
+    VMOVUPD(ZMM(A2), MEM(CP, 64)) \
+    VMOVUPD(ZMM(A3), MEM(CP, 128))
+
+// beta == -1 update of two ZMM vectors of C (8x? tiles):
+//   load CP, CP+64 into T1, T2; A := A - T for each pair; store A1, A2 back.
+#define BETA_MINUS_ONE_8Z(CP, T1, A1, T2, A2)  \
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    VMOVUPD(MEM(CP, 64), ZMM(T2)) \
+    /* accumulator minus C */ \
+    VSUBPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    VSUBPD(ZMM(T2), ZMM(A2), ZMM(A2))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP)) \
+    VMOVUPD(ZMM(A2), MEM(CP, 64))
+
+// beta == -1 update of one ZMM vector of C (4x? tiles):
+//   load CP into T1; A1 := A1 - T1; store A1 back to CP.
+#define BETA_MINUS_ONE_4Z(CP, T1, A1)  \
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    /* accumulator minus C */ \
+    VSUBPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP))
+
+// Argument-count selector for BETA_MINUS_ONE: the caller's arguments fill
+// the leading slots, which shifts the matching BETA_MINUS_ONE_?Z name into
+// NAME.
+#define GET_BETA_MINUS_ONE(_1, _2, _3, _4, _5, _6, _7, NAME, ...)  NAME
+
+// Overloaded BETA_MINUS_ONE: 7, 5 or 3 arguments dispatch to
+// BETA_MINUS_ONE_12Z, BETA_MINUS_ONE_8Z or BETA_MINUS_ONE_4Z respectively.
+// `_0` and `_1` pad the even argument counts, which are never used. The
+// trailing `_2` is a discarded filler that keeps the `...` of
+// GET_BETA_MINUS_ONE non-empty: ISO C before C23 requires at least one
+// variadic argument, and the 3-argument call supplied none.
+#define BETA_MINUS_ONE(...)  \
+    GET_BETA_MINUS_ONE(__VA_ARGS__,  \
+    BETA_MINUS_ONE_12Z, _0, BETA_MINUS_ONE_8Z, _1, BETA_MINUS_ONE_4Z, _2)(__VA_ARGS__)
+
+// beta == 1 update of three ZMM vectors of C (12x? tiles):
+//   load CP, CP+64, CP+128 into T1..T3; A := A + T for each pair;
+//   store A1..A3 back to the same three addresses.
+#define BETA_ONE_12Z(CP, T1, A1, T2, A2, T3, A3)  \
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    VMOVUPD(MEM(CP, 64), ZMM(T2)) \
+    VMOVUPD(MEM(CP, 128), ZMM(T3))  \
+    /* accumulator plus C */ \
+    VADDPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    VADDPD(ZMM(T2), ZMM(A2), ZMM(A2))  \
+    VADDPD(ZMM(T3), ZMM(A3), ZMM(A3))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP)) \
+    VMOVUPD(ZMM(A2), MEM(CP, 64)) \
+    VMOVUPD(ZMM(A3), MEM(CP, 128))
+
+// beta == 1 update of two ZMM vectors of C (8x? tiles):
+//   load CP, CP+64 into T1, T2; A := A + T for each pair; store A1, A2 back.
+#define BETA_ONE_8Z(CP, T1, A1, T2, A2)  \
+    /* load C */ \
+    VMOVUPD(MEM(CP), ZMM(T1)) \
+    VMOVUPD(MEM(CP, 64), ZMM(T2)) \
+    /* accumulator plus C */ \
+    VADDPD(ZMM(T1), ZMM(A1), ZMM(A1))  \
+    VADDPD(ZMM(T2), ZMM(A2), ZMM(A2))  \
+    /* write the result back */ \
+    VMOVUPD(ZMM(A1), MEM(CP)) \
+    VMOVUPD(ZMM(A2), MEM(CP, 64))
+
+// Beta == 1 update for a single ZMM vector of one column of C (the 4x? cases).
+// Loads 64 bytes at address register C into ZMM(R1), adds them into
+// ZMM(I1), and stores ZMM(I1) back to C.
+// ZMM(R1) is overwritten with the loaded values of C.
+#define BETA_ONE_4Z(C, R1, I1)  \
+    VMOVUPD(MEM(C), ZMM(R1)) \
+    \
+    VADDPD(ZMM(R1), ZMM(I1), ZMM(I1))  \
+    \
+    VMOVUPD(ZMM(I1), MEM(C)) \
+
+// Arity selector for the BETA_ONE family: discards the first seven
+// arguments and expands to the eighth, which the caller arranges to be the
+// BETA_ONE_?Z variant matching the number of arguments supplied.
+#define GET_BETA_ONE(a1, a2, a3, a4, a5, a6, a7, SELECTED, ...)  SELECTED
+
+// Overloaded front end for the beta == 1 update. The variant is picked by
+// argument count: 7 arguments (C plus three R/I pairs) -> BETA_ONE_12Z,
+// 5 arguments -> BETA_ONE_8Z, 3 arguments -> BETA_ONE_4Z.
+// _0 and _1 are filler tokens that keep the variants at the right positions.
+// Dispatches through GET_BETA_ONE (the selector belonging to this family)
+// rather than borrowing the BETA_MINUS_ONE selector.
+#define BETA_ONE(...)  \
+    GET_BETA_ONE(__VA_ARGS__,  \
+    BETA_ONE_12Z, _0, BETA_ONE_8Z, _1, BETA_ONE_4Z)(__VA_ARGS__) \
+
+// In-register transposition of a 4x4 block whose elements are the 128-bit
+// lanes (one dcomplex each) of ZMM(R1)..ZMM(R4). The first four shuffles
+// gather lanes into the scratch registers ZMM(0)..ZMM(3); the last four
+// write the transposed block back into ZMM(R1)..ZMM(R4).
+// ZMM(0)..ZMM(3) are hard-coded scratch and are clobbered, so R1..R4 must
+// not name registers 0-3.
+#define TRANSPOSE_4x4(R1, R2, R3, R4) \
+    VSHUFF64X2(IMM(0x88), ZMM(R2), ZMM(R1), ZMM(0)) \
+    VSHUFF64X2(IMM(0x88), ZMM(R4), ZMM(R3), ZMM(2)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R2), ZMM(R1), ZMM(1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R4), ZMM(R3), ZMM(3)) \
+    VSHUFF64X2(IMM(0x88), ZMM(2), ZMM(0), ZMM(R1))  \
+    VSHUFF64X2(IMM(0x88), ZMM(3), ZMM(1), ZMM(R2))  \
+    VSHUFF64X2(IMM(0xDD), ZMM(2), ZMM(0), ZMM(R3))  \
+    VSHUFF64X2(IMM(0xDD), ZMM(3), ZMM(1), ZMM(R4))  \
+
+// In-register transposition of a 4x2 block held in the 128-bit lanes of
+// ZMM(R1) and ZMM(R2). Lanes are first gathered into the scratch registers
+// ZMM(0) and ZMM(1), then redistributed into ZMM(R1) and ZMM(R2).
+// NOTE(review): per the VSHUFF64X2 lane-selection rules the result should be
+// ZMM(R1) = {R1[0], R2[0], R1[1], R2[1]} and
+// ZMM(R2) = {R1[2], R2[2], R1[3], R2[3]} (128-bit lane indices) - confirm.
+// ZMM(0) and ZMM(1) are hard-coded scratch and are clobbered.
+#define TRANSPOSE_4x2(R1, R2) \
+    VSHUFF64X2(IMM(0x88), ZMM(R2), ZMM(R1), ZMM(0)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(R2), ZMM(R1), ZMM(1)) \
+    VSHUFF64X2(IMM(0x88), ZMM(1), ZMM(0), ZMM(R1))  \
+    VSHUFF64X2(IMM(0xDD), ZMM(1), ZMM(0), ZMM(R2))  \
+
+// Beta scaling of a 4x4 micro-tile of C when C is row-stored.
+//
+// Loads four 64-byte rows through address register C (rows at C, C+RDI,
+// then C is advanced by 2*RDI and the next two rows are read) into
+// ZMM(R1)..ZMM(R4), passes them through ALPHA_GENERIC, adds them into the
+// accumulators ZMM(I1)..ZMM(I4) and stores the accumulators.
+//
+// Things a caller must know:
+// - Loads go through the parameter C, but stores always go through RCX.
+//   Both registers are advanced by 2*RDI inside the macro, so C must be a
+//   register other than RCX that starts at the same address.
+// - RDI is used as the byte stride between rows.
+// - ALPHA_GENERIC is defined outside this section; the caller visible here
+//   broadcasts beta->real into ZMM(0) and beta->imag into ZMM(1) first, so
+//   it presumably multiplies the loaded rows by that complex scalar - confirm.
+#define BETA_GEN_ROW_4x4(C, R1, I1, R2, I2, R3, I3, R4, I4)  \
+    VMOVUPD(MEM(C), ZMM(R1))    \
+    VMOVUPD(MEM(C, RDI, 1), ZMM(R2))    \
+    LEA(MEM(C, RDI, 2), C)  \
+    VMOVUPD(MEM(C), ZMM(R3))  \
+    VMOVUPD(MEM(C, RDI, 1), ZMM(R4))  \
+    \
+    ALPHA_GENERIC(R1, R2)   \
+    ALPHA_GENERIC(R3, R4)   \
+    \
+    VADDPD(ZMM(R1), ZMM(I1), ZMM(I1))  \
+    VADDPD(ZMM(R2), ZMM(I2), ZMM(I2))  \
+    VADDPD(ZMM(R3), ZMM(I3), ZMM(I3))  \
+    VADDPD(ZMM(R4), ZMM(I4), ZMM(I4))  \
+    \
+    VMOVUPD(ZMM(I1), MEM(RCX))    \
+    VMOVUPD(ZMM(I2), MEM(RCX, RDI, 1))    \
+    LEA(MEM(RCX, RDI, 2), RCX)  \
+    VMOVUPD(ZMM(I3), MEM(RCX))  \
+    VMOVUPD(ZMM(I4), MEM(RCX, RDI, 1))  \
+
+// Beta scaling of a 4x? micro-tile of C when C is row-stored, with every
+// load and store predicated by opmask register k(3).
+//
+// Same sequence as BETA_GEN_ROW_4x4: four rows are loaded through address
+// register C (C is advanced by 2*RDI part-way through), passed through
+// ALPHA_GENERIC, added into ZMM(I1)..ZMM(I4), and stored through RCX
+// (which is also advanced by 2*RDI).
+//
+// Things a caller must know:
+// - k(3) must already hold the element mask selecting the valid part of
+//   each row.
+// - Loads use the parameter C while stores are hard-coded to RCX, so C must
+//   be a register other than RCX that starts at the same address.
+// - ALPHA_GENERIC is defined outside this section; it presumably scales by
+//   the complex value broadcast in ZMM(0)/ZMM(1) - confirm.
+#define BETA_GEN_ROW_MASK(C, R1, I1, R2, I2, R3, I3, R4, I4)  \
+    VMOVUPD(MEM(C), ZMM(R1) MASK_(k(3)))    \
+    VMOVUPD(MEM(C, RDI, 1), ZMM(R2) MASK_(k(3)))    \
+    LEA(MEM(C, RDI, 2), C)  \
+    VMOVUPD(MEM(C), ZMM(R3) MASK_(k(3)))  \
+    VMOVUPD(MEM(C, RDI, 1), ZMM(R4) MASK_(k(3)))  \
+    \
+    ALPHA_GENERIC(R1, R2)   \
+    ALPHA_GENERIC(R3, R4)   \
+    \
+    VADDPD(ZMM(R1), ZMM(I1), ZMM(I1))  \
+    VADDPD(ZMM(R2), ZMM(I2), ZMM(I2))  \
+    VADDPD(ZMM(R3), ZMM(I3), ZMM(I3))  \
+    VADDPD(ZMM(R4), ZMM(I4), ZMM(I4))  \
+    \
+    VMOVUPD(ZMM(I1), MEM(RCX) MASK_(k(3)))    \
+    VMOVUPD(ZMM(I2), MEM(RCX, RDI, 1) MASK_(k(3)))    \
+    LEA(MEM(RCX, RDI, 2), RCX)  \
+    VMOVUPD(ZMM(I3), MEM(RCX) MASK_(k(3)))  \
+    VMOVUPD(ZMM(I4), MEM(RCX, RDI, 1) MASK_(k(3)))  \
+
+// In-register transposition of a 2x2 block whose elements are the 128-bit
+// halves (one dcomplex each) of YMM(R1) and YMM(R2). The result overwrites
+// YMM(R1) and YMM(R2).
+// YMM(2) and YMM(3) are hard-coded scratch and are clobbered, so R1/R2 must
+// not name registers 2 or 3.
+#define TRANSPOSE_2x2(R1, R2) \
+    VUNPCKLPD(YMM(R2), YMM(R1), YMM(2)) \
+    VUNPCKHPD(YMM(R2), YMM(R1), YMM(3)) \
+    VPERMPD(IMM(0xD8), YMM(2), YMM(2))  \
+    VPERMPD(IMM(0xD8), YMM(3), YMM(3))  \
+    VUNPCKLPD(YMM(3), YMM(2), YMM(R1))  \
+    VUNPCKHPD(YMM(3), YMM(2), YMM(R2))  \
+
+// Beta scaling of a 2x4 micro-tile of C when C is row-stored.
+//
+// Loads one YMM vector from C and one from C + 2*RSI into YMM(R1)/YMM(R2),
+// multiplies each by YMM(0) and by YMM(1), swaps the two doubles inside
+// every 128-bit half of the YMM(1) products, combines the two products with
+// VADDSUBPD, adds the result into YMM(I1)/YMM(I2) and stores those back to
+// the addresses that were loaded.
+//
+// - Assumes YMM(0) and YMM(1) hold the broadcast real and imaginary parts
+//   of beta on entry (the callers are outside this section) - confirm.
+// - YMM(2) and YMM(3) are hard-coded scratch and are clobbered.
+// - NOTE(review): the VPERMILPD immediate here is 0x5, whereas the sibling
+//   BETA_GEN_ROW_2x3/2x2/2x1 macros use 0x55. The 256-bit form presumably
+//   consumes only the low four bits, making the two equivalent - confirm.
+#define BETA_GEN_ROW_2x4(C, R1, I1, R2, I2) \
+    VMOVUPD(MEM(C), YMM(R1))  \
+    VMOVUPD(MEM(C, RSI, 2), YMM(R2))  \
+    \
+    VMULPD(YMM(0), YMM(R1), YMM(2))  \
+    VMULPD(YMM(1), YMM(R1), YMM(R1)) \
+    VMULPD(YMM(0), YMM(R2), YMM(3))  \
+    VMULPD(YMM(1), YMM(R2), YMM(R2)) \
+    \
+    VPERMILPD(IMM(0x5), YMM(R1), YMM(R1)) \
+    VPERMILPD(IMM(0x5), YMM(R2), YMM(R2)) \
+    \
+    VADDSUBPD(YMM(R1), YMM(2), YMM(R1)) \
+    VADDSUBPD(YMM(R2), YMM(3), YMM(R2)) \
+    \
+    VADDPD(YMM(R1), YMM(I1), YMM(I1))   \
+    VADDPD(YMM(R2), YMM(I2), YMM(I2))   \
+    \
+    VMOVUPD(YMM(I1), MEM(C))  \
+    VMOVUPD(YMM(I2), MEM(C, RSI, 2))  \
+
+// Beta scaling of a 2x3 micro-tile of C when C is row-stored.
+//
+// For each of two rows (the second reached by adding RDI to C), loads a
+// YMM vector from C and a further XMM vector from C + 2*RSI. All four
+// loaded vectors are multiplied by YMM(0) and by YMM(1), the YMM(1) products
+// have the two doubles of every 128-bit half swapped, and the product pairs
+// are combined with VADDSUBPD.
+//
+// The upper 128 bits of YMM(I3) are extracted into XMM(R3) before the final
+// additions, so YMM(I3) supplies the accumulator for both XMM-sized pieces:
+// its low half is stored with the first row and XMM(R3) with the second.
+//
+// Things a caller must know:
+// - Loads go through the parameter C, but stores always go through RCX.
+//   Both are advanced by RDI inside the macro, so C must be a register
+//   other than RCX that starts at the same address.
+// - Registers 2, 3, 11, 12, 13 and 14 are hard-coded scratch and are
+//   clobbered.
+// - Assumes YMM(0) and YMM(1) hold the broadcast real and imaginary parts
+//   of beta on entry (the callers are outside this section) - confirm.
+#define BETA_GEN_ROW_2x3(C, R1, I1, R2, I2, R3, I3) \
+    VMOVUPD(MEM(C), YMM(R1))  \
+    VMOVUPD(MEM(C, RSI, 2), XMM(11))  \
+    ADD(RDI, C) \
+    VMOVUPD(MEM(C), YMM(R2))  \
+    VMOVUPD(MEM(C, RSI, 2), XMM(12))  \
+    \
+    VMULPD(YMM(0), YMM(R1), YMM(2))  \
+    VMULPD(YMM(1), YMM(R1), YMM(R1)) \
+    VMULPD(YMM(0), YMM(R2), YMM(3))  \
+    VMULPD(YMM(1), YMM(R2), YMM(R2)) \
+    VMULPD(YMM(0), YMM(11), YMM(13)) \
+    VMULPD(YMM(1), YMM(11), YMM(11)) \
+    VMULPD(YMM(0), YMM(12), YMM(14))  \
+    VMULPD(YMM(1), YMM(12), YMM(12)) \
+    \
+    VPERMILPD(IMM(0x55), YMM(R1), YMM(R1)) \
+    VPERMILPD(IMM(0x55), YMM(R2), YMM(R2)) \
+    VPERMILPD(IMM(0x55), YMM(11), YMM(11)) \
+    VPERMILPD(IMM(0x55), YMM(12), YMM(12)) \
+    \
+    VADDSUBPD(YMM(R1), YMM(2), YMM(R1)) \
+    VADDSUBPD(YMM(R2), YMM(3), YMM(R2)) \
+    VADDSUBPD(YMM(11), YMM(13), YMM(11)) \
+    VADDSUBPD(YMM(12), YMM(14), YMM(12)) \
+    \
+    VEXTRACTF128(IMM(0x1), YMM(I3), XMM(R3))  \
+    \
+    VADDPD(YMM(R1), YMM(I1), YMM(I1))   \
+    VADDPD(YMM(R2), YMM(I2), YMM(I2))   \
+    VADDPD(YMM(11), YMM(I3), YMM(I3))   \
+    VADDPD(YMM(12), YMM(R3), YMM(R3))   \
+    \
+    VMOVUPD(YMM(I1), MEM(RCX))  \
+    VMOVUPD(XMM(I3), MEM(RCX, RSI, 2))  \
+    ADD(RDI, RCX) \
+    VMOVUPD(YMM(I2), MEM(RCX))  \
+    VMOVUPD(XMM(R3), MEM(RCX, RSI, 2))  \
+
+// Beta scaling of a 2x2 micro-tile of C when C is row-stored.
+//
+// Loads one YMM vector from C and one from C + RDI into YMM(R1)/YMM(R2),
+// multiplies each by YMM(0) and by YMM(1), swaps the two doubles inside
+// every 128-bit half of the YMM(1) products, combines the two products with
+// VADDSUBPD, adds the result into YMM(I1)/YMM(I2) and stores those back to
+// the addresses that were loaded. C itself is not modified.
+//
+// - Assumes YMM(0) and YMM(1) hold the broadcast real and imaginary parts
+//   of beta on entry (the callers are outside this section) - confirm.
+// - YMM(2) and YMM(3) are hard-coded scratch and are clobbered.
+#define BETA_GEN_ROW_2x2(C, R1, I1, R2, I2) \
+    VMOVUPD(MEM(C), YMM(R1))  \
+    VMOVUPD(MEM(C, RDI, 1), YMM(R2))  \
+    \
+    VMULPD(YMM(0), YMM(R1), YMM(2))  \
+    VMULPD(YMM(1), YMM(R1), YMM(R1)) \
+    VMULPD(YMM(0), YMM(R2), YMM(3))  \
+    VMULPD(YMM(1), YMM(R2), YMM(R2)) \
+    \
+    VPERMILPD(IMM(0x55), YMM(R1), YMM(R1)) \
+    VPERMILPD(IMM(0x55), YMM(R2), YMM(R2)) \
+    \
+    VADDSUBPD(YMM(R1), YMM(2), YMM(R1)) \
+    VADDSUBPD(YMM(R2), YMM(3), YMM(R2)) \
+    \
+    VADDPD(YMM(R1), YMM(I1), YMM(I1))   \
+    VADDPD(YMM(R2), YMM(I2), YMM(I2))   \
+    \
+    VMOVUPD(YMM(I1), MEM(C))  \
+    VMOVUPD(YMM(I2), MEM(C, RDI, 1))  \
+
+// Beta scaling of a 2x1 micro-tile of C when C is row-stored.
+//
+// Loads one XMM vector (a single dcomplex) from C and one from C + RDI into
+// the fixed registers 14 and 15, multiplies each by YMM(0) and by YMM(1),
+// swaps the two doubles of the YMM(1) products, combines the two products
+// with VADDSUBPD, adds the results into YMM(R1) and YMM(I1), and stores
+// the low 128 bits of those two registers to C and C + RDI.
+//
+// - Unlike the sibling macros, R1 and I1 are BOTH accumulators here (first
+//   and second row respectively); the loaded values live in registers 14/15.
+// - Registers 2, 3, 14 and 15 are hard-coded scratch and are clobbered.
+// - Assumes YMM(0) and YMM(1) hold the broadcast real and imaginary parts
+//   of beta on entry (the callers are outside this section) - confirm.
+#define BETA_GEN_ROW_2x1(C, R1, I1) \
+    VMOVUPD(MEM(C), XMM(14))  \
+    VMOVUPD(MEM(C, RDI, 1), XMM(15))  \
+    \
+    VMULPD(YMM(0), YMM(14), YMM(2))  \
+    VMULPD(YMM(1), YMM(14), YMM(14)) \
+    VMULPD(YMM(0), YMM(15), YMM(3))  \
+    VMULPD(YMM(1), YMM(15), YMM(15)) \
+    \
+    VPERMILPD(IMM(0x55), YMM(14), YMM(14)) \
+    VPERMILPD(IMM(0x55), YMM(15), YMM(15)) \
+    \
+    VADDSUBPD(YMM(14), YMM(2), YMM(14)) \
+    VADDSUBPD(YMM(15), YMM(3), YMM(15)) \
+    \
+    VADDPD(YMM(14), YMM(R1), YMM(R1))   \
+    VADDPD(YMM(15), YMM(I1), YMM(I1))   \
+    \
+    VMOVUPD(XMM(R1), MEM(C))  \
+    VMOVUPD(XMM(I1), MEM(C, RDI, 1))  \
+
+/*
+   ccc:
+     | | | |         | | | |        | | | |
+     | | | |   +=    | | | | ...    | | | | ...
+     | | | |         | | | |        | | | |
+     | | | |         | | | |        | | | |
+
+   ccr:
+     | | | |        | | | |       --------
+     | | | |   +=   | | | | ...   --------
+     | | | |        | | | |       --------
+     | | | |        | | | |           :
+
+   Assumptions:
+   - A is column stored;
+   - B is row-stored or column-stored;
+   Therefore, this (c)olumn-preferential kernel is well-suited for contiguous
+   (v)ector loads on A and single-element broadcasts from B.
+
+   NOTE: These kernels explicitly support row-oriented IO, implemented
+   via an in-register transpose. And thus they also support the rcc and
+   rcr cases, though only rcc is ever utilized (because rcr is handled by
+   transposing the operation and executing ccr, which does not incur the
+   cost of the in-register transpose).
+
+   rcc:
+     ---------       | | | |      | | | |
+     ---------  +=   | | | | ...  | | | | ...
+     ---------       | | | |      | | | |
+     ---------       | | | |      | | | |
+
+*/
+
+/*
+   bli_zgemmsup_cv_zen4_asm_12x4m
+
+   Double-complex GEMM "sup" kernel that walks the m dimension in panels of
+   MR rows and updates an MR x NR micro-tile of C per panel.
+
+   Flow:
+   1. If n0 is not a multiple of NR, the whole call is forwarded to the
+      12x3m / 12x2m / 12x1m kernel matching n0 % NR and this function
+      returns.
+   2. Otherwise the inline assembly runs m0 / MR iterations. Each iteration
+      accumulates A*B over k0 (unrolled by 4, plus a k0 % 4 remainder loop),
+      applies alpha, then applies beta and writes C. C is treated as
+      row-stored when cs_c == 1 and as column-stored otherwise.
+   3. The remaining m0 % MR rows are handled by the 8x4, 4x4 and 2x4 kernels
+      and, for a single leftover row, by bli_zgemv_ex.
+
+   Parameters:
+   - conja, conjb : not referenced by the assembly in this function; only
+                    forwarded to the kernels called from here.
+   - m0, n0, k0   : problem dimensions.
+   - alpha, beta  : complex scalars. Real values of 1 and -1 (and 0 for
+                    beta) select dedicated code paths for column-stored C.
+   - a, rs_a0, cs_a0 / b, rs_b0, cs_b0 / c, rs_c0, cs_c0 :
+                    matrix base pointers and strides, in elements.
+   - data         : supplies the panel stride of A (bli_auxinfo_ps_a).
+   - cntx         : forwarded to the kernels called from here.
+*/
+void bli_zgemmsup_cv_zen4_asm_12x4m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t n_left = n0 % NR;
+    // Checking whether this is an edge case in the n dimension.
+    // If so, dispatch other 12x?m kernels, as needed.
+    if ( n_left )
+    {
+      dcomplex*  cij = c;
+      dcomplex*  bj  = b;
+      dcomplex*  ai  = a;
+
+      if ( 3 == n_left )
+      {
+        const dim_t nr_cur = 3;
+        bli_zgemmsup_cv_zen4_asm_12x3m(conja, conjb, m0, nr_cur, k0,
+                                       alpha, ai, rs_a0, cs_a0,
+                                       bj, rs_b0, cs_b0, beta,
+                                       cij, rs_c0, cs_c0,
+                                       data, cntx);
+      }
+
+      if ( 2 == n_left )
+      {
+        const dim_t nr_cur = 2;
+        bli_zgemmsup_cv_zen4_asm_12x2m(conja, conjb, m0, nr_cur, k0,
+                                       alpha, ai, rs_a0, cs_a0,
+                                       bj, rs_b0, cs_b0, beta,
+                                       cij, rs_c0, cs_c0,
+                                       data, cntx);
+      }
+      if ( 1 == n_left )
+      {
+        const dim_t nr_cur = 1;
+        bli_zgemmsup_cv_zen4_asm_12x1m(conja, conjb, m0, nr_cur, k0,
+                                       alpha, ai, rs_a0, cs_a0,
+                                       bj, rs_b0, cs_b0, beta,
+                                       cij, rs_c0, cs_c0,
+                                       data, cntx);
+      }
+      // The edge kernel has handled the whole call; nothing more to do here.
+      return;
+    }
+    // Main kernel
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Obtaining the panel stride for A, in case of packing.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a16  = ps_a * sizeof( dcomplex ); // Panel stride in bytes
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+    uint64_t m_iter = m0 / MR; // To be used for MR loop in the kernel
+    uint64_t m_left = m0 % MR; // To be used to dispatch ?x4m kernels
+
+    // Fewer than MR rows: skip the assembly and go straight to the m edge cases.
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // Assigning the type of alpha and beta scaling
+    // In order to facilitate handling special cases separately
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    if(alpha->imag == 0.0)// (alpha is real)
+    {
+        if(alpha->real == 1.0)          alpha_mul_type = BLIS_MUL_ONE;
+        else if(alpha->real == -1.0)    alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if(beta->imag == 0.0)// (beta is real)
+    {
+        if(beta->real == 1.0)       beta_mul_type = BLIS_MUL_ONE;
+        else if(beta->real == -1.0) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if(beta->real == 0.0)  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    BEGIN_ASM()
+    MOV(VAR(a), R10)          // R10 = base addr of A (MCXKC block)
+    MOV(VAR(b), RDX)          // RDX = base addr of B (KCXNR block)
+    MOV(VAR(c), R12)          // R12 = base addr of C (MCxNR block)
+
+    // Each stride below is scaled by 8 and then by 2, i.e. by 16 bytes.
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(VAR(m_iter), R11) // Iterating in steps of MR, until MC(m var)
+    LABEL(.ZMLOOP)
+    MOV(R10, RAX)     // RAX = addr of A for the MRxKC block
+    MOV(RDX, RBX)     // RBX = addr of B for the KCxNR block
+    MOV(R12, RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers for arithmetic and accumulation
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(var(k_iter), R8)
+
+    // Main loop for k
+    /*
+      The implementation facilitates C prefetching(in case of column-storage) onto
+      L1 cache before accessing it. The k-loop is dissected into 3 segments, namely
+      (B)efore (P)refetch, (D)uring (P)refetch and (A)fter (P)refetch. (D)uring (P)refetch
+      segment prefetches C over 4 unrolled units of the 12x4 micro-tile computation in the k-loop.
+      (A)fter (P)refetch segment runs over PREFETCH_DIST_C unrolled units of k-loop.
+    */
+    SUB(IMM(4 + PREFETCH_DIST_C), R8)
+    JLE(.ZK_DP)
+    // Iterations of k(unroll factor = 4) before prefetching
+    LABEL(.ZKITERLOOP_BP)     // K loop (B)efore (P)refetch of C
+
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+
+    DEC(R8)             // k_iter -= 1
+    JNZ(.ZKITERLOOP_BP)
+
+    LABEL(.ZK_DP)       // Prefetching over computation
+    ADD(IMM(4), R8)     // Check if iterations available to prefetch over
+    JLE(.ZK_AP)         // Jump without prefetching if not available
+    MOV(RCX, R9)        // R9 walks C by RSI (one column stride) per iteration
+    LABEL(.ZKITERLOOP_DP) // K loop (D)uring (P)refetch of C
+
+    PREFETCH(1, MEM(R9))
+    PREFETCH(1, MEM(R9, 64))
+    PREFETCH(1, MEM(R9, 128))
+
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+
+    ADD(RSI, R9)
+
+    DEC(R8)             // k_iter -= 1
+    JNZ(.ZKITERLOOP_DP)
+
+    LABEL(.ZK_AP)         // Computation after prefetching
+    ADD(IMM(0 + PREFETCH_DIST_C), R8) // Check if enough iterations are available
+    JLE(.ZKLEFT)          // Jump if not available
+    LABEL(.ZKITERLOOP_AP) // K loop (A)fter (P)refetch of C
+
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+    MICRO_TILE_12x4
+
+    DEC(R8)             // k_iter -= 1
+    JNZ(.ZKITERLOOP_AP)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_12x4
+
+    DEC(R8)             // k_left -= 1
+    JNZ(.ZKLEFTLOOP)
+
+    /*
+      ZMM(5), ZMM(7), ... , ZMM(27) contain accumulations due to
+      real components broadcasted from B.
+
+      ZMM(6), ZMM(8), ... , ZMM(28) contain accumulations due to
+      imaginary components broadcasted from B.
+    */
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 12 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8, 10)
+    PERMUTE(12, 14, 16)
+    PERMUTE(18, 20, 22)
+    PERMUTE(24, 26, 28)
+
+    // Final accumulation for A*B on 12 reg using the 24 reg.
+    ACC_COL(5, 6, 7, 8, 9, 10)
+    ACC_COL(11, 12, 13, 14, 15, 16)
+    ACC_COL(17, 18, 19, 20, 21, 22)
+    ACC_COL(23, 24, 25, 26, 27, 28)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12  ZMM18  ZMM24
+      ZMM8  ZMM14  ZMM20  ZMM26
+      ZMM10 ZMM16  ZMM22  ZMM28
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha_mul_type), AL)
+    CMP(IMM(0xFF), AL) // Checking if alpha == -1
+    JNE(.ALPHA_GENERAL)
+    // Handling when alpha == -1
+    VXORPD(ZMM(2), ZMM(2), ZMM(2)) // Resetting ZMM(2) to 0
+
+    // Applying alpha == -1 to A*B, one column at a time
+    // (ALPHA_MINUS_ONE presumably negates using the zeroed ZMM(2) - confirm)
+    ALPHA_MINUS_ONE(6, 8, 10)
+    ALPHA_MINUS_ONE(12, 14, 16)
+    ALPHA_MINUS_ONE(18, 20, 22)
+    ALPHA_MINUS_ONE(24, 26, 28)
+    JMP(.BETA_SCALE)
+
+    LABEL(.ALPHA_GENERAL)
+    CMP(IMM(2), AL) // Checking if alpha == BLIS_MUL_DEFAULT
+    JNE(.BETA_SCALE) // Any other type needs no scaling here
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8, 10)
+    ALPHA_GENERIC(12, 14, 16)
+    ALPHA_GENERIC(18, 20, 22)
+    ALPHA_GENERIC(24, 26, 28)
+
+    // Beta scaling
+    /*
+      The final result of the GEMM operation is obtained in 2 steps:
+      1. Loading C and beta scaling over loaded registers.
+      2. Adding with registers containing alpha*A*B
+
+      ZMM(5), ZMM(7), ... , ZMM(27) are used for implementing the first step.
+      Final result of the GEMM operation is accumulated over ZMM(6), ZMM(8), ... , ZMM(28).
+    */
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 bytes means cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+    CMP(IMM(0x01), AL) // Checking if beta == 1
+    JE(.ADD)
+    CMP(IMM(0xFF), AL) // Checking if beta == -1
+    JNE(.BETA_GENERAL)
+
+    // Subtracting C from alpha*A*B, one column at a time
+    BETA_MINUS_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)
+    BETA_MINUS_ONE(RCX, 11, 12, 13, 14, 15, 16)
+    ADD(RSI, RCX)
+    BETA_MINUS_ONE(RCX, 17, 18, 19, 20, 21, 22)
+    ADD(RSI, RCX)
+    BETA_MINUS_ONE(RCX, 23, 24, 25, 26, 27, 28)
+    JMP(.END)
+
+    LABEL(.BETA_GENERAL) // Reached when beta is none of 0, 1 or -1
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 11, 12, 13, 14, 15, 16)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 17, 18, 19, 20, 21, 22)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 23, 24, 25, 26, 27, 28)
+    JMP(.END)
+
+    // Handling when beta == 1
+    LABEL(.ADD)
+    // Adding C to alpha*A*B, one column at a time
+    BETA_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)
+    BETA_ONE(RCX, 11, 12, 13, 14, 15, 16)
+    ADD(RSI, RCX)
+    BETA_ONE(RCX, 17, 18, 19, 20, 21, 22)
+    ADD(RSI, RCX)
+    BETA_ONE(RCX, 23, 24, 25, 26, 27, 28)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    LEA(MEM(RCX, RSI, 2), R9)         // R9 = RCX + 2*cs_c (third column)
+    VMOVUPD(ZMM(6), MEM(RCX))
+    VMOVUPD(ZMM(8), MEM(RCX, 64))
+    VMOVUPD(ZMM(10), MEM(RCX, 128))
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))
+    VMOVUPD(ZMM(14), MEM(RCX, RSI, 1, 64))
+    VMOVUPD(ZMM(16), MEM(RCX, RSI, 1, 128))
+
+    VMOVUPD(ZMM(18), MEM(R9))
+    VMOVUPD(ZMM(20), MEM(R9, 64))
+    VMOVUPD(ZMM(22), MEM(R9, 128))
+
+    VMOVUPD(ZMM(24), MEM(R9, RSI, 1))
+    VMOVUPD(ZMM(26), MEM(R9, RSI, 1, 64))
+    VMOVUPD(ZMM(28), MEM(R9, RSI, 1, 128))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 12x4 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    TRANSPOSE_4x4(10, 16, 22, 28)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+
+      ZMM10
+      ZMM16
+      ZMM22
+      ZMM28
+    */
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)       // R9 is the load pointer; RCX remains the store pointer
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Handling when beta != 0
+    // Each macro call advances R9 and RCX by 2 rows; the LEA pair after it
+    // adds the other 2 rows so that both point at the next block of 4 rows.
+    BETA_GEN_ROW_4x4(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_4x4(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_4x4(R9, 9, 10, 15, 16, 21, 22, 27, 28)
+    JMP(.END)
+
+    // Handling when beta == 0
+    // Rows are written out of order so that every address is reachable from
+    // RCX or R9 with a scale factor of 1, 2, 4 or 8.
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)          // R9 = RCX + 3*rs_c
+    VMOVUPD(ZMM(6), MEM(RCX))             // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1))    // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2))    // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4))     // row 4
+    VMOVUPD(ZMM(10), MEM(RCX, RDI, 8))    // row 8
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)        // RCX = RCX + 6*rs_c
+    VMOVUPD(ZMM(24), MEM(R9))             // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2))     // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4))     // row 7
+    VMOVUPD(ZMM(28), MEM(R9, RDI, 8))     // row 11
+
+    LEA(MEM(R9, RDI, 4), R9)
+    LEA(MEM(R9, RDI, 2), R9)          // R9 = start of the micro-tile + 9*rs_c
+    VMOVUPD(ZMM(20), MEM(RCX))            // row 6
+    VMOVUPD(ZMM(22), MEM(RCX, RDI, 4))    // row 10
+
+    VMOVUPD(ZMM(16), MEM(R9))             // row 9
+
+    LABEL(.END)
+    /*
+      Adjusting the addresses for loading the
+      next micro panel from A and the next micro
+      tile from C.
+    */
+    MOV(VAR(ps_a16), RBX)
+    ADD(RBX, R10)                     // A += panel stride (bytes)
+    LEA(MEM(R12, RDI, 8), R12)
+    LEA(MEM(R12, RDI, 4), R12)        // C += 12*rs_c (bytes)
+
+    DEC(R11)
+    JNE(.ZMLOOP)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [m_iter]  "m" (m_iter),
+      [m_left]  "m" (m_left),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha_mul_type]  "m" (alpha_mul_type),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [ps_a16]   "m" (ps_a16),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:;
+    // Handle edge cases in the m dimension, if they exist.
+    // The leftover rows are peeled off in chunks of 8, 4, 2 and finally 1.
+    if ( m_left )
+    {
+      const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+      // First row of C and first panel of A not covered by the main loop.
+      dcomplex* restrict cij = c + i_edge * rs_c;
+      dcomplex* restrict ai  = a + m_iter * ps_a;
+      dcomplex* restrict bj  = b;
+
+      if (8 <= m_left)
+      {
+        const dim_t      mr_cur = 8;
+        bli_zgemmsup_cv_zen4_asm_8x4(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (4 <= m_left)
+      {
+        const dim_t      mr_cur = 4;
+        bli_zgemmsup_cv_zen4_asm_4x4(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (2 <= m_left)
+      {
+        const dim_t      mr_cur = 2;
+        bli_zgemmsup_cv_zen4_asm_2x4(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if ( 1 == m_left )
+      {
+        // A single remaining row is computed as a matrix-vector product:
+        // B is passed as the (transposed) matrix, the row of A as the input
+        // vector and the row of C as the output vector.
+        bli_zgemv_ex
+        (
+          BLIS_TRANSPOSE, conja, k0, n0,
+          alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+          beta, cij, cs_c0, cntx, NULL
+        );
+      }
+    }
+}
+
+void bli_zgemmsup_cv_zen4_asm_12x3m
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // This kernel is invoked at the beginning of 12x4m
+    // In case of n_left == 3
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Obtaining the panel stride for A, In case of packing.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a16  = ps_a * sizeof( dcomplex );
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+    uint64_t m_iter = m0 / MR; // To be used for MR loop in the kernel
+    uint64_t m_left = m0 % MR;
+
+    /*
+      The mask bits below are set for ensuring ?x3 compatability
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, sinze a ZMM register holds 8 double precision elements.
+    */
+    uint8_t trans_load_mask = 0x3F; // Mask for transposing and loading = 0b 00 11 11 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 16 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // Assigning the type of alpha and beta scaling
+    // In order to facilitate handling special cases seperately
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    if(alpha->imag == 0.0)// (alpha is real)
+    {
+        if(alpha->real == 1.0)          alpha_mul_type = BLIS_MUL_ONE;
+        else if(alpha->real == -1.0)    alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if(beta->imag == 0.0)// (beta is real)
+    {
+        if(beta->real == 1.0)       beta_mul_type = BLIS_MUL_ONE;
+        else if(beta->real == -1.0) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if(beta->real == 0.0)  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    BEGIN_ASM()
+    MOV(VAR(a), R10)          // R10 = base addr of A (MCXKC block)
+    MOV(VAR(b), RDX)
+    MOV(VAR(c), R12)          // R12 = base addr of C (MCxNR block)
+
+    MOV(VAR(ps_a16), R11)
+    LEA(MEM(, R11, 8), R11)
+    LEA(MEM(, R11, 2), R11)   // R11 = sizeof(dcomplex)*ps_a16
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX)
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(VAR(m_iter), R11) // Iterating in steps of MR, until MC(m var)
+    LABEL(.ZMLOOP)
+    MOV(R10, RAX)     // RAX = addr of A for the MRxKC block
+    MOV(RDX, RBX)     // RBX = addr of B for the KCxNR block
+    MOV(R12, RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(var(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)
+    // Main loop for k
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_12x3
+    MICRO_TILE_12x3
+    MICRO_TILE_12x3
+    MICRO_TILE_12x3
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_12x3
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 9 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8, 10)
+    PERMUTE(12, 14, 16)
+    PERMUTE(18, 20, 22)
+
+    // Final accumulation for A*B on 9 reg using the 24 reg.
+    ACC_COL(5, 6, 7, 8, 9, 10)
+    ACC_COL(11, 12, 13, 14, 15, 16)
+    ACC_COL(17, 18, 19, 20, 21, 22)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12  ZMM18
+      ZMM8  ZMM14  ZMM20
+      ZMM10 ZMM16  ZMM22
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha_mul_type), AL)
+    CMP(IMM(0xFF), AL) // Checking if alpha == -1
+    JNE(.ALPHA_GENERAL)
+    // Handling when alpha == -1
+    VXORPD(ZMM(2), ZMM(2), ZMM(2)) // Resetting ZMM(2) to 0
+
+    // Subtracting C from alpha*A*B, one column at a time
+    ALPHA_MINUS_ONE(6, 8, 10)
+    ALPHA_MINUS_ONE(12, 14, 16)
+    ALPHA_MINUS_ONE(18, 20, 22)
+    JMP(.BETA_SCALE)
+
+    LABEL(.ALPHA_GENERAL)
+    CMP(IMM(2), AL) // Checking if alpha == BLIS_MUL_DEFAULT
+    JNE(.BETA_SCALE)
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8, 10)
+    ALPHA_GENERIC(12, 14, 16)
+    ALPHA_GENERIC(18, 20, 22)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+    CMP(IMM(0x01), AL) // Checking if beta == 1
+    JE(.ADD)
+    CMP(IMM(0xFF), AL) // Checking if beta == -1
+    JNE(.BETA_GENERAL)
+
+    // Subtracting C from alpha*A*B, one column at a time
+    BETA_MINUS_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)
+    BETA_MINUS_ONE(RCX, 11, 12, 13, 14, 15, 16)
+    ADD(RSI, RCX)
+    BETA_MINUS_ONE(RCX, 17, 18, 19, 20, 21, 22)
+    JMP(.END)
+
+    LABEL(.BETA_GENERAL) // Checking if beta == BLIS_MUL_DEFAULT
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 11, 12, 13, 14, 15, 16)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 17, 18, 19, 20, 21, 22)
+    JMP(.END)
+
+    // Handling when beta == 1
+    LABEL(.ADD)
+    // Adding C to alpha*A*B, one column at a time
+    BETA_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)
+    BETA_ONE(RCX, 11, 12, 13, 14, 15, 16)
+    ADD(RSI, RCX)
+    BETA_ONE(RCX, 17, 18, 19, 20, 21, 22)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))
+    VMOVUPD(ZMM(8), MEM(RCX, 64))
+    VMOVUPD(ZMM(10), MEM(RCX, 128))
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))
+    VMOVUPD(ZMM(14), MEM(RCX, RSI, 1, 64))
+    VMOVUPD(ZMM(16), MEM(RCX, RSI, 1, 128))
+
+    VMOVUPD(ZMM(18), MEM(RCX, RSI, 2))
+    VMOVUPD(ZMM(20), MEM(RCX, RSI, 2, 64))
+    VMOVUPD(ZMM(22), MEM(RCX, RSI, 2, 128))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 12x4 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    TRANSPOSE_4x4(10, 16, 22, 28)
+    /*
+      The layout post transposition and accumalation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+
+      ZMM10
+      ZMM16
+      ZMM22
+      ZMM28
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 9, 10, 15, 16, 21, 22, 27, 28)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4) MASK_(k(3)))
+    VMOVUPD(ZMM(10), MEM(RCX, RDI, 8) MASK_(k(3)))
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    VMOVUPD(ZMM(24), MEM(R9) MASK_(k(3)))
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2) MASK_(k(3)))
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4) MASK_(k(3)))
+    VMOVUPD(ZMM(28), MEM(R9, RDI, 8) MASK_(k(3)))
+
+    LEA(MEM(R9, RDI, 4), R9)
+    LEA(MEM(R9, RDI, 2), R9)
+    VMOVUPD(ZMM(20), MEM(RCX) MASK_(k(3)))
+    VMOVUPD(ZMM(22), MEM(RCX, RDI, 4) MASK_(k(3)))
+
+    VMOVUPD(ZMM(16), MEM(R9) MASK_(k(3)))
+
+    LABEL(.END)
+    /*
+      Adjusting the addresses for loading the
+      next micro panel from A and the next micro
+      tile from C.
+    */
+    MOV(VAR(ps_a16), RBX)
+    ADD(RBX, R10)
+    LEA(MEM(R12, RDI, 8), R12)
+    LEA(MEM(R12, RDI, 4), R12)
+
+    DEC(R11)
+    JNE(.ZMLOOP)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [m_iter]  "m" (m_iter),
+      [m_left]  "m" (m_left),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [alpha_mul_type]   "m" (alpha_mul_type),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [ps_a16]   "m" (ps_a16),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+      const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+      dcomplex* restrict cij = c + i_edge * rs_c;
+      dcomplex* restrict ai  = a + m_iter * ps_a;
+      dcomplex* restrict bj  = b;
+
+      if (8 <= m_left)
+      {
+        const dim_t      mr_cur = 8;
+        bli_zgemmsup_cv_zen4_asm_8x3(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (4 <= m_left)
+      {
+        const dim_t      mr_cur = 4;
+        bli_zgemmsup_cv_zen4_asm_4x3(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (2 <= m_left)
+      {
+        const dim_t      mr_cur = 2;
+        bli_zgemmsup_cv_zen4_asm_2x3(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if ( 1 == m_left )
+      {
+        bli_zgemv_ex
+        (
+          BLIS_TRANSPOSE, conja, k0, n0,
+          alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+          beta, cij, cs_c0, cntx, NULL
+        );
+      }
+    }
+}
+
+void bli_zgemmsup_cv_zen4_asm_12x2m
+     (
+       conj_t       conja,  // forwarded to the edge-case kernels and bli_zgemv_ex; not an asm operand
+       conj_t       conjb,  // forwarded to the edge-case kernels only; not an asm operand
+       dim_t        m0,     // number of rows of C to compute
+       dim_t        n0,     // forwarded to the edge-case paths; NOTE(review): presumably 2 here - confirm with caller
+       dim_t        k0,     // inner (k) dimension
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data, // supplies the panel stride of A (bli_auxinfo_ps_a)
+       cntx_t*    restrict cntx  // forwarded to the edge-case paths
+     )
+{
+    // Computes a 2-column-wide slab of C (C := beta*C + alpha*A*B), MR rows per asm iteration.
+    // Per the original author's note it is invoked at the beginning of 12x4m when n_left == 2.
+    uint64_t rs_a   = rs_a0; // not an asm operand; only used by the C edge-case code below
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Panel stride of A in elements (taken from auxinfo), and the same stride in bytes.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a16  = ps_a * sizeof( dcomplex );
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+    uint64_t m_iter = m0 / MR; // To be used for MR loop in the kernel
+    uint64_t m_left = m0 % MR; // Leftover rows, handled in C after the asm block
+
+    /*
+      The mask bits below are set for ensuring ?x2 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x0F; // mask for transposing and loading = 0b 00 00 11 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 32 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    if ( m_iter == 0 ) goto consider_edge_cases; // Fewer than MR rows: skip the asm block
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // Assigning the type of alpha and beta scaling
+    // In order to facilitate handling special cases separately
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    if(alpha->imag == 0.0)// (alpha is real)
+    {
+        if(alpha->real == 1.0)          alpha_mul_type = BLIS_MUL_ONE;
+        else if(alpha->real == -1.0)    alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if(beta->imag == 0.0)// (beta is real)
+    {
+        if(beta->real == 1.0)       beta_mul_type = BLIS_MUL_ONE;
+        else if(beta->real == -1.0) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if(beta->real == 0.0)  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    BEGIN_ASM()
+    MOV(VAR(a), R10)          // R10 = base addr of A (MCXKC block)
+    MOV(VAR(b), RDX)          // RDX = base addr of B (KCXNR block)
+    MOV(VAR(c), R12)          // R12 = base addr of C (MCxNR block)
+
+    MOV(VAR(ps_a16), R11)
+    LEA(MEM(, R11, 8), R11)
+    LEA(MEM(, R11, 2), R11)   // R11 = 16*ps_a16; never read, R11 is reloaded with m_iter before the M loop
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX)
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(VAR(m_iter), R11) // Iterating in steps of MR, until MC(m var)
+    LABEL(.ZMLOOP)
+    MOV(R10, RAX)     // RAX = addr of A for the MRxKC block
+    MOV(RDX, RBX)     // RBX = addr of B for the KCxNR block
+    MOV(R12, RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(var(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)       // k0 < 4: go straight to the remainder loop
+    // Main loop for k
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_12x2
+    MICRO_TILE_12x2
+    MICRO_TILE_12x2
+    MICRO_TILE_12x2
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)   // k0 is a multiple of 4: no remainder iterations
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_12x2
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 6 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8, 10)
+    PERMUTE(12, 14, 16)
+
+    // Final accumulation for A*B on 6 reg using the 12 reg.
+    ACC_COL(5, 6, 7, 8, 9, 10)
+    ACC_COL(11, 12, 13, 14, 15, 16)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12
+      ZMM8  ZMM14
+      ZMM10 ZMM16
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha_mul_type), AL)
+    CMP(IMM(0xFF), AL) // Checking if alpha == -1
+    JNE(.ALPHA_GENERAL)
+    // Handling when alpha == -1
+    VXORPD(ZMM(2), ZMM(2), ZMM(2)) // Resetting ZMM(2) to 0
+
+    // Applying alpha == -1 to A*B, one column at a time (NOTE(review): macro defined elsewhere, presumably 0 - reg using ZMM(2))
+    ALPHA_MINUS_ONE(6, 8, 10)
+    ALPHA_MINUS_ONE(12, 14, 16)
+    JMP(.BETA_SCALE)
+
+    LABEL(.ALPHA_GENERAL)
+    CMP(IMM(2), AL) // Checking if alpha == BLIS_MUL_DEFAULT
+    JNE(.BETA_SCALE) // Neither -1 nor general: A*B is left unscaled
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8, 10)
+    ALPHA_GENERIC(12, 14, 16)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 bytes means cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+    CMP(IMM(0x01), AL) // Checking if beta == 1
+    JE(.ADD)
+    CMP(IMM(0xFF), AL) // Checking if beta == -1
+    JNE(.BETA_GENERAL)
+
+    // Subtracting C from alpha*A*B, one column at a time
+    BETA_MINUS_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)      // Advance RCX to the next column of C
+    BETA_MINUS_ONE(RCX, 11, 12, 13, 14, 15, 16)
+    JMP(.END)
+
+    LABEL(.BETA_GENERAL) // Handling a general beta (BLIS_MUL_DEFAULT)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)      // Advance RCX to the next column of C
+    BETA_GENERIC(RCX, 11, 12, 13, 14, 15, 16)
+    JMP(.END)
+
+    // Handling when beta == 1
+    LABEL(.ADD)
+    // Adding C to alpha*A*B, one column at a time
+    BETA_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    ADD(RSI, RCX)      // Advance RCX to the next column of C
+    BETA_ONE(RCX, 11, 12, 13, 14, 15, 16)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))               // Column 0 of C
+    VMOVUPD(ZMM(8), MEM(RCX, 64))
+    VMOVUPD(ZMM(10), MEM(RCX, 128))
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))      // Column 1 of C
+    VMOVUPD(ZMM(14), MEM(RCX, RSI, 1, 64))
+    VMOVUPD(ZMM(16), MEM(RCX, RSI, 1, 128))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 12x4 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    TRANSPOSE_4x4(10, 16, 22, 28)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+
+      ZMM10
+      ZMM16
+      ZMM22
+      ZMM28
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 9, 10, 15, 16, 21, 22, 27, 28)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)                         // R9 = RCX + 3*RDI (row 3 of the tile)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))            // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))   // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))   // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4) MASK_(k(3)))    // row 4
+    VMOVUPD(ZMM(10), MEM(RCX, RDI, 8) MASK_(k(3)))   // row 8
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)                       // RCX now addresses row 6
+    VMOVUPD(ZMM(24), MEM(R9) MASK_(k(3)))            // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2) MASK_(k(3)))    // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4) MASK_(k(3)))    // row 7
+    VMOVUPD(ZMM(28), MEM(R9, RDI, 8) MASK_(k(3)))    // row 11
+
+    LEA(MEM(R9, RDI, 4), R9)
+    LEA(MEM(R9, RDI, 2), R9)                         // R9 now addresses row 9
+    VMOVUPD(ZMM(20), MEM(RCX) MASK_(k(3)))           // row 6
+    VMOVUPD(ZMM(22), MEM(RCX, RDI, 4) MASK_(k(3)))   // row 10
+
+    VMOVUPD(ZMM(16), MEM(R9) MASK_(k(3)))            // row 9
+
+    LABEL(.END)
+    /*
+      Adjusting the addresses for loading the
+      next micro panel from A and the next micro
+      tile from C.
+    */
+    MOV(VAR(ps_a16), RBX)
+    ADD(RBX, R10)                // A advances by one panel stride (bytes)
+    LEA(MEM(R12, RDI, 8), R12)
+    LEA(MEM(R12, RDI, 4), R12)   // C advances by 12 rows (8*RDI + 4*RDI)
+
+    DEC(R11)
+    JNE(.ZMLOOP)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [m_iter]  "m" (m_iter),
+      [m_left]  "m" (m_left),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [alpha_mul_type]   "m" (alpha_mul_type),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [ps_a16]   "m" (ps_a16),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+    // Handle edge cases in the m dimension, if they exist (peeled as 8, 4, 2, then 1 rows).
+    if ( m_left )
+    {
+      const dim_t      i_edge = m0 - ( dim_t )m_left; // First row not covered by the asm loop
+
+      dcomplex* restrict cij = c + i_edge * rs_c;
+      dcomplex* restrict ai  = a + m_iter * ps_a;
+      dcomplex* restrict bj  = b;
+
+      if (8 <= m_left)
+      {
+        const dim_t      mr_cur = 8;
+        bli_zgemmsup_cv_zen4_asm_8x2(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (4 <= m_left)
+      {
+        const dim_t      mr_cur = 4;
+        bli_zgemmsup_cv_zen4_asm_4x2(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (2 <= m_left)
+      {
+        const dim_t      mr_cur = 2;
+        bli_zgemmsup_cv_zen4_asm_2x2(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if ( 1 == m_left ) // Single leftover row: delegated to bli_zgemv_ex with B passed as BLIS_TRANSPOSE
+      {
+        bli_zgemv_ex
+        (
+          BLIS_TRANSPOSE, conja, k0, n0,
+          alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+          beta, cij, cs_c0, cntx, NULL
+        );
+      }
+    }
+}
+
+void bli_zgemmsup_cv_zen4_asm_12x1m
+     (
+       conj_t       conja,  // forwarded to the edge-case kernels and bli_zgemv_ex; not an asm operand
+       conj_t       conjb,  // forwarded to the edge-case kernels only; not an asm operand
+       dim_t        m0,     // number of rows of C to compute
+       dim_t        n0,     // forwarded to the edge-case paths; NOTE(review): presumably 1 here - confirm with caller
+       dim_t        k0,     // inner (k) dimension
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data, // supplies the panel stride of A (bli_auxinfo_ps_a)
+       cntx_t*    restrict cntx  // forwarded to the edge-case paths
+     )
+{
+    // Computes a 1-column-wide slab of C (C := beta*C + alpha*A*B), MR rows per asm iteration.
+    // Per the original author's note it is invoked at the beginning of 12x4m when n_left == 1.
+    uint64_t rs_a   = rs_a0; // not an asm operand; only used by the C edge-case code below
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Panel stride of A in elements (taken from auxinfo), and the same stride in bytes.
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a16  = ps_a * sizeof( dcomplex );
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+    uint64_t m_iter = m0 / MR; // To be used for MR loop in the kernel
+    uint64_t m_left = m0 % MR; // Leftover rows, handled in C after the asm block
+
+    if ( m_iter == 0 ) goto consider_edge_cases; // Fewer than MR rows: skip the asm block
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x1 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x03; // mask for transposing and loading = 0b 00 00 00 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 48 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of alpha and beta scaling
+    // In order to facilitate handling special cases separately
+    char alpha_mul_type = BLIS_MUL_DEFAULT;
+    char beta_mul_type  = BLIS_MUL_DEFAULT;
+
+    if(alpha->imag == 0.0)// (alpha is real)
+    {
+        if(alpha->real == 1.0)          alpha_mul_type = BLIS_MUL_ONE;
+        else if(alpha->real == -1.0)    alpha_mul_type = BLIS_MUL_MINUS_ONE;
+    }
+
+    if(beta->imag == 0.0)// (beta is real)
+    {
+        if(beta->real == 1.0)       beta_mul_type = BLIS_MUL_ONE;
+        else if(beta->real == -1.0) beta_mul_type = BLIS_MUL_MINUS_ONE;
+        else if(beta->real == 0.0)  beta_mul_type = BLIS_MUL_ZERO;
+    }
+
+    BEGIN_ASM()
+    MOV(VAR(a), R10)          // R10 = base addr of A (MCXKC block)
+    MOV(VAR(c), R12)          // R12 = base addr of C (MCxNR block)
+
+    MOV(VAR(ps_a16), R11)
+    LEA(MEM(, R11, 8), R11)
+    LEA(MEM(, R11, 2), R11)   // R11 = 16*ps_a16; never read, R11 is reloaded with m_iter before the M loop
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX)
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(VAR(m_iter), R11) // Iterating in steps of MR, until MC(m var)
+    LABEL(.ZMLOOP)
+    MOV(R10, RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)  // RBX = addr of B for the KCxNR block
+    MOV(R12, RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(var(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)       // k0 < 4: go straight to the remainder loop
+    // Main loop for k
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_12x1
+    MICRO_TILE_12x1
+    MICRO_TILE_12x1
+    MICRO_TILE_12x1
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)   // k0 is a multiple of 4: no remainder iterations
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_12x1
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 3 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8, 10)
+
+    // Final accumulation for A*B on 3 reg using the 6 reg.
+    ACC_COL(5, 6, 7, 8, 9, 10)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6
+      ZMM8
+      ZMM10
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha_mul_type), AL)
+    CMP(IMM(0xFF), AL) // Checking if alpha == -1
+    JNE(.ALPHA_GENERAL)
+    // Handling when alpha == -1
+    VXORPD(ZMM(2), ZMM(2), ZMM(2)) // Resetting ZMM(2) to 0
+
+    // Applying alpha == -1 to A*B (NOTE(review): macro defined elsewhere, presumably 0 - reg using ZMM(2))
+    ALPHA_MINUS_ONE(6, 8, 10)
+    JMP(.BETA_SCALE)
+
+    LABEL(.ALPHA_GENERAL)
+    CMP(IMM(2), AL) // Checking if alpha == BLIS_MUL_DEFAULT
+    JNE(.BETA_SCALE) // Neither -1 nor general: A*B is left unscaled
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8, 10)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 bytes means cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+    CMP(IMM(0x01), AL) // Checking if beta == 1
+    JE(.ADD)
+    CMP(IMM(0xFF), AL) // Checking if beta == -1
+    JNE(.BETA_GENERAL)
+
+    // Subtracting C from alpha*A*B, one column at a time
+    BETA_MINUS_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    JMP(.END)
+
+    LABEL(.BETA_GENERAL) // Handling a general beta (BLIS_MUL_DEFAULT)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8, 9, 10)
+    JMP(.END)
+
+    // Handling when beta == 1
+    LABEL(.ADD)
+    // Adding C to alpha*A*B, one column at a time
+    BETA_ONE(RCX, 5, 6, 7, 8, 9, 10)
+    JMP(.END) // Do not fall through into the beta == 0 store (matches the .ADD path of the 12x2m kernel)
+
+    LABEL(.STORE) // Handling when beta == 0
+    VMOVUPD(ZMM(6), MEM(RCX))
+    VMOVUPD(ZMM(8), MEM(RCX, 64))
+    VMOVUPD(ZMM(10), MEM(RCX, 128))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 12x4 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    TRANSPOSE_4x4(10, 16, 22, 28)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+
+      ZMM10
+      ZMM16
+      ZMM22
+      ZMM28
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 9, 10, 15, 16, 21, 22, 27, 28)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)                         // R9 = RCX + 3*RDI (row 3 of the tile)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))            // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))   // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))   // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4) MASK_(k(3)))    // row 4
+    VMOVUPD(ZMM(10), MEM(RCX, RDI, 8) MASK_(k(3)))   // row 8
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)                       // RCX now addresses row 6
+    VMOVUPD(ZMM(24), MEM(R9) MASK_(k(3)))            // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2) MASK_(k(3)))    // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4) MASK_(k(3)))    // row 7
+    VMOVUPD(ZMM(28), MEM(R9, RDI, 8) MASK_(k(3)))    // row 11
+
+    LEA(MEM(R9, RDI, 4), R9)
+    LEA(MEM(R9, RDI, 2), R9)                         // R9 now addresses row 9
+    VMOVUPD(ZMM(20), MEM(RCX) MASK_(k(3)))           // row 6
+    VMOVUPD(ZMM(22), MEM(RCX, RDI, 4) MASK_(k(3)))   // row 10
+
+    VMOVUPD(ZMM(16), MEM(R9) MASK_(k(3)))            // row 9
+
+    LABEL(.END)
+    /*
+      Adjusting the addresses for loading the
+      next micro panel from A and the next micro
+      tile from C.
+    */
+    MOV(VAR(ps_a16), RBX)
+    ADD(RBX, R10)                // A advances by one panel stride (bytes)
+    LEA(MEM(R12, RDI, 8), R12)
+    LEA(MEM(R12, RDI, 4), R12)   // C advances by 12 rows (8*RDI + 4*RDI)
+
+    DEC(R11)
+    JNE(.ZMLOOP)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [m_iter]  "m" (m_iter),
+      [m_left]  "m" (m_left),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [alpha_mul_type]   "m" (alpha_mul_type),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [ps_a16]   "m" (ps_a16),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:;
+    // Handle edge cases in the m dimension, if they exist (peeled as 8, 4, 2, then 1 rows).
+    if ( m_left )
+    {
+      const dim_t      i_edge = m0 - ( dim_t )m_left; // First row not covered by the asm loop
+
+      dcomplex* restrict cij = c + i_edge * rs_c;
+      dcomplex* restrict ai  = a + m_iter * ps_a;
+      dcomplex* restrict bj  = b;
+
+      if (8 <= m_left)
+      {
+        const dim_t      mr_cur = 8;
+        bli_zgemmsup_cv_zen4_asm_8x1(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (4 <= m_left)
+      {
+        const dim_t      mr_cur = 4;
+        bli_zgemmsup_cv_zen4_asm_4x1(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if (2 <= m_left)
+      {
+        const dim_t      mr_cur = 2;
+        bli_zgemmsup_cv_zen4_asm_2x1(conja, conjb, mr_cur, n0, k0, alpha,
+                                      ai, rs_a0, cs_a0,
+                                      bj, rs_b0, cs_b0,
+                                      beta,
+                                      cij, rs_c0, cs_c0,
+                                      data, cntx);
+        cij += mr_cur * rs_c; ai += mr_cur * rs_a;
+        m_left -= mr_cur;
+      }
+      if ( 1 == m_left ) // Single leftover row: delegated to bli_zgemv_ex with B passed as BLIS_TRANSPOSE
+      {
+        bli_zgemv_ex
+        (
+          BLIS_TRANSPOSE, conja, k0, n0,
+          alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+          beta, cij, cs_c0, cntx, NULL
+        );
+      }
+    }
+}
+
+void bli_zgemmsup_cv_zen4_asm_8x4
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Fixed 8x4 dcomplex tile kernel; strides are copied to uint64_t locals used as asm "m" operands.
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Number of passes through the 4x-unrolled k loop
+    uint64_t k_left = k0 % 4; // Remaining k iterations, handled one at a time
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // Assigning the type of beta scaling; when beta == 0 the asm below skips the beta-scaling macros and stores directly
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // Skip the unrolled loop when k0 < 4
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_8x4
+    MICRO_TILE_8x4
+    MICRO_TILE_8x4
+    MICRO_TILE_8x4
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // No remainder iterations when k0 is a multiple of 4
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_8x4
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 8 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8)
+    PERMUTE(12, 14)
+    PERMUTE(18, 20)
+    PERMUTE(24, 26)
+
+    // Final accumulation for A*B on 8 reg using the 16 reg.
+    ACC_COL(5, 6, 7, 8)
+    ACC_COL(11, 12, 13, 14)
+    ACC_COL(17, 18, 19, 20)
+    ACC_COL(23, 24, 25, 26)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12  ZMM18  ZMM24
+      ZMM8  ZMM14  ZMM20  ZMM26
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8)
+    ALPHA_GENERIC(12, 14)
+    ALPHA_GENERIC(18, 20)
+    ALPHA_GENERIC(24, 26)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1, which is treated as row-stored C
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 11, 12, 13, 14)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 17, 18, 19, 20)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 23, 24, 25, 26)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    LEA(MEM(RCX, RSI, 2), R9)              // R9 = addr of column 2 of C
+    VMOVUPD(ZMM(6), MEM(RCX))              // column 0, first 64 bytes
+    VMOVUPD(ZMM(8), MEM(RCX, 64))          // column 0, next 64 bytes
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))     // column 1
+    VMOVUPD(ZMM(14), MEM(RCX, RSI, 1, 64))
+
+    VMOVUPD(ZMM(18), MEM(R9))              // column 2
+    VMOVUPD(ZMM(20), MEM(R9, 64))
+
+    VMOVUPD(ZMM(24), MEM(R9, RSI, 1))      // column 3
+    VMOVUPD(ZMM(26), MEM(R9, RSI, 1, 64))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 8x4 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_4x4(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_4x4(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)               // R9 = RCX + 3*RDI (row 3 of C)
+    VMOVUPD(ZMM(6), MEM(RCX))              // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1))     // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2))     // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4))      // row 4
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)             // RCX advanced by 6*RDI (row 6 of C)
+    VMOVUPD(ZMM(24), MEM(R9))              // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2))      // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4))      // row 7
+
+    VMOVUPD(ZMM(20), MEM(RCX))             // row 6
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_8x3
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Fixed 8x3 dcomplex tile kernel; strides are copied to uint64_t locals used as asm "m" operands.
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Number of passes through the 4x-unrolled k loop
+    uint64_t k_left = k0 % 4; // Remaining k iterations, handled one at a time
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x3 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x3F; // mask for transposing and loading = 0b 00 11 11 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 16 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of beta scaling; when beta == 0 the asm below skips the beta-scaling macros and stores directly
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX)
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // Skip the unrolled loop when k0 < 4
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_8x3
+    MICRO_TILE_8x3
+    MICRO_TILE_8x3
+    MICRO_TILE_8x3
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // No remainder iterations when k0 is a multiple of 4
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_8x3
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 6 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8)
+    PERMUTE(12, 14)
+    PERMUTE(18, 20)
+
+    // Final accumulation for A*B on 6 reg using the 12 reg.
+    ACC_COL(5, 6, 7, 8)
+    ACC_COL(11, 12, 13, 14)
+    ACC_COL(17, 18, 19, 20)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12  ZMM18
+      ZMM8  ZMM14  ZMM20
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8)
+    ALPHA_GENERIC(12, 14)
+    ALPHA_GENERIC(18, 20)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1, which is treated as row-stored C
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 11, 12, 13, 14)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 17, 18, 19, 20)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))              // column 0, first 64 bytes
+    VMOVUPD(ZMM(8), MEM(RCX, 64))          // column 0, next 64 bytes
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))     // column 1
+    VMOVUPD(ZMM(14), MEM(RCX, RSI, 1, 64))
+
+    VMOVUPD(ZMM(18), MEM(RCX, RSI, 2))     // column 2
+    VMOVUPD(ZMM(20), MEM(RCX, RSI, 2, 64))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 8x3 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    JMP(.END)
+
+    // Handling when beta == 0; every store is masked by k(3) so only the low 6 doubles of each row are written
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)                          // R9 = RCX + 3*RDI (row 3 of C)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))             // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))    // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))    // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4) MASK_(k(3)))     // row 4
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)                        // RCX advanced by 6*RDI (row 6 of C)
+    VMOVUPD(ZMM(24), MEM(R9) MASK_(k(3)))             // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2) MASK_(k(3)))     // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4) MASK_(k(3)))     // row 7
+
+    VMOVUPD(ZMM(20), MEM(RCX) MASK_(k(3)))            // row 6
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list (k3 is written by KMOVW above, so it must be listed)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "k3", "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_8x2
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Fixed 8x2 dcomplex tile kernel; strides are copied to uint64_t locals used as asm "m" operands.
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Number of passes through the 4x-unrolled k loop
+    uint64_t k_left = k0 % 4; // Remaining k iterations, handled one at a time
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x2 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x0F; // mask for transposing and loading = 0b 00 00 11 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 32 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of beta scaling; when beta == 0 the asm below skips the beta-scaling macros and stores directly
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX) // 32-bit load, same as the 8x3/8x1 kernels; the mask fits in the low byte
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // Skip the unrolled loop when k0 < 4
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_8x2
+    MICRO_TILE_8x2
+    MICRO_TILE_8x2
+    MICRO_TILE_8x2
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // No remainder iterations when k0 is a multiple of 4
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_8x2
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 4 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8)
+    PERMUTE(12, 14)
+
+    // Final accumulation for A*B on 4 reg using the 8 reg.
+    ACC_COL(5, 6, 7, 8)
+    ACC_COL(11, 12, 13, 14)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12
+      ZMM8  ZMM14
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8)
+    ALPHA_GENERIC(12, 14)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1, which is treated as row-stored C
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 11, 12, 13, 14)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))              // column 0, first 64 bytes
+    VMOVUPD(ZMM(8), MEM(RCX, 64))          // column 0, next 64 bytes
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))     // column 1
+    VMOVUPD(ZMM(14), MEM(RCX, RSI, 1, 64))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 8x2 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    JMP(.END)
+
+    // Handling when beta == 0; every store is masked by k(3) so only the low 4 doubles of each row are written
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)                          // R9 = RCX + 3*RDI (row 3 of C)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))             // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))    // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))    // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4) MASK_(k(3)))     // row 4
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)                        // RCX advanced by 6*RDI (row 6 of C)
+    VMOVUPD(ZMM(24), MEM(R9) MASK_(k(3)))             // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2) MASK_(k(3)))     // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4) MASK_(k(3)))     // row 7
+
+    VMOVUPD(ZMM(20), MEM(RCX) MASK_(k(3)))            // row 6
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list (k3 is written by KMOVW above, so it must be listed)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "k3", "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_8x1
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Fixed 8x1 dcomplex tile kernel; strides are copied to uint64_t locals used as asm "m" operands.
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Number of passes through the 4x-unrolled k loop
+    uint64_t k_left = k0 % 4; // Remaining k iterations, handled one at a time
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x1 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x03; // mask for transposing and loading = 0b 00 00 00 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 48 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of beta scaling; when beta == 0 the asm below skips the beta-scaling macros and stores directly
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX)
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // Skip the unrolled loop when k0 < 4
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_8x1
+    MICRO_TILE_8x1
+    MICRO_TILE_8x1
+    MICRO_TILE_8x1
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // No remainder iterations when k0 is a multiple of 4
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_8x1
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 2 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6, 8)
+
+    // Final accumulation for A*B on 2 reg using the 4 reg.
+    ACC_COL(5, 6, 7, 8)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6
+      ZMM8
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6, 8)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1, which is treated as row-stored C
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6, 7, 8)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))              // column 0, first 64 bytes
+    VMOVUPD(ZMM(8), MEM(RCX, 64))          // column 0, next 64 bytes
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 8x1 micro-tile
+      in blocks of 4x4.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    TRANSPOSE_4x4(8, 14, 20, 26)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+
+      ZMM8
+      ZMM14
+      ZMM20
+      ZMM26
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    LEA(MEM(RCX, RDI, 2), RCX)
+    LEA(MEM(R9, RDI, 2), R9)
+    BETA_GEN_ROW_MASK(R9, 7, 8, 13, 14, 19, 20, 25, 26)
+    JMP(.END)
+
+    // Handling when beta == 0; every store is masked by k(3) so only the low 2 doubles of each row are written
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)
+    LEA(MEM(R9, RDI, 1), R9)                          // R9 = RCX + 3*RDI (row 3 of C)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))             // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))    // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))    // row 2
+    VMOVUPD(ZMM(8), MEM(RCX, RDI, 4) MASK_(k(3)))     // row 4
+
+    LEA(MEM(RCX, RDI, 4), RCX)
+    LEA(MEM(RCX, RDI, 2), RCX)                        // RCX advanced by 6*RDI (row 6 of C)
+    VMOVUPD(ZMM(24), MEM(R9) MASK_(k(3)))             // row 3
+    VMOVUPD(ZMM(14), MEM(R9, RDI, 2) MASK_(k(3)))     // row 5
+    VMOVUPD(ZMM(26), MEM(R9, RDI, 4) MASK_(k(3)))     // row 7
+
+    VMOVUPD(ZMM(20), MEM(RCX) MASK_(k(3)))            // row 6
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list (k3 is written by KMOVW above, so it must be listed)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "k3", "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_4x4
+     (
+       conj_t       conja,
+       conj_t       conjb,
+       dim_t        m0,
+       dim_t        n0,
+       dim_t        k0,
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // Fixed 4x4 dcomplex tile kernel; strides are copied to uint64_t locals used as asm "m" operands.
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Number of passes through the 4x-unrolled k loop
+    uint64_t k_left = k0 % 4; // Remaining k iterations, handled one at a time
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // Assigning the type of beta scaling; when beta == 0 the asm below skips the beta-scaling macros and stores directly
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // Skip the unrolled loop when k0 < 4
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_4x4
+    MICRO_TILE_4x4
+    MICRO_TILE_4x4
+    MICRO_TILE_4x4
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // No remainder iterations when k0 is a multiple of 4
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_4x4
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 4 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6)
+    PERMUTE(12)
+    PERMUTE(18)
+    PERMUTE(24)
+
+    // Final accumulation for A*B on 4 reg using the 8 reg.
+    ACC_COL(5, 6)
+    ACC_COL(11, 12)
+    ACC_COL(17, 18)
+    ACC_COL(23, 24)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12  ZMM18  ZMM24
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6)
+    ALPHA_GENERIC(12)
+    ALPHA_GENERIC(18)
+    ALPHA_GENERIC(24)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1, which is treated as row-stored C
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 11, 12)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 17, 18)
+    ADD(RSI, RCX)
+    BETA_GENERIC(RCX, 23, 24)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    LEA(MEM(RCX, RSI, 2), R9)              // R9 = addr of column 2 of C
+    VMOVUPD(ZMM(6), MEM(RCX))              // column 0
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))     // column 1
+
+    VMOVUPD(ZMM(18), MEM(R9))              // column 2
+
+    VMOVUPD(ZMM(24), MEM(R9, RSI, 1))      // column 3
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      In-register transposition happens over the 4x4 micro-tile
+      as a single 4x4 block.
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    /*
+      The layout post transposition and accumulation is as follows:
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 — TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_4x4(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)              // R9 = RCX + 2*RDI (row 2 of C)
+    VMOVUPD(ZMM(6), MEM(RCX))              // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1))     // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2))     // row 2
+    VMOVUPD(ZMM(24), MEM(R9, RDI, 1))      // row 3
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_4x3
+     (
+       conj_t       conja,  // not referenced in this function body
+       conj_t       conjb,  // not referenced in this function body
+       dim_t        m0,     // not referenced in this function body
+       dim_t        n0,     // not referenced in this function body
+       dim_t        k0,     // inner dimension; split below into k0/4 unrolled passes plus k0%4 remainder
+       dcomplex*    restrict alpha, // scalar applied to the accumulated A*B
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0, // A block; only cs_a0 is read here
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0, // B block and its row/column strides
+       dcomplex*    restrict beta,  // scalar for C; an exact 0+0i selects the store-only paths
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0, // C tile; cs_c0 == 1 selects the row-stored path
+       auxinfo_t* restrict data,    // not referenced in this function body
+       cntx_t*    restrict cntx     // not referenced in this function body
+     )
+{
+    // 4x3 dcomplex kernel: accumulate A*B, scale by alpha, then beta-merge into C (or plain store when beta == 0)
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x3 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x3F; // mask for transposing and loading = 0b 00 11 11 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 16 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of beta scaling for enabling loading of C
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX) // 32-bit load; only the low 16 bits are consumed by KMOVW
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // k0 < 4: skip the unrolled loop
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_4x3
+    MICRO_TILE_4x3
+    MICRO_TILE_4x3
+    MICRO_TILE_4x3
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // k0 % 4 == 0: no remainder iterations
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_4x3
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 3 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6)
+    PERMUTE(12)
+    PERMUTE(18)
+
+    // Final accumulation for A*B on 3 reg using the 6 reg.
+    ACC_COL(5, 6)
+    ACC_COL(11, 12)
+    ACC_COL(17, 18)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12  ZMM18
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6)
+    ALPHA_GENERIC(12)
+    ALPHA_GENERIC(18)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 <=> cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    MOV(VAR(beta_mul_type), AL) // Column-stored C from here on
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6)
+    ADD(RSI, RCX)      // advance to the next column of C
+    BETA_GENERIC(RCX, 11, 12)
+    ADD(RSI, RCX)      // advance to the next column of C
+    BETA_GENERIC(RCX, 17, 18)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))           // column 0
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))  // column 1
+
+    VMOVUPD(ZMM(18), MEM(RCX, RSI, 2))  // column 2
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      The 4x3 result is transposed in-register as a single 4x4 block;
+      ZMM24 is the unused 4th column (presumably zeroed by RESET_REGISTERS - TODO confirm).
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    /*
+      The layout post transposition and accumulation is as follows
+      (one register per row of C; the k(3) mask keeps 3 dcomplex per row):
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)       // R9 = working copy of the C pointer
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)                       // R9 = C + 2*rs_c (bytes)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))           // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))  // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))  // row 2
+    VMOVUPD(ZMM(24), MEM(R9, RDI, 1) MASK_(k(3)))   // row 3
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [trans_load_mask] "m" (trans_load_mask),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "k3", "memory" // k3 is overwritten by the KMOVW above, so it must be declared clobbered
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_4x2
+     (
+       conj_t       conja,  // not referenced in this function body
+       conj_t       conjb,  // not referenced in this function body
+       dim_t        m0,     // not referenced in this function body
+       dim_t        n0,     // not referenced in this function body
+       dim_t        k0,     // inner dimension; split below into k0/4 unrolled passes plus k0%4 remainder
+       dcomplex*    restrict alpha, // scalar applied to the accumulated A*B
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0, // A block; only cs_a0 is read here
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0, // B block and its row/column strides
+       dcomplex*    restrict beta,  // scalar for C; an exact 0+0i selects the store-only paths
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0, // C tile; cs_c0 == 1 selects the row-stored path
+       auxinfo_t* restrict data,    // not referenced in this function body
+       cntx_t*    restrict cntx     // not referenced in this function body
+     )
+{
+    // 4x2 dcomplex kernel: accumulate A*B, scale by alpha, then beta-merge into C (or plain store when beta == 0)
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x2 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x0F; // mask for transposing and loading = 0b 00 00 11 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 32 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of beta scaling for enabling loading of C
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX) // 32-bit load (as in the 4x3/4x1 kernels); KMOVW reads the low 16 bits
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // k0 < 4: skip the unrolled loop
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_4x2
+    MICRO_TILE_4x2
+    MICRO_TILE_4x2
+    MICRO_TILE_4x2
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // k0 % 4 == 0: no remainder iterations
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_4x2
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 2 registers
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6)
+    PERMUTE(12)
+
+    // Final accumulation for A*B on 2 reg using the 4 reg.
+    ACC_COL(5, 6)
+    ACC_COL(11, 12)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6  ZMM12
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6)
+    ALPHA_GENERIC(12)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 <=> cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6)
+    ADD(RSI, RCX)      // advance to the next column of C
+    BETA_GENERIC(RCX, 11, 12)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))           // column 0
+
+    VMOVUPD(ZMM(12), MEM(RCX, RSI, 1))  // column 1
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      The 4x2 result is transposed in-register as a single 4x4 block;
+      ZMM18/ZMM24 are the unused columns (presumably zeroed by RESET_REGISTERS - TODO confirm).
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    /*
+      The layout post transposition and accumulation is as follows
+      (one register per row of C; the k(3) mask keeps 2 dcomplex per row):
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)       // R9 = working copy of the C pointer
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)                       // R9 = C + 2*rs_c (bytes)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))           // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))  // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))  // row 2
+    VMOVUPD(ZMM(24), MEM(R9, RDI, 1) MASK_(k(3)))   // row 3
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [trans_load_mask] "m" (trans_load_mask),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "k3", "memory" // k3 is overwritten by the KMOVW above, so it must be declared clobbered
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_4x1
+     (
+       conj_t       conja,  // not referenced in this function body
+       conj_t       conjb,  // not referenced in this function body
+       dim_t        m0,     // not referenced in this function body
+       dim_t        n0,     // not referenced in this function body
+       dim_t        k0,     // inner dimension; split below into k0/4 unrolled passes plus k0%4 remainder
+       dcomplex*    restrict alpha, // scalar applied to the accumulated A*B
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0, // A block; only cs_a0 is read here
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0, // B block and its row/column strides
+       dcomplex*    restrict beta,  // scalar for C; an exact 0+0i selects the store-only paths
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0, // C tile; cs_c0 == 1 selects the row-stored path
+       auxinfo_t* restrict data,    // not referenced in this function body
+       cntx_t*    restrict cntx     // not referenced in this function body
+     )
+{
+    // 4x1 dcomplex kernel: accumulate A*B, scale by alpha, then beta-merge into C (or plain store when beta == 0)
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    /*
+      The mask bits below are set for ensuring ?x1 compatibility
+      while transposing, and loading/storing C in case of row-storage(k(3) opmask register).
+      Mask is of length 8-bits, since a ZMM register holds 8 double precision elements.
+    */
+    uint64_t trans_load_mask = 0x03; // mask for transposing and loading = 0b 00 00 00 11
+    /*
+      This mask ensures that the ZMM registers disregard the last 48 bytes while
+      using masked load/stores or FMA operations.
+    */
+
+    // Assigning the type of beta scaling for enabling loading of C
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    MOV(VAR(trans_load_mask), EAX) // 32-bit load; only the low 16 bits are consumed by KMOVW
+    KMOVW(EAX, k(3))               // k(3) = trans_load_mask
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), ZMM(29)) // Broadcasting 1.0 over ZMM(29)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Resetting all scratch registers
+    RESET_REGISTERS
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // k0 < 4: skip the unrolled loop
+    LABEL(.ZKITERMAIN)
+
+    MICRO_TILE_4x1
+    MICRO_TILE_4x1
+    MICRO_TILE_4x1
+    MICRO_TILE_4x1
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // k0 % 4 == 0: no remainder iterations
+    LABEL(.ZKLEFTLOOP)
+
+    MICRO_TILE_4x1
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 1 register
+    // Shuffling the registers FMAed with imaginary components in B.
+    PERMUTE(6)
+
+    // Final accumulation for A*B on 1 reg using the 2 reg.
+    ACC_COL(5, 6)
+
+    // A*B is accumulated over the ZMM registers as follows :
+    /*
+      ZMM6
+    */
+
+    // Alpha scaling
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), ZMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), ZMM(1)) // Alpha->imag
+
+    ALPHA_GENERIC(6)
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 <=> cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    // Scaling C with beta, one column at a time
+    BETA_GENERIC(RCX, 5, 6)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE)
+    VMOVUPD(ZMM(6), MEM(RCX))           // the single column of C
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    /*
+      The 4x1 result is transposed in-register as a single 4x4 block;
+      ZMM12/ZMM18/ZMM24 are unused columns (presumably zeroed by RESET_REGISTERS - TODO confirm).
+    */
+    TRANSPOSE_4x4(6, 12, 18, 24)
+    /*
+      The layout post transposition and accumulation is as follows
+      (one register per row of C; the k(3) mask keeps 1 dcomplex per row):
+      ZMM6
+      ZMM12
+      ZMM18
+      ZMM24
+    */
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)       // R9 = working copy of the C pointer
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), ZMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), ZMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_MASK(R9, 5, 6, 11, 12, 17, 18, 23, 24)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    LEA(MEM(RCX, RDI, 2), R9)                       // R9 = C + 2*rs_c (bytes)
+    VMOVUPD(ZMM(6), MEM(RCX) MASK_(k(3)))           // row 0
+    VMOVUPD(ZMM(12), MEM(RCX, RDI, 1) MASK_(k(3)))  // row 1
+    VMOVUPD(ZMM(18), MEM(RCX, RDI, 2) MASK_(k(3)))  // row 2
+    VMOVUPD(ZMM(24), MEM(R9, RDI, 1) MASK_(k(3)))   // row 3
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [trans_load_mask] "m" (trans_load_mask),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7",
+      "zmm8", "zmm9", "zmm10", "zmm11",
+      "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23",
+      "zmm24", "zmm25", "zmm26", "zmm27",
+      "zmm28", "zmm29", "zmm30", "zmm31",
+      "k3", "memory" // k3 is overwritten by the KMOVW above, so it must be declared clobbered
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_2x4
+     (
+       conj_t       conja,  // not referenced in this function body
+       conj_t       conjb,  // not referenced in this function body
+       dim_t        m0,     // not referenced in this function body
+       dim_t        n0,     // not referenced in this function body
+       dim_t        k0,     // inner dimension; split below into k0/4 unrolled passes plus k0%4 remainder
+       dcomplex*    restrict alpha, // scalar applied to the accumulated A*B
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0, // A block; only cs_a0 is read here
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0, // B block and its row/column strides
+       dcomplex*    restrict beta,  // scalar for C; an exact 0+0i selects the store-only paths
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0, // C tile; cs_c0 == 1 selects the row-stored path
+       auxinfo_t* restrict data,    // not referenced in this function body
+       cntx_t*    restrict cntx     // not referenced in this function body
+     )
+{
+    // 2x4 dcomplex kernel on YMM registers: accumulate A*B, scale by alpha, then beta-merge into C (or plain store when beta == 0)
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // Assigning the type of beta scaling for enabling loading of C
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    // Intermediate register for complex arithmetic
+    MOV(VAR(v), R9)  // Used in fmaddsub instruction
+    VBROADCASTSD(MEM(R9), YMM(2)) // Broadcasting 1.0 over YMM(2); NOTE(review): YMM(2) is not read by any instruction visible here - confirm the macros below need it
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Zeroing the eight accumulators: (YMM5,6) (YMM7,8) (YMM9,10) (YMM11,12), one pair per column of B
+    VXORPD(YMM(5), YMM(5), YMM(5))
+    VXORPD(YMM(6), YMM(6), YMM(6))
+    VXORPD(YMM(7), YMM(7), YMM(7))
+    VXORPD(YMM(8), YMM(8), YMM(8))
+    VXORPD(YMM(9), YMM(9), YMM(9))
+    VXORPD(YMM(10), YMM(10), YMM(10))
+    VXORPD(YMM(11), YMM(11), YMM(11))
+    VXORPD(YMM(12), YMM(12), YMM(12))
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // k0 < 4: skip the unrolled loop
+    LABEL(.ZKITERMAIN)
+
+    /* 2x4 micro-tile, unrolled k step 1 of 4 */
+    VBROADCASTSD(MEM(RBX), YMM(3))      // real part of B column 0
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))   // imag part of B column 0
+    VMOVUPD(MEM(RAX), YMM(0))           // 2 dcomplex of A (32 contiguous bytes)
+    LEA(MEM(RBX, R15, 2), R9)           // R9 = address of B column 2
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))     // real part of B column 1
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))  // imag part of B column 1
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))   // YMM5 += A * B(col 0).real
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))   // YMM6 += A * B(col 0).imag
+    /* Prebroadcasting B on YMM(3) and YMM(4) */
+    VBROADCASTSD(MEM(R9), YMM(3))       // real part of B column 2
+    VBROADCASTSD(MEM(R9, 8), YMM(4))    // imag part of B column 2
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))  // YMM7 += A * B(col 1).real
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))  // YMM8 += A * B(col 1).imag
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(R9, R15, 1), YMM(13))      // real part of B column 3
+    VBROADCASTSD(MEM(R9, R15, 1, 8), YMM(14))   // imag part of B column 3
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))    // YMM9  += A * B(col 2).real
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))   // YMM10 += A * B(col 2).imag
+    VFMADD231PD(YMM(0), YMM(13), YMM(11))  // YMM11 += A * B(col 3).real
+    VFMADD231PD(YMM(0), YMM(14), YMM(12))  // YMM12 += A * B(col 3).imag
+    /* Adjusting addresses for next micro tiles */
+    ADD(R14, RBX)   // B advances by rs_b (bytes)
+    ADD(R13, RAX)   // A advances by cs_a (bytes)
+
+    // ----------------------------------------- //
+
+    /* 2x4 micro-tile, unrolled k step 2 of 4 (same sequence as step 1) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    LEA(MEM(RBX, R15, 2), R9)
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Prebroadcasting B on YMM(3) and YMM(4) */
+    VBROADCASTSD(MEM(R9), YMM(3))
+    VBROADCASTSD(MEM(R9, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(R9, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(R9, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    VFMADD231PD(YMM(0), YMM(13), YMM(11))
+    VFMADD231PD(YMM(0), YMM(14), YMM(12))
+    /* Adjusting addresses for next micro tiles */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* 2x4 micro-tile, unrolled k step 3 of 4 (same sequence as step 1) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    LEA(MEM(RBX, R15, 2), R9)
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Prebroadcasting B on YMM(3) and YMM(4) */
+    VBROADCASTSD(MEM(R9), YMM(3))
+    VBROADCASTSD(MEM(R9, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(R9, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(R9, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    VFMADD231PD(YMM(0), YMM(13), YMM(11))
+    VFMADD231PD(YMM(0), YMM(14), YMM(12))
+    /* Adjusting addresses for next micro tiles */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* 2x4 micro-tile, unrolled k step 4 of 4 (same sequence as step 1) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    LEA(MEM(RBX, R15, 2), R9)
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Prebroadcasting B on YMM(3) and YMM(4) */
+    VBROADCASTSD(MEM(R9), YMM(3))
+    VBROADCASTSD(MEM(R9, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(R9, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(R9, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    VFMADD231PD(YMM(0), YMM(13), YMM(11))
+    VFMADD231PD(YMM(0), YMM(14), YMM(12))
+    /* Adjusting addresses for next micro tiles */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)      // k0 % 4 == 0: no remainder iterations
+    LABEL(.ZKLEFTLOOP)
+
+    /* 2x4 micro-tile, one remainder k step (same sequence as the unrolled steps) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    LEA(MEM(RBX, R15, 2), R9)
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Prebroadcasting B on YMM(3) and YMM(4) */
+    VBROADCASTSD(MEM(R9), YMM(3))
+    VBROADCASTSD(MEM(R9, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Prebroadcasting B on YMM(13) and YMM(14) */
+    VBROADCASTSD(MEM(R9, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(R9, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    VFMADD231PD(YMM(0), YMM(13), YMM(11))
+    VFMADD231PD(YMM(0), YMM(14), YMM(12))
+    /* Adjusting addresses for next micro tiles */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 4 registers
+    // Shuffling the registers FMAed with imaginary components in B (0x5 swaps each adjacent pair of doubles).
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+    VPERMILPD(IMM(0x5), YMM(8), YMM(8))
+    VPERMILPD(IMM(0x5), YMM(10), YMM(10))
+    VPERMILPD(IMM(0x5), YMM(12), YMM(12))
+
+    // Final accumulation for A*B on 4 reg using the 8 reg (even lanes subtract, odd lanes add).
+    VADDSUBPD(YMM(6), YMM(5), YMM(6))
+    VADDSUBPD(YMM(8), YMM(7), YMM(8))
+    VADDSUBPD(YMM(10), YMM(9), YMM(10))
+    VADDSUBPD(YMM(12), YMM(11), YMM(12))
+
+    // A*B is accumulated over the YMM registers as follows :
+    /*
+      YMM6  YMM8  YMM10  YMM12
+    */
+
+    // Alpha scaling: per column, complex multiply by alpha using mul / mul / pair-swap / addsub
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), YMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), YMM(1)) // Alpha->imag
+
+    VMULPD(YMM(0), YMM(6), YMM(15))      // YMM15 = alpha.real * column 0
+    VMULPD(YMM(1), YMM(6), YMM(6))       // YMM6  = alpha.imag * column 0
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))  // swap real/imag slots of the alpha.imag products
+    VADDSUBPD(YMM(6), YMM(15), YMM(6))   // YMM6  = alpha * column 0
+
+    VMULPD(YMM(0), YMM(8), YMM(15))      // column 1, same sequence
+    VMULPD(YMM(1), YMM(8), YMM(8))
+    VPERMILPD(IMM(0x5), YMM(8), YMM(8))
+    VADDSUBPD(YMM(8), YMM(15), YMM(8))
+
+    VMULPD(YMM(0), YMM(10), YMM(15))     // column 2, same sequence
+    VMULPD(YMM(1), YMM(10), YMM(10))
+    VPERMILPD(IMM(0x5), YMM(10), YMM(10))
+    VADDSUBPD(YMM(10), YMM(15), YMM(10))
+
+    VMULPD(YMM(0), YMM(12), YMM(15))     // column 3, same sequence
+    VMULPD(YMM(1), YMM(12), YMM(12))
+    VPERMILPD(IMM(0x5), YMM(12), YMM(12))
+    VADDSUBPD(YMM(12), YMM(15), YMM(12))
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C
+    CMP(IMM(16), RSI)   // RSI == 16 <=> cs_c == 1
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))  // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    VMOVUPD(MEM(RCX), YMM(5))            // load column 0 of C
+    VMULPD(YMM(0), YMM(5), YMM(15))      // YMM15 = beta.real * C
+    VMULPD(YMM(1), YMM(5), YMM(5))       // YMM5  = beta.imag * C
+    VPERMILPD(IMM(0x5), YMM(5), YMM(5))  // swap real/imag slots
+    VADDSUBPD(YMM(5), YMM(15), YMM(5))   // YMM5  = beta * C
+    VADDPD(YMM(5), YMM(6), YMM(6))       // YMM6  = alpha*A*B + beta*C
+    VMOVUPD(YMM(6), MEM(RCX))            // write column 0 back
+    ADD(RSI, RCX)                        // advance to the next column of C
+
+    VMOVUPD(MEM(RCX), YMM(7))            // column 1, same sequence
+    VMULPD(YMM(0), YMM(7), YMM(15))
+    VMULPD(YMM(1), YMM(7), YMM(7))
+    VPERMILPD(IMM(0x5), YMM(7), YMM(7))
+    VADDSUBPD(YMM(7), YMM(15), YMM(7))
+    VADDPD(YMM(7), YMM(8), YMM(8))
+    VMOVUPD(YMM(8), MEM(RCX))
+    ADD(RSI, RCX)
+
+    VMOVUPD(MEM(RCX), YMM(9))            // column 2, same sequence
+    VMULPD(YMM(0), YMM(9), YMM(15))
+    VMULPD(YMM(1), YMM(9), YMM(9))
+    VPERMILPD(IMM(0x5), YMM(9), YMM(9))
+    VADDSUBPD(YMM(9), YMM(15), YMM(9))
+    VADDPD(YMM(9), YMM(10), YMM(10))
+    VMOVUPD(YMM(10), MEM(RCX))
+    ADD(RSI, RCX)
+
+    VMOVUPD(MEM(RCX), YMM(11))           // column 3, same sequence
+    VMULPD(YMM(0), YMM(11), YMM(15))
+    VMULPD(YMM(1), YMM(11), YMM(11))
+    VPERMILPD(IMM(0x5), YMM(11), YMM(11))
+    VADDSUBPD(YMM(11), YMM(15), YMM(11))
+    VADDPD(YMM(11), YMM(12), YMM(12))
+    VMOVUPD(YMM(12), MEM(RCX))
+    JMP(.END)
+
+    LABEL(.STORE)                        // beta == 0, column-stored C: store without reading C
+    VMOVUPD(YMM(6), MEM(RCX))            // column 0
+    ADD(RSI, RCX)
+    VMOVUPD(YMM(8), MEM(RCX))            // column 1
+    ADD(RSI, RCX)
+    VMOVUPD(YMM(10), MEM(RCX))           // column 2
+    ADD(RSI, RCX)
+    VMOVUPD(YMM(12), MEM(RCX))           // column 3
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    TRANSPOSE_2x2(6, 8)     // columns 0-1 -> row halves (YMM6 = row 0, YMM8 = row 1, per the stores below)
+    TRANSPOSE_2x2(10, 12)   // columns 2-3 -> row halves (YMM10 = row 0, YMM12 = row 1, per the stores below)
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)       // R9 = working copy of the C pointer
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0 (assumes BLIS_MUL_ZERO == 0 - TODO confirm)
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_2x4(R9, 5, 6, 9, 10)
+    ADD(RDI, R9)       // advance to the next row of C
+    BETA_GEN_ROW_2x4(R9, 7, 8, 11, 12)
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    VMOVUPD(YMM(6), MEM(RCX))            // row 0, first 2 dcomplex
+    VMOVUPD(YMM(10), MEM(RCX, RSI, 2))   // row 0, next 2 dcomplex (RSI == 16 here, so the offset is 32 bytes)
+    ADD(RDI, RCX)                        // advance to row 1
+    VMOVUPD(YMM(8), MEM(RCX))            // row 1, first 2 dcomplex
+    VMOVUPD(YMM(12), MEM(RCX, RSI, 2))   // row 1, next 2 dcomplex
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_2x3
+     (
+       conj_t       conja,  // not referenced in this body
+       conj_t       conjb,  // not referenced in this body
+       dim_t        m0,     // not referenced in this body
+       dim_t        n0,     // not referenced in this body
+       dim_t        k0,     // k extent: processed as k0/4 unrolled iterations plus k0%4 leftovers
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,  // rs_a0 is never read below
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,  // not referenced in this body
+       cntx_t*    restrict cntx   // not referenced in this body
+     )
+{
+    // 2x3 dcomplex kernel: accumulates A*B over k0 steps, scales by alpha, then merges into C (beta*C + result, or a plain store when beta == 0).
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // beta == 0 selects BLIS_MUL_ZERO; the asm compares this byte with 0 to skip loading C (so BLIS_MUL_ZERO is presumably 0 - TODO confirm)
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    // Constant 1.0 broadcast. NOTE(review): YMM(2) is not read again by the instructions visible in this kernel.
+    MOV(VAR(v), R9)  // R9 = address of the constant 1.0
+    VBROADCASTSD(MEM(R9), YMM(2)) // Broadcasting 1.0 over YMM(2)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Zeroing the six accumulators (two per column of the 2x3 tile)
+    VXORPD(YMM(5), YMM(5), YMM(5))
+    VXORPD(YMM(6), YMM(6), YMM(6))
+    VXORPD(YMM(7), YMM(7), YMM(7))
+    VXORPD(YMM(8), YMM(8), YMM(8))
+    VXORPD(YMM(9), YMM(9), YMM(9))
+    VXORPD(YMM(10), YMM(10), YMM(10))
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // fewer than 4 k steps: go straight to the remainder loop
+    LABEL(.ZKITERMAIN)
+
+    /* One k step of the 2x3 micro-tile (unrolled copy 1 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))     // first double at RBX (real part of the B element)
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))  // second double at RBX (imag part of the B element)
+    VMOVUPD(MEM(RAX), YMM(0))          // 32 bytes of A = two consecutive dcomplex (assumes unit row stride of A - TODO confirm)
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))   // YMM(5) += A * B.real
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))   // YMM(6) += A * B.imag
+    /* Reusing YMM(3)/YMM(4) for real/imag of the B element two cs_b further */
+    VBROADCASTSD(MEM(RBX, R15, 2), YMM(3))
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x3 micro-tile (unrolled copy 2 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Reusing YMM(3)/YMM(4) for real/imag of the B element two cs_b further */
+    VBROADCASTSD(MEM(RBX, R15, 2), YMM(3))
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x3 micro-tile (unrolled copy 3 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Reusing YMM(3)/YMM(4) for real/imag of the B element two cs_b further */
+    VBROADCASTSD(MEM(RBX, R15, 2), YMM(3))
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x3 micro-tile (unrolled copy 4 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Reusing YMM(3)/YMM(4) for real/imag of the B element two cs_b further */
+    VBROADCASTSD(MEM(RBX, R15, 2), YMM(3))
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k (k0 % 4 single steps)
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)
+    LABEL(.ZKLEFTLOOP)
+
+    /* One k step of the 2x3 micro-tile (remainder loop body) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Reusing YMM(3)/YMM(4) for real/imag of the B element two cs_b further */
+    VBROADCASTSD(MEM(RBX, R15, 2), YMM(3))
+    VBROADCASTSD(MEM(RBX, R15, 2, 8), YMM(4))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    VFMADD231PD(YMM(0), YMM(3), YMM(9))
+    VFMADD231PD(YMM(0), YMM(4), YMM(10))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 3 registers
+    // Swapping the two doubles of every 128-bit lane in the registers FMAed with imaginary components of B.
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+    VPERMILPD(IMM(0x5), YMM(8), YMM(8))
+    VPERMILPD(IMM(0x5), YMM(10), YMM(10))
+
+    // Final accumulation for A*B on 3 reg using the 6 reg: subtract in even (real) lanes, add in odd (imag) lanes.
+    VADDSUBPD(YMM(6), YMM(5), YMM(6))
+    VADDSUBPD(YMM(8), YMM(7), YMM(8))
+    VADDSUBPD(YMM(10), YMM(9), YMM(10))
+
+    // A*B is accumulated over the YMM registers as follows (one register per column of the tile):
+    /*
+      YMM6  YMM8  YMM10
+    */
+
+    // Alpha scaling: complex multiply of each result register by alpha (mul by real, mul by imag, swap, addsub)
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), YMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), YMM(1)) // Alpha->imag
+
+    VMULPD(YMM(0), YMM(6), YMM(15))
+    VMULPD(YMM(1), YMM(6), YMM(6))
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+    VADDSUBPD(YMM(6), YMM(15), YMM(6))
+
+    VMULPD(YMM(0), YMM(8), YMM(15))
+    VMULPD(YMM(1), YMM(8), YMM(8))
+    VPERMILPD(IMM(0x5), YMM(8), YMM(8))
+    VADDSUBPD(YMM(8), YMM(15), YMM(8))
+
+    VMULPD(YMM(0), YMM(10), YMM(15))
+    VMULPD(YMM(1), YMM(10), YMM(10))
+    VPERMILPD(IMM(0x5), YMM(10), YMM(10))
+    VADDSUBPD(YMM(10), YMM(15), YMM(10))
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))  // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    VMOVUPD(MEM(RCX), YMM(5))            // load 2 dcomplex of C (first column)
+    VMULPD(YMM(0), YMM(5), YMM(15))
+    VMULPD(YMM(1), YMM(5), YMM(5))
+    VPERMILPD(IMM(0x5), YMM(5), YMM(5))
+    VADDSUBPD(YMM(5), YMM(15), YMM(5))   // YMM(5) = beta * C
+    VADDPD(YMM(5), YMM(6), YMM(6))       // add alpha*A*B
+    VMOVUPD(YMM(6), MEM(RCX))
+    ADD(RSI, RCX)                        // advance C by cs_c
+
+    VMOVUPD(MEM(RCX), YMM(7))
+    VMULPD(YMM(0), YMM(7), YMM(15))
+    VMULPD(YMM(1), YMM(7), YMM(7))
+    VPERMILPD(IMM(0x5), YMM(7), YMM(7))
+    VADDSUBPD(YMM(7), YMM(15), YMM(7))
+    VADDPD(YMM(7), YMM(8), YMM(8))
+    VMOVUPD(YMM(8), MEM(RCX))
+    ADD(RSI, RCX)
+
+    VMOVUPD(MEM(RCX), YMM(9))
+    VMULPD(YMM(0), YMM(9), YMM(15))
+    VMULPD(YMM(1), YMM(9), YMM(9))
+    VPERMILPD(IMM(0x5), YMM(9), YMM(9))
+    VADDSUBPD(YMM(9), YMM(15), YMM(9))
+    VADDPD(YMM(9), YMM(10), YMM(10))
+    VMOVUPD(YMM(10), MEM(RCX))
+    JMP(.END)
+
+    LABEL(.STORE)  // beta == 0, column stored: write the results without reading C
+    VMOVUPD(YMM(6), MEM(RCX))
+    ADD(RSI, RCX)
+    VMOVUPD(YMM(8), MEM(RCX))
+    ADD(RSI, RCX)
+    VMOVUPD(YMM(10), MEM(RCX))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    TRANSPOSE_2x2(6, 8)  // macro defined outside this view; presumably regroups YMM(6)/YMM(8) by row - TODO confirm
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_2x3(R9, 5, 6, 7, 8, 9, 10)  // macro defined outside this view; presumably loads, beta-scales and stores the row-stored tile - TODO confirm
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    VEXTRACTF128(IMM(0x1), YMM(10), XMM(9))  // XMM(9) = upper 128 bits of YMM(10)
+    VMOVUPD(YMM(6), MEM(RCX))
+    VMOVUPD(XMM(10), MEM(RCX, RSI, 2))       // lower half of YMM(10) goes 2*cs_c past RCX
+    ADD(RDI, RCX)                            // advance C by rs_c
+    VMOVUPD(YMM(8), MEM(RCX))
+    VMOVUPD(XMM(9), MEM(RCX, RSI, 2))
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_2x2
+     (
+       conj_t       conja,  // not referenced in this body
+       conj_t       conjb,  // not referenced in this body
+       dim_t        m0,     // not referenced in this body
+       dim_t        n0,     // not referenced in this body
+       dim_t        k0,     // k extent: processed as k0/4 unrolled iterations plus k0%4 leftovers
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,  // rs_a0 is never read below
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,  // not referenced in this body
+       cntx_t*    restrict cntx   // not referenced in this body
+     )
+{
+    // 2x2 dcomplex kernel: accumulates A*B over k0 steps, scales by alpha, then merges into C (beta*C + result, or a plain store when beta == 0).
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // beta == 0 selects BLIS_MUL_ZERO; the asm compares this byte with 0 to skip loading C (so BLIS_MUL_ZERO is presumably 0 - TODO confirm)
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    // Constant 1.0 broadcast. NOTE(review): YMM(2) is not read again by the instructions visible in this kernel.
+    MOV(VAR(v), R9)  // R9 = address of the constant 1.0
+    VBROADCASTSD(MEM(R9), YMM(2)) // Broadcasting 1.0 over YMM(2)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)  // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Zeroing the four accumulators (two per column of the 2x2 tile)
+    VXORPD(YMM(5), YMM(5), YMM(5))
+    VXORPD(YMM(6), YMM(6), YMM(6))
+    VXORPD(YMM(7), YMM(7), YMM(7))
+    VXORPD(YMM(8), YMM(8), YMM(8))
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // fewer than 4 k steps: go straight to the remainder loop
+    LABEL(.ZKITERMAIN)
+
+    /* One k step of the 2x2 micro-tile (unrolled copy 1 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))     // first double at RBX (real part of the B element)
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))  // second double at RBX (imag part of the B element)
+    VMOVUPD(MEM(RAX), YMM(0))          // 32 bytes of A = two consecutive dcomplex (assumes unit row stride of A - TODO confirm)
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))   // YMM(5) += A * B.real
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))   // YMM(6) += A * B.imag
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x2 micro-tile (unrolled copy 2 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x2 micro-tile (unrolled copy 3 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x2 micro-tile (unrolled copy 4 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k (k0 % 4 single steps)
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)
+    LABEL(.ZKLEFTLOOP)
+
+    /* One k step of the 2x2 micro-tile (remainder loop body) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    /* Broadcasting real/imag of the B element one cs_b further into YMM(13)/YMM(14) */
+    VBROADCASTSD(MEM(RBX, R15, 1), YMM(13))
+    VBROADCASTSD(MEM(RBX, R15, 1, 8), YMM(14))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    VFMADD231PD(YMM(0), YMM(13), YMM(7))
+    VFMADD231PD(YMM(0), YMM(14), YMM(8))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 2 registers
+    // Swapping the two doubles of every 128-bit lane in the registers FMAed with imaginary components of B.
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+    VPERMILPD(IMM(0x5), YMM(8), YMM(8))
+
+    // Final accumulation for A*B on 2 reg using the 4 reg: subtract in even (real) lanes, add in odd (imag) lanes.
+    VADDSUBPD(YMM(6), YMM(5), YMM(6))
+    VADDSUBPD(YMM(8), YMM(7), YMM(8))
+
+    // A*B is accumulated over the YMM registers as follows (one register per column of the tile):
+    /*
+      YMM6  YMM8
+    */
+
+    // Alpha scaling: complex multiply of each result register by alpha (mul by real, mul by imag, swap, addsub)
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), YMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), YMM(1)) // Alpha->imag
+
+    VMULPD(YMM(0), YMM(6), YMM(15))
+    VMULPD(YMM(1), YMM(6), YMM(6))
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+    VADDSUBPD(YMM(6), YMM(15), YMM(6))
+
+    VMULPD(YMM(0), YMM(8), YMM(15))
+    VMULPD(YMM(1), YMM(8), YMM(8))
+    VPERMILPD(IMM(0x5), YMM(8), YMM(8))
+    VADDSUBPD(YMM(8), YMM(15), YMM(8))
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))  // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    VMOVUPD(MEM(RCX), YMM(5))            // load 2 dcomplex of C (first column)
+    VMULPD(YMM(0), YMM(5), YMM(15))
+    VMULPD(YMM(1), YMM(5), YMM(5))
+    VPERMILPD(IMM(0x5), YMM(5), YMM(5))
+    VADDSUBPD(YMM(5), YMM(15), YMM(5))   // YMM(5) = beta * C
+    VADDPD(YMM(5), YMM(6), YMM(6))       // add alpha*A*B
+    VMOVUPD(YMM(6), MEM(RCX))
+    ADD(RSI, RCX)                        // advance C by cs_c
+
+    VMOVUPD(MEM(RCX), YMM(7))
+    VMULPD(YMM(0), YMM(7), YMM(15))
+    VMULPD(YMM(1), YMM(7), YMM(7))
+    VPERMILPD(IMM(0x5), YMM(7), YMM(7))
+    VADDSUBPD(YMM(7), YMM(15), YMM(7))
+    VADDPD(YMM(7), YMM(8), YMM(8))
+    VMOVUPD(YMM(8), MEM(RCX))
+    JMP(.END)
+
+    LABEL(.STORE)  // beta == 0, column stored: write the results without reading C
+    VMOVUPD(YMM(6), MEM(RCX))
+    ADD(RSI, RCX)
+    VMOVUPD(YMM(8), MEM(RCX))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    TRANSPOSE_2x2(6, 8)  // macro defined outside this view; presumably regroups YMM(6)/YMM(8) by row - TODO confirm
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_2x2(R9, 5, 6, 7, 8)  // macro defined outside this view; presumably loads, beta-scales and stores the row-stored tile - TODO confirm
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    VMOVUPD(YMM(6), MEM(RCX))
+    ADD(RDI, RCX)                     // advance C by rs_c
+    VMOVUPD(YMM(8), MEM(RCX))
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+}
+
+void bli_zgemmsup_cv_zen4_asm_2x1
+     (
+       conj_t       conja,  // not referenced in this body
+       conj_t       conjb,  // not referenced in this body
+       dim_t        m0,     // not referenced in this body
+       dim_t        n0,     // not referenced in this body
+       dim_t        k0,     // k extent: processed as k0/4 unrolled iterations plus k0%4 leftovers
+       dcomplex*    restrict alpha,
+       dcomplex*    restrict a, inc_t rs_a0, inc_t cs_a0,  // rs_a0 is never read below
+       dcomplex*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       dcomplex*    restrict beta,
+       dcomplex*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,  // not referenced in this body
+       cntx_t*    restrict cntx   // not referenced in this body
+     )
+{
+    // 2x1 dcomplex kernel: accumulates A*B over k0 steps, scales by alpha, then merges into C (beta*C + result, or a plain store when beta == 0).
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t k_iter = k0 / 4; // Unroll factor of 4
+    uint64_t k_left = k0 % 4;
+
+    const double value = 1.0; // To be broadcasted and used for complex arithmetic
+    const double *v = &value;
+
+    // beta == 0 selects BLIS_MUL_ZERO; the asm compares this byte with 0 to skip loading C (so BLIS_MUL_ZERO is presumably 0 - TODO confirm)
+    char beta_mul_type = (beta->real == 0.0 && beta->imag == 0.0)? BLIS_MUL_ZERO : BLIS_MUL_DEFAULT;
+
+    BEGIN_ASM()
+
+    MOV(VAR(cs_a), R13)
+    LEA(MEM(, R13, 8), R13)
+    LEA(MEM(, R13, 2), R13)   // R13 = sizeof(dcomplex)*cs_a
+
+    MOV(VAR(rs_b), R14)
+    LEA(MEM(, R14, 8), R14)
+    LEA(MEM(, R14, 2), R14)   // R14 = sizeof(dcomplex)*rs_b
+
+    MOV(VAR(cs_b), R15)
+    LEA(MEM(, R15, 8), R15)
+    LEA(MEM(, R15, 2), R15)   // R15 = sizeof(dcomplex)*cs_b
+
+    MOV(VAR(rs_c), RDI)
+    LEA(MEM(, RDI, 8), RDI)
+    LEA(MEM(, RDI, 2), RDI)   // RDI = sizeof(dcomplex)*rs_c
+
+    MOV(VAR(cs_c), RSI)
+    LEA(MEM(, RSI, 8), RSI)
+    LEA(MEM(, RSI, 2), RSI)   // RSI = sizeof(dcomplex)*cs_c
+
+    // Constant 1.0 broadcast. NOTE(review): YMM(2) is not read again by the instructions visible in this kernel.
+    MOV(VAR(v), R9)  // R9 = address of the constant 1.0
+    VBROADCASTSD(MEM(R9), YMM(2)) // Broadcasting 1.0 over YMM(2)
+
+    MOV(var(a), RAX)     // RAX = addr of A for the MRxKC block
+    MOV(var(b), RBX)     // RBX = addr of B for the KCxNR block
+    MOV(var(c), RCX)     // RCX = addr of C for the MRxNR block
+
+    // Zeroing the two accumulators of the single column
+    VXORPD(YMM(5), YMM(5), YMM(5))
+    VXORPD(YMM(6), YMM(6), YMM(6))
+
+    // Setting iterator for k
+    MOV(VAR(k_iter), R8)
+    TEST(R8, R8)
+    JE(.ZKLEFT)          // fewer than 4 k steps: go straight to the remainder loop
+    LABEL(.ZKITERMAIN)
+
+    /* One k step of the 2x1 micro-tile (unrolled copy 1 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))     // first double at RBX (real part of the B element)
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))  // second double at RBX (imag part of the B element)
+    VMOVUPD(MEM(RAX), YMM(0))          // 32 bytes of A = two consecutive dcomplex (assumes unit row stride of A - TODO confirm)
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))   // YMM(5) += A * B.real
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))   // YMM(6) += A * B.imag
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x1 micro-tile (unrolled copy 2 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x1 micro-tile (unrolled copy 3 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    // ----------------------------------------- //
+
+    /* One k step of the 2x1 micro-tile (unrolled copy 4 of 4) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKITERMAIN)
+
+    // Remainder loop for k (k0 % 4 single steps)
+    LABEL(.ZKLEFT)
+    MOV(VAR(k_left), R8)
+    TEST(R8, R8)
+    JE(.ACCUMULATE)
+    LABEL(.ZKLEFTLOOP)
+
+    /* One k step of the 2x1 micro-tile (remainder loop body) */
+    VBROADCASTSD(MEM(RBX), YMM(3))
+    VBROADCASTSD(MEM(RBX, 8), YMM(4))
+    VMOVUPD(MEM(RAX), YMM(0))
+    VFMADD231PD(YMM(0), YMM(3), YMM(5))
+    VFMADD231PD(YMM(0), YMM(4), YMM(6))
+    /* Advancing B by rs_b and A by cs_a for the next k step */
+    ADD(R14, RBX)
+    ADD(R13, RAX)
+
+    DEC(R8)
+    JNZ(.ZKLEFTLOOP)
+
+    LABEL(.ACCUMULATE) // Accumulating A*B over 1 register
+    // Swapping the two doubles of every 128-bit lane in the register FMAed with imaginary components of B.
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+
+    // Final accumulation for A*B on 1 reg using the 2 reg: subtract in even (real) lanes, add in odd (imag) lanes.
+    VADDSUBPD(YMM(6), YMM(5), YMM(6))
+
+    // A*B is accumulated over the YMM registers as follows :
+    /*
+      YMM6
+    */
+
+    // Alpha scaling: complex multiply of the result register by alpha (mul by real, mul by imag, swap, addsub)
+    MOV(VAR(alpha), RAX)
+    VBROADCASTSD(MEM(RAX), YMM(0))  // Alpha->real
+    VBROADCASTSD(MEM(RAX, 8), YMM(1)) // Alpha->imag
+
+    VMULPD(YMM(0), YMM(6), YMM(15))
+    VMULPD(YMM(1), YMM(6), YMM(6))
+    VPERMILPD(IMM(0x5), YMM(6), YMM(6))
+    VADDSUBPD(YMM(6), YMM(15), YMM(6))
+
+    // Beta scaling
+    LABEL(.BETA_SCALE)
+    // Checking for storage scheme of C: RSI == 16 bytes means cs_c == 1
+    CMP(IMM(16), RSI)
+    JE(.ROW_STORAGE_C)  // Jumping to row storage handling case
+
+    // Beta scaling when C is column stored
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE)
+
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))  // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    VMOVUPD(MEM(RCX), YMM(5))            // load 2 dcomplex of C
+    VMULPD(YMM(0), YMM(5), YMM(15))
+    VMULPD(YMM(1), YMM(5), YMM(5))
+    VPERMILPD(IMM(0x5), YMM(5), YMM(5))
+    VADDSUBPD(YMM(5), YMM(15), YMM(5))   // YMM(5) = beta * C
+    VADDPD(YMM(5), YMM(6), YMM(6))       // add alpha*A*B
+    VMOVUPD(YMM(6), MEM(RCX))
+    JMP(.END)      // done: do not fall through into .STORE and write YMM(6) a second time
+    LABEL(.STORE)  // beta == 0, column stored: write the result without reading C
+    VMOVUPD(YMM(6), MEM(RCX))
+    JMP(.END)
+
+    // Beta scaling when C is row stored
+    LABEL(.ROW_STORAGE_C)
+    VEXTRACTF128(IMM(0x1), YMM(6), XMM(5))  // XMM(5) = upper 128 bits of YMM(6) (second dcomplex)
+
+    // Loading C(row stored) and beta scaling
+    MOV(RCX, R9)
+    MOV(VAR(beta_mul_type), AL)
+    CMP(IMM(0), AL)    // Checking if beta == 0
+    JE(.STORE_ROW)
+    MOV(VAR(beta), RBX)
+    VBROADCASTSD(MEM(RBX), YMM(0))    // Beta->real
+    VBROADCASTSD(MEM(RBX, 8), YMM(1)) // Beta->imag
+
+    BETA_GEN_ROW_2x1(R9, 6, 5)  // macro defined outside this view; presumably loads, beta-scales and stores the two rows - TODO confirm
+    JMP(.END)
+
+    // Handling when beta == 0
+    LABEL(.STORE_ROW)
+    VMOVUPD(XMM(6), MEM(RCX))         // first dcomplex of the result
+    ADD(RDI, RCX)                     // advance C by rs_c
+    VMOVUPD(XMM(5), MEM(RCX))         // second dcomplex of the result
+
+    LABEL(.END)
+
+    END_ASM(
+    : // output operands (none)
+    : // input operands
+      [v]  "m" (v),
+      [k_iter]  "m" (k_iter),
+      [k_left]  "m" (k_left),
+      [alpha]  "m" (alpha),
+      [a]      "m" (a),
+      [b]      "m" (b),
+      [beta_mul_type]   "m" (beta_mul_type),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [cs_a]   "m" (cs_a),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "al",
+      "ymm0", "ymm1", "ymm2", "ymm3",
+      "ymm4", "ymm5", "ymm6", "ymm7",
+      "ymm8", "ymm9", "ymm10", "ymm11",
+      "ymm12", "ymm13", "ymm14", "ymm15",
+      "memory"
+    )
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c
new file mode 100644
index 0000000000..96fa63e95d
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.c
@@ -0,0 +1,5873 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#include "bli_gemmsup_rd_zen_s6x64.h"
+// SGEMM "rd" sup kernel, 5 rows of A: for jj = 0,4,...,60 it accumulates a 5x4 tile of k-length dot products and stores it to C.
+void bli_sgemmsup_rd_zen_asm_5x64_avx512
+     (
+       conj_t              conja,                            // not referenced in this body
+       conj_t              conjb,                            // not referenced in this body
+       dim_t               m0,                               // only feeds m_iter below
+       dim_t               n0,                               // bound as operand [n0]; the jj loop bound is the literal 64
+       dim_t               k0,                               // dot-product length; split into 64/32/8/1 chunks below
+       float*     restrict alpha,                            // bound as operand [alpha]; presumably read by ALPHA_SCALE (macro not visible here)
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,  // rs_a0 (in elements) separates the 5 rows read; k is walked contiguously
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,  // cs_b0 (in elements) separates the 4 columns read; k is walked contiguously
+       float*     restrict beta,                             // *beta == 0 selects the .POST_ACCUM_STOR_BZ path
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,  // C base; the column offset below uses a literal 4-byte step
+       auxinfo_t* restrict data,                             // not referenced in this body
+       cntx_t*    restrict cntx                              // not referenced in this body
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of .K_LOOP_ITER64 (4 x 16 floats each)
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of .K_LOOP_ITER32 (2 x 16 floats each); 0 or 1
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of .K_LOOP_ITER8 (8 floats each)
+    uint64_t k_left1  = k_left32 % 8;   // passes of .K_LOOP_LEFT1 (1 float each)
+
+    uint64_t m_iter = m0 / 6;           // bound as operand [m_iter]; not read by the asm visible in this function
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+    // NOTE(review): r10 (cs_a) is overwritten with 3*rs_a inside the jj loop before any use.
+
+    mov(imm(0), r15)                    // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj floats (literal unit column step; cs_c is not read here)
+    
+    lea(mem(   , r15, 1), rsi)          // rsi = r15 (column index jj, stepped by 4)
+    imul( r9, rsi )                     // rsi *= cs_b (already scaled to bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem( r8, r8, 2 ), r10 )        // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG                            // macro from the included header; presumably clears the accumulators - TODO confirm
+
+    mov( var( k_iter64 ), rsi )         // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+    // Main k loop: 4 unrolled steps, each reading 16 floats from every A row (zmm0-4) and every B column (zmm6).
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load rows from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load columns from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    vmovups(        ( rbx ), zmm6 )
+    vmovups(        ( rbx ), zmm6 )     // NOTE(review): repeats the load on the previous line; looks redundant
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    // Remainder of 32: two unrolled 16-float steps with the same register roles as above.
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+    // 8 floats per step via ymm loads (writing a ymm register zeroes the upper half of the matching zmm).
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+    // Scalar tail: vmovss from memory fills lane 0 and zeroes the rest of the destination register.
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    add( imm( 1*4 ), rax )
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA5(  8,  9, 10, 20, 21 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 1*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    // The horizontal sum of each ZMM register has the result for a single
+    // element of the C Matrix.
+    // ZMM_TO_YMM adds the upper half of ZMM registers to the lower half of
+    // the respective ZMM registers, thus having the result in the lower half of
+    // ZMM registers which is equivalent to its respective YMM counterpart.
+    // ymm = lo(zmm) + hi(zmm)
+    // zmm8 = z0 z1 z2 z3 z4 z5 z6 z7 z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm0 = z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm8 = z0 z1  z2  z3  z4  z5  z6  z7
+    // ymm0 = ymm0 + ymm8
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    // Accumulates the results by horizontally adding the YMM registers,
+    // and having the final result in xmm registers.
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 21, 24, 27, 30,  8,  9, 10, 11 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 8, 9, 10, 11, 5 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR2                         // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 21, 24, 27, 30,  8,  9, 10, 11 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 8, 9, 10, 11, 5 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                      // Storing result to C
+
+    label( .SDONE )
+    // NOTE(review): r12/r14 are reloaded from cbuf/abuf at the top of .SLOOP3X4J, so the two updates below look unused.
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 6*rs_c
+
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 6*rs_a
+
+    add( imm(  4 ), r15 )               // jj += 4
+    cmp( imm( 64 ), r15 )               // column bound is the literal 64, not n0
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm23", "ymm24", "ymm26", "ymm27",
+      "ymm29", "ymm30",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+// SGEMM "rd" sup kernel, 4 rows of A: for jj = 0,4,...,60 it accumulates a 4x4 tile of k-length dot products and stores it to C.
+void bli_sgemmsup_rd_zen_asm_4x64_avx512
+     (
+       conj_t              conja,                            // not referenced in this body
+       conj_t              conjb,                            // not referenced in this body
+       dim_t               m0,                               // only feeds m_iter below
+       dim_t               n0,                               // bound as operand [n0]; the jj loop bound is the literal 64
+       dim_t               k0,                               // dot-product length; split into 64/32/8/1 chunks below
+       float*     restrict alpha,                            // bound as operand [alpha]; presumably read by ALPHA_SCALE (macro not visible here)
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,  // rs_a0 (in elements) separates the 4 rows read; k is walked contiguously
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,  // cs_b0 (in elements) separates the 4 columns read; k is walked contiguously
+       float*     restrict beta,                             // *beta == 0 selects the .POST_ACCUM_STOR_BZ path
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,  // C base; the column offset below uses a literal 4-byte step
+       auxinfo_t* restrict data,                             // not referenced in this body
+       cntx_t*    restrict cntx                              // not referenced in this body
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of .K_LOOP_ITER64 (4 x 16 floats each)
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of .K_LOOP_ITER32 (2 x 16 floats each); 0 or 1
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of .K_LOOP_ITER8 (8 floats each)
+    uint64_t k_left1  = k_left32 % 8;   // passes of .K_LOOP_LEFT1 (1 float each)
+
+    uint64_t m_iter = m0 / 6;           // bound as operand [m_iter]; not read by the asm visible in this function
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+    // NOTE(review): r10 (cs_a) is overwritten with 3*rs_a inside the jj loop before any use.
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj floats (literal unit column step; cs_c is not read here)
+    
+    lea( mem( , r15, 1 ), rsi )         // rsi = r15 (column index jj, stepped by 4)
+    imul( r9, rsi )                     // rsi *= cs_b (already scaled to bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )    // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )   // rdi = 5 * rs_a
+
+    INIT_REG                          // macro from the included header; presumably clears the accumulators - TODO confirm
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+    // Main k loop: 4 unrolled steps, each reading 16 floats from every A row (zmm0-3) and every B column (zmm6).
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    vmovups(        ( rbx ), zmm6 )     // NOTE(review): repeats the load on the previous line; looks redundant
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+    // Remainder of 32: two unrolled 16-float steps with the same register roles as above.
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+    // 8 floats per step via ymm loads (writing a ymm register zeroes the upper half of the matching zmm).
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+    // Scalar tail: vmovss from memory fills lane 0 and zeroes the rest of the destination register.
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    add( imm( 1*4 ), rax )                 // a += 1 element along k (literal 4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 element along k (literal 4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+    label( .POST_ACCUM_STOR )       // Accumulating & storing the results when beta != 0
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE     // Scaling the result of A*B with alpha
+
+    C_STOR       // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+
+    ALPHA_SCALE     // Scaling the result of A*B with alpha
+
+    C_STOR1         // Storing result to C
+
+    jmp( .SDONE )
+
+    label( .POST_ACCUM_STOR_BZ )  // Accumulating & storing the results when beta == 0
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ       // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )    // NOTE(review): no counterpart in the beta != 0 path above; looks like a leftover - confirm
+    ACCUM_YMM( 6, 9, 12, 15, 6 )    // NOTE(review): same as the line above
+
+    ALPHA_SCALE     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ1       // Storing result to C
+
+    label( .SDONE )
+    // NOTE(review): r12/r14 are reloaded from cbuf/abuf at the top of .SLOOP3X4J, so the two updates below look unused.
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )         //
+    lea( mem( r12, rdi, 4 ), r12 )         // c_ii = r12 += 6*rs_c
+
+    lea( mem( r14, r8,  2 ), r14 )         //
+    lea( mem( r14, r8,  4 ), r14 )         // a_ii = r14 += 6*rs_a
+
+    add( imm(  4 ), r15 )                  // jj += 4
+    cmp( imm( 64 ), r15 )                  // column bound is the literal 64, not n0
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm23", "ymm26", "ymm29",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_3x64_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+
+    lea( mem( r9, r9, 2 ), r13 )    // r13 = 3 * rs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )  // c += r15 * cs_c
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rbx = b + 4*jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_b
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_b
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_b = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 3*rs_c
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 3*rs_a
+
+    add( imm(  4 ), r15 )
+    cmp( imm( 64 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_2x64_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of the 4x-unrolled 16-float loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of the 2x-unrolled 16-float loop (0 or 1)
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of the 8-float (ymm) loop (0..3)
+    uint64_t k_left1  = k_left32 % 8;   // remaining one-float iterations (0..7)
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    // Produce a 2x64 block of C: rows 0-1 of A against 64 columns of B, four
+    // columns per pass of the jj loop. Both A and B are walked along k with a
+    // hard-coded 4-byte element step (unit k-stride is assumed, not checked).
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(float) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(float) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= 4 (r10 is overwritten inside the jj loop)
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= 4 (rdi is overwritten inside the jj loop)
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 60 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * 4 bytes (unit column stride of C is hard-coded)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj columns
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj
+    imul( r9, rsi )                     // rsi *= cs_b (already in bytes)
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+    
+
+    INIT_REG                          // macro defined outside this view; presumably zeroes the accumulators - confirm
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load 16 floats from rows 0 and 1 of A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load 16 floats from each of the 4 current columns of B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )                     // macro defined outside this view; presumably zmm8/zmm9 += zmm0/zmm1 * zmm6 - confirm
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )        // load k_iter8
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load 8 floats per row of A; a ymm write zeroes bits 511:256 of the matching zmm
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    add( imm( 8*4 ), rax )
+
+    // load 8 floats per column of B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 8*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )        // load k_left1
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    // one float per operand; vmovss from memory zeroes the remaining lanes
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    add( imm( 1*4 ), rax )                 // a += 1 float (4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA2( 8, 9 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA2( 14, 15 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 float (4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )       // ZF is also set for an unordered compare, so a NaN beta takes this path too
+
+
+    // Accumulating & storing the results when beta != 0
+    // (ZMM_TO_YMM, ACCUM_YMM, ALPHA_SCALE2 and C_STOR2 are macros defined outside this view)
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                // Scaling the result of A*B with alpha
+
+    C_STOR2                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                  // Storing result to C
+
+    label( .SDONE )
+
+    add( imm(  4 ), r15 )       // jj += 4
+    cmp( imm( 64 ), r15 )       // column bound is hard-coded to 64; n0 is not read by the visible asm
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm17", "ymm18",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_1x64_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of the 4x-unrolled 16-float loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of the 2x-unrolled 16-float loop (0 or 1)
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of the 8-float (ymm) loop (0..3)
+    uint64_t k_left1  = k_left32 % 8;   // remaining one-float iterations (0..7)
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    // Produce a 1x64 block of C: row 0 of A against 64 columns of B, four columns per
+    // jj pass. A and B are walked along k with a hard-coded 4-byte step (unit k-stride assumed).
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(float) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(float) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= 4 (r10 is overwritten inside the jj loop)
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= 4 (rdi is overwritten inside the jj loop)
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 60 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * 4 bytes (unit column stride of C is hard-coded)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj columns
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj
+    imul( r9, rsi )                     // rsi *= cs_b (already in bytes)
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+    
+
+    INIT_REG                          // macro defined outside this view; presumably zeroes the accumulators - confirm
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load 16 floats from row 0 of A
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load 16 floats from each of the 4 current columns of B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )                        // macro defined outside this view; presumably zmm8 += zmm0 * zmm6 - confirm
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )        // load k_iter8
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load 8 floats from row 0 of A; a ymm write zeroes bits 511:256 of the matching zmm
+    vmovups(         ( rax ), ymm0 )
+    add( imm( 8*4 ), rax )
+
+    // load 8 floats per column of B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA1( 17 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )        // load k_left1
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    // one float per operand; vmovss from memory zeroes the remaining lanes
+    vmovss(         ( rax ), xmm0 )
+    add( imm( 1*4 ), rax )                 // a += 1 float (4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA1( 8 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA1( 11 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA1( 14 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA1( 17 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 float (4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )       // ZF is also set for an unordered compare, so a NaN beta takes this path too
+
+
+    // Accumulating & storing the results when beta != 0
+    // (ZMM_TO_YMM, ACCUM_YMM, ALPHA_SCALE1 and C_STOR1 are macros defined outside this view)
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM( 8, 11, 14, 17, 4, 7, 10, 13 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+
+    ALPHA_SCALE1                // Scaling the result of A*B with alpha
+
+    C_STOR1                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM( 8, 11, 14, 17, 4, 7, 10, 13 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+
+    ALPHA_SCALE1                // Scaling the result of A*B with alpha
+
+    C_STOR_BZ1                  // Storing result to C
+
+
+    label( .SDONE )
+
+    add( imm(  4 ), r15 )       // jj += 4
+    cmp( imm( 64 ), r15 )       // column bound is hard-coded to 64; n0 is not read by the visible asm
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm6",
+      "ymm0", "ymm2", "ymm3", "ymm4", "ymm6",
+      "ymm7", "ymm8", "ymm10", "ymm11", "ymm13",
+      "ymm14", "ymm17",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_5x48_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * rs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )      // c += r15 * cs_c
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rbx = b + 4*jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_b
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_b
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_b = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR                      // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 21, 24, 27, 30,  8,  9, 10, 11 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 8, 9, 10, 11, 5 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR2                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                   // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 21, 24, 27, 30,  8,  9, 10, 11 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 8, 9, 10, 11, 5 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                  // Storing result to C
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 3*rs_c
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 3*rs_a
+
+    add( imm(  4 ), r15 )
+    cmp( imm( 48 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm23", "ymm24", "ymm26", "ymm27",
+      "ymm29", "ymm30",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+/*
+   sgemmsup "rd" (dot-product style) kernel for a 4x48 block of C, single
+   precision, AVX-512.
+
+   The jj loop walks the 48 columns of C four at a time. For each 4x4 tile,
+   four rows of A (rs_a apart) are combined with four columns of B (cs_b
+   apart) through the VFMA4 macro while k is consumed in chunks of 64, 32,
+   8 and finally 1 element. The accumulators are then reduced, scaled by
+   alpha and written out by C_STOR/C_STOR1 (beta != 0) or by
+   C_STOR_BZ/C_STOR_BZ1 (beta == 0).
+
+   conja, conjb, data and cntx are not referenced in this function body.
+*/
+void bli_sgemmsup_rd_zen_asm_4x48_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of the 64-element k loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of the 32-element k loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of the 8-element k loop
+    uint64_t k_left1  = k_left32 % 8;   // passes of the scalar k loop
+
+    uint64_t m_iter = m0 / 6;           // only forwarded as an asm operand
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(float)
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(float)
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(float); r10 is reused below
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = 0, 4, ..., 44
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi *= sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj * sizeof(float)
+
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj
+    imul( r9, rsi )                     // rsi *= cs_b (already in bytes)
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+    label( .K_LOOP_ITER8 )
+    // 8 k-elements per pass, so only 256-bit loads are used here.
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 8*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+    label( .K_LOOP_LEFT1 )
+
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    add( imm( 1*4 ), rax )                 // a += 1 float (4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA4( 11, 12, 13, 23 )
+
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 float (4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+    label( .POST_ACCUM )
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE             // Scaling the result of A*B with alpha
+
+    C_STOR                  // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+
+    ALPHA_SCALE             // Scaling the result of A*B with alpha
+
+    C_STOR1                 // Storing result to C
+
+    jmp( .SDONE )
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE             // Scaling the result of A*B with alpha
+
+    C_STOR_BZ               // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    // Only zmm20/23/26/29 remain; reduce them once, as the beta != 0 path does.
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+
+    ALPHA_SCALE             // Scaling the result of A*B with alpha
+
+    C_STOR_BZ1              // Storing result to C
+
+    label( .SDONE )
+    // NOTE: r12, r14 and rdi are all rewritten at the top of the next jj pass.
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )      // r12 += 2*rs_c
+    lea( mem( r12, rdi, 4 ), r12 )      // r12 += 4*rs_c (6*rs_c in total)
+
+    lea( mem( r14, r8,  2 ), r14 )      // r14 += 2*rs_a
+    lea( mem( r14, r8,  4 ), r14 )      // r14 += 4*rs_a (6*rs_a in total)
+
+    add( imm(  4 ), r15 )               // jj += 4
+    cmp( imm( 48 ), r15 )
+    jl( .SLOOP3X4J )                    // repeat while jj < 48
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm23", "ymm26", "ymm29",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+/*
+   sgemmsup "rd" (dot-product style) kernel for a 3x48 block of C, single
+   precision, AVX-512.
+
+   The jj loop walks the 48 columns of C four at a time. For each 3x4 tile,
+   three rows of A (rs_a apart) are combined with four columns of B (cs_b
+   apart) through the VFMA3 macro while k is consumed in chunks of 64, 32,
+   8 and finally 1 element. The accumulators are then reduced, scaled by
+   alpha and written out by C_STOR (beta != 0) or C_STOR_BZ (beta == 0).
+   conja, conjb, data and cntx are not referenced in this function body.
+*/
+void bli_sgemmsup_rd_zen_asm_3x48_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of the 64-element k loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of the 32-element k loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of the 8-element k loop
+    uint64_t k_left1  = k_left32 % 8;   // passes of the scalar k loop
+
+    uint64_t m_iter = m0 / 6;           // only forwarded as an asm operand
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(float)
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(float)
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(float); r10 is reused below
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = 0, 4, ..., 44
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi *= sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj * sizeof(float)
+
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj
+    imul( r9, rsi )                     // rsi *= cs_b (already in bytes)
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )         // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+    label( .K_LOOP_ITER8 )
+    // 8 k-elements per pass: A rows and B columns both use 256-bit loads so
+    // that nothing beyond the 8 floats consumed here is read from memory.
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 8*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+    label( .K_LOOP_LEFT1 )
+
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    add( imm( 1*4 ), rax )                 // a += 1 float (4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 float (4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR                      // Storing result to C
+
+    jmp( .SDONE )
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                   // Storing result to C
+
+    label( .SDONE )
+    // NOTE: r12, r14 and rdi are all rewritten at the top of the next jj pass.
+    mov( var( rs_c ), rdi )                 // load rs_c
+    lea( mem( , rdi, 4 ), rdi )             // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )          // r12 += 2*rs_c
+    lea( mem( r12, rdi, 4 ), r12 )          // r12 += 4*rs_c (6*rs_c in total)
+    lea( mem( r14, r8,  2 ), r14 )          // r14 += 2*rs_a
+    lea( mem( r14, r8,  4 ), r14 )          // r14 += 4*rs_a (6*rs_a in total)
+
+    add( imm(  4 ), r15 )                   // jj += 4
+    cmp( imm( 48 ), r15 )
+    jl( .SLOOP3X4J )                        // repeat while jj < 48
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_2x48_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * rs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )      // c += r15 * cs_c
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rbx = b + 4*jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )    // r10 = 3 * rs_b
+    lea( mem( r10, r8, 2 ), rdi )   // rdi = 5 * rs_b
+    
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_b = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA2( 8, 9 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA2( 14, 15 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                // Scaling the result of A*B with alpha
+
+    C_STOR2                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                  // Storing result to C
+
+
+    label( .SDONE )
+
+    add( imm(  4 ), r15 )
+    cmp( imm( 48 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm17", "ymm18",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_1x48_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // trip count of the 4x-unrolled zmm loop (64 floats per trip)
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // trip count of the 2x-unrolled zmm loop (32 floats per trip)
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // trip count of the ymm loop (8 floats per trip)
+    uint64_t k_left1  = k_left32 % 8;   // trailing k iterations handled one float at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Produce a 1x48 strip of C in 12 passes; each pass walks k once for one row of A against 4 columns of B. */
+    // A and B are read contiguously along k (pointers advance by element size only; cs_a and rs_b are not applied below).
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a (r10 is overwritten with 3*rs_a inside the loop before it is read)
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )             // load rs_c (rdi is overwritten with 5*rs_a inside the loop before it is read)
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm( 0 ), r15 )                // r15 = column offset = 0
+    label( .SLOOP3X4J )                 // LOOP OVER r15 = [ 0 4 ... 44 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )      // c += r15 * sizeof(float) (cs_c itself is not read)
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + 4*jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a (not read by the instructions visible in this function)
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a (not read by the instructions visible in this function)
+    
+    INIT_REG                            // macro defined elsewhere; presumably clears the accumulators - TODO confirm
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )                          // macro defined elsewhere; presumably zmm8 += zmm0 * zmm6 - TODO confirm
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A (8 floats; a ymm-destination load zeroes the upper half of the matching zmm register)
+    vmovups(         ( rax ), ymm0 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA1( 17 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )        // one float of A; the remaining lanes of the register are zeroed by the load
+    add( imm( 1*4 ), rax )                 // a += 1 element (4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA1( 8 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA1( 11 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA1( 14 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA1( 17 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 element (4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM( 8, 11, 14, 17, 4, 7, 10, 13 )   // macro defined elsewhere; presumably folds zmm8/11/14/17 into ymm4/7/10/13 - TODO confirm
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )                // macro defined elsewhere; presumably reduces the four sums into ymm4 - TODO confirm
+
+    ALPHA_SCALE1                // Scaling the result of A*B with alpha
+
+    C_STOR1                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM( 8, 11, 14, 17, 4, 7, 10, 13 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+
+    ALPHA_SCALE1                // Scaling the result of A*B with alpha
+
+    C_STOR_BZ1                  // Storing result to C
+
+
+    label( .SDONE )
+
+    add( imm(  4 ), r15 )       // next group of 4 columns; a, b and c are re-derived from abuf/bbuf/cbuf at the loop top
+    cmp( imm( 48 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm6",
+      "ymm0", "ymm2", "ymm3", "ymm4", "ymm6",
+      "ymm7", "ymm8", "ymm10", "ymm11", "ymm13", "ymm14", "ymm17",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+/*
+   Single-precision gemmsup "rd" kernel for one fixed 5x32 tile of C.
+   The tile is produced in 8 passes of 4 columns; each pass keeps 5x4
+   partial dot products in zmm accumulators while walking k in chunks of
+   64, 32, 8 and finally 1 element. The reduction, alpha scaling and the
+   store to C (with or without the beta*C term) are done by macros that
+   are defined outside this function. A and B are read contiguously along
+   k and C columns are 4 bytes apart; cs_a, rs_b and cs_c are not applied.
+*/
+void bli_sgemmsup_rd_zen_asm_5x32_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // trip count of the 4x-unrolled zmm loop (64 floats per trip)
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // trip count of the 2x-unrolled zmm loop (32 floats per trip)
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // trip count of the ymm loop (8 floats per trip)
+    uint64_t k_left1  = k_left32 % 8;   // trailing k iterations handled one float at a time
+
+    uint64_t m_iter = m0 / 6;           // kept as an asm operand; no instruction visible in this function reads it
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm( 0 ), r15 )                // r15 = column offset = 0
+    label( .SLOOP3X4J )                 // LOOP OVER r15 = [ 0 4 ... 28 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )      // c += r15 * sizeof(float) (cs_c itself is not read)
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + 4*jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )    // r10 = 3 * rs_a (offset of A row 3)
+    lea( mem( r10, r8, 2 ), rdi )   // rdi = 5 * rs_a (not read by the loops below)
+
+    INIT_REG                            // macro defined elsewhere; presumably clears the accumulators - TODO confirm
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )           // macro defined elsewhere; presumably accumulates zmm0..zmm4 * zmm6 - TODO confirm
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax, r8, 4 ), zmm4 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A (8 floats; a ymm-destination load zeroes the upper half of the matching zmm register)
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )        // one float per A row; the remaining lanes of each register are zeroed by the load
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    add( imm( 1*4 ), rax )                 // a += 1 element (4 bytes)
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA5( 8, 9, 10, 20, 21 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA5( 11, 12, 13, 23, 24 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA5( 14, 15, 16, 26, 27 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA5( 17, 18, 19, 29, 30 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 element (4 bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR                      // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 21, 24, 27, 30,  8,  9, 10, 11 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 8, 9, 10, 11, 5 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR2                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                   // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 21, 24, 27, 30,  8,  9, 10, 11 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+    ACCUM_YMM( 8, 9, 10, 11, 5 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                  // Storing result to C
+
+    label( .SDONE )
+
+    // Next group of 4 columns; a, b and c are re-derived from abuf/bbuf/cbuf at the loop top.
+    add( imm(  4 ), r15 )
+    cmp( imm( 32 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm23", "ymm24", "ymm26", "ymm27",
+      "ymm29", "ymm30",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_4x32_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * rs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )      // c += r15 * cs_c
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rbx = b + 4*jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_b
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_b
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_b = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA4(  8,  9, 10, 20 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA4( 11, 12, 13, 23 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA4( 14, 15, 16, 26 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA4( 17, 18, 19, 29 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )       
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR                      // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR1                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                   // Storing result to C
+
+    ZMM_TO_YMM( 20, 23, 26, 29,  4,  5,  6,  7 )
+
+    ACCUM_YMM( 4, 5, 6, 7, 4 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ1                  // Storing result to C
+
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 3*rs_c
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 3*rs_a
+
+    add( imm(  4 ), r15 )
+    cmp( imm( 32 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm23", "ymm26", "ymm29",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_3x32_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;           // trips of the 4 x 16-float k loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;     // trips of the 2 x 16-float k loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;      // trips of the 8-float (ymm) k loop
+    uint64_t k_left1  = k_left32 % 8;      // trips of the scalar k loop
+
+    uint64_t m_iter = m0 / 6;              // only forwarded as an asm input operand below
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+    // 3 rows of A against 32 columns of B, 4 columns per jj pass; A and B are stepped 4 bytes per k element.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 28 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj * sizeof(float) (cs_c is not read here)
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj;
+    imul( r9, rsi )                     // rsi *= cs_b (already scaled to bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a (replaces the cs_a value)
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B: 8 floats (ymm) per step, matching the A loads and the 32-byte advance of rbx
+    vmovups(        ( rbx ), ymm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    add( imm( 1*4 ), rax )                 // a += 1 element = 1*4 bytes;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA3(  8,  9, 10 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA3( 11, 12, 13 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 element = 1*4 bytes;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR                      // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                 // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                   // Storing result to C
+
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 6*rs_c
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 6*rs_a
+    // NOTE: r12 and r14 are reloaded from cbuf/abuf at the top of the jj loop.
+    add( imm(  4 ), r15 )
+    cmp( imm( 32 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_2x32_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;           // trips of the 4 x 16-float k loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;     // trips of the 2 x 16-float k loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;      // trips of the 8-float (ymm) k loop
+    uint64_t k_left1  = k_left32 % 8;      // trips of the scalar k loop
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 2 rows of A against 32 columns of B, 4 columns per jj pass. */
+    // A and B are both stepped a fixed 4 bytes per k element.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 28 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj * sizeof(float) (cs_c is not read here)
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj;
+    imul( r9, rsi )                     // rsi *= cs_b (already scaled to bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a (replaces the cs_a value)
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a (replaces the rs_c value loaded above)
+    
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    // (one load per column; zmm6 is reused for each of the four columns in turn)
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    add( imm( 1*4 ), rax )                 // a += 1 element = 1*4 bytes;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA2( 8, 9 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA2( 14, 15 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 element = 1*4 bytes;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                // Scaling the result of A*B with alpha
+
+    C_STOR2                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                  // Storing result to C
+
+
+    label( .SDONE )
+
+    add( imm(  4 ), r15 )               // jj += 4
+    cmp( imm( 32 ), r15 )
+    jl( .SLOOP3X4J )                    // repeat while jj < 32
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm17", "ymm18",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rd_zen_asm_1x32_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;           // trips of the 4 x 16-float k loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;     // trips of the 2 x 16-float k loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;      // trips of the 8-float (ymm) k loop
+    uint64_t k_left1  = k_left32 % 8;      // trips of the scalar k loop
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 1 row of A against 32 columns of B, 4 columns per jj pass. */
+    // A and B are both stepped a fixed 4 bytes per k element.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 28 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj * sizeof(float) (cs_c is not read here)
+    
+    lea( mem(  , r15, 1 ), rsi )        // rsi = jj;
+    imul( r9, rsi )                     // rsi *= cs_b (already scaled to bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b;
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a (replaces the cs_a value)
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a (replaces the rs_c value loaded above)
+    
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )         // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA1( 17 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA1( 8 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA1( 11 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA1( 14 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA1( 17 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    add( imm( 1*4 ), rax )                 // a += 1 element = 1*4 bytes;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA1( 8 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA1( 11 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA1( 14 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA1( 17 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1 element = 1*4 bytes;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM( 8, 11, 14, 17, 4, 7, 10, 13 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+
+    ALPHA_SCALE1                // Scaling the result of A*B with alpha
+
+    C_STOR1                     // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM( 8, 11, 14, 17, 4, 7, 10, 13 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+
+    ALPHA_SCALE1                // Scaling the result of A*B with alpha
+
+    C_STOR_BZ1                  // Storing result to C
+
+
+    label( .SDONE )
+
+    add( imm(  4 ), r15 )               // jj += 4
+    cmp( imm( 32 ), r15 )
+    jl( .SLOOP3X4J )                    // repeat while jj < 32
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm6",
+      "ymm0", "ymm2", "ymm3", "ymm4", "ymm6",
+      "ymm7", "ymm8", "ymm10", "ymm11", "ymm13",
+      "ymm14", "ymm17",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.h b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.h
new file mode 100644
index 0000000000..80e43843cc
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64.h
@@ -0,0 +1,201 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#define INIT_REG /* zero zmm0-zmm31 (x XOR x == 0) so every accumulator starts at 0 */ \
+    vxorps( zmm0,zmm0,zmm0 ) \
+    vxorps( zmm1,zmm1,zmm1 ) \
+    vxorps( zmm2,zmm2,zmm2 ) \
+    vxorps( zmm3,zmm3,zmm3 ) \
+    vxorps( zmm4,zmm4,zmm4 ) \
+    vxorps( zmm5,zmm5,zmm5 ) \
+    vxorps( zmm6,zmm6,zmm6 ) \
+    vxorps( zmm7,zmm7,zmm7 ) \
+    vxorps( zmm8,zmm8,zmm8 ) \
+    vxorps( zmm9,zmm9,zmm9 ) \
+    vxorps( zmm10,zmm10,zmm10 ) \
+    vxorps( zmm11,zmm11,zmm11 ) \
+    vxorps( zmm12,zmm12,zmm12 ) \
+    vxorps( zmm13,zmm13,zmm13 ) \
+    vxorps( zmm14,zmm14,zmm14 ) \
+    vxorps( zmm15,zmm15,zmm15 ) \
+    vxorps( zmm16,zmm16,zmm16 ) \
+    vxorps( zmm17,zmm17,zmm17 ) \
+    vxorps( zmm18,zmm18,zmm18 ) \
+    vxorps( zmm19,zmm19,zmm19 ) \
+    vxorps( zmm20,zmm20,zmm20 ) \
+    vxorps( zmm21,zmm21,zmm21 ) \
+    vxorps( zmm22,zmm22,zmm22 ) \
+    vxorps( zmm23,zmm23,zmm23 ) \
+    vxorps( zmm24,zmm24,zmm24 ) \
+    vxorps( zmm25,zmm25,zmm25 ) \
+    vxorps( zmm26,zmm26,zmm26 ) \
+    vxorps( zmm27,zmm27,zmm27 ) \
+    vxorps( zmm28,zmm28,zmm28 ) \
+    vxorps( zmm29,zmm29,zmm29 ) \
+    vxorps( zmm30,zmm30,zmm30 ) \
+    vxorps( zmm31,zmm31,zmm31 )
+
+#define VFMA6( R0, R1, R2, R3, R4, R5 ) /* six FMAs sharing zmm6: zmm(Rn) += zmm<n> * zmm6, n = 0..5 */ \
+    vfmadd231ps( zmm0, zmm6, zmm(R0) ) \
+    vfmadd231ps( zmm1, zmm6, zmm(R1) ) \
+    vfmadd231ps( zmm2, zmm6, zmm(R2) ) \
+    vfmadd231ps( zmm3, zmm6, zmm(R3) ) \
+    vfmadd231ps( zmm4, zmm6, zmm(R4) ) \
+    vfmadd231ps( zmm5, zmm6, zmm(R5) )
+
+#define VFMA5( R0, R1, R2, R3, R4 ) /* five FMAs sharing zmm6: zmm(Rn) += zmm<n> * zmm6, n = 0..4 */ \
+    vfmadd231ps( zmm0, zmm6, zmm(R0) ) \
+    vfmadd231ps( zmm1, zmm6, zmm(R1) ) \
+    vfmadd231ps( zmm2, zmm6, zmm(R2) ) \
+    vfmadd231ps( zmm3, zmm6, zmm(R3) ) \
+    vfmadd231ps( zmm4, zmm6, zmm(R4) )
+
+#define VFMA4( R0, R1, R2, R3 ) /* four FMAs sharing zmm6: zmm(Rn) += zmm<n> * zmm6, n = 0..3 */ \
+    vfmadd231ps( zmm0, zmm6, zmm(R0) ) \
+    vfmadd231ps( zmm1, zmm6, zmm(R1) ) \
+    vfmadd231ps( zmm2, zmm6, zmm(R2) ) \
+    vfmadd231ps( zmm3, zmm6, zmm(R3) )
+
+#define VFMA3( R0, R1, R2 ) /* three FMAs sharing zmm6: zmm(Rn) += zmm<n> * zmm6, n = 0..2 */ \
+    vfmadd231ps( zmm0, zmm6, zmm(R0) ) \
+    vfmadd231ps( zmm1, zmm6, zmm(R1) ) \
+    vfmadd231ps( zmm2, zmm6, zmm(R2) )
+
+#define VFMA2( R0, R1 ) /* two FMAs sharing zmm6: zmm(Rn) += zmm<n> * zmm6, n = 0..1 */ \
+    vfmadd231ps( zmm0, zmm6, zmm(R0) ) \
+    vfmadd231ps( zmm1, zmm6, zmm(R1) )
+
+#define VFMA1( R0 ) /* single FMA: zmm(R0) += zmm0 * zmm6 */ \
+    vfmadd231ps( zmm0, zmm6, zmm(R0) )
+
+#define ZMM_TO_YMM( R0, R1, R2, R3, R4, R5, R6, R7 ) /* fold 512 -> 256 bits: ymm(R4..R7) = lo(zmm(R0..R3)) + hi(zmm(R0..R3)); ymm0-ymm3 are scratch */ \
+    VEXTRACTF32X8( imm(0x01), zmm(R0), ymm0 ) /* ymm0 = upper 8 floats of zmm(R0) */ \
+    VADDPS( ymm0, ymm(R0), ymm(R4) ) /* ymm(R4) = lower half + upper half */ \
+    VEXTRACTF32X8( imm(0x01), zmm(R1), ymm1 ) \
+    VADDPS( ymm1, ymm(R1), ymm(R5) ) \
+    VEXTRACTF32X8( imm(0x01), zmm(R2), ymm2 ) \
+    VADDPS( ymm2, ymm(R2), ymm(R6) ) \
+    VEXTRACTF32X8( imm(0x01), zmm(R3), ymm3 ) \
+    VADDPS( ymm3, ymm(R3), ymm(R7) )
+
+#define ACCUM_YMM( R0, R1, R2, R3, R4 ) /* xmm(R4) = [ sum(ymm(R0)), sum(ymm(R1)), sum(ymm(R2)), sum(ymm(R3)) ]; clobbers ymm0, xmm1, ymm2 */ \
+    vhaddps( ymm(R1), ymm(R0), ymm0 ) /* pairwise sums of R0 and R1, per 128-bit lane */ \
+    vextractf128( imm(0x01), ymm0, xmm1 ) /* xmm1 = upper 128-bit lane of ymm0 */ \
+    vaddps( xmm0, xmm1, xmm0 ) /* xmm0 = lower lane + upper lane */ \
+    vhaddps( ymm(R3), ymm(R2), ymm2 ) /* same reduction for R2 and R3 */ \
+    vextractf128( imm(0x01), ymm2, xmm1 ) \
+    vaddps( xmm2, xmm1, xmm2 ) \
+    vhaddps( xmm2, xmm0, xmm(R4) ) /* final pairwise add packs the four totals */
+
+#define ALPHA_SCALE /* xmm4, xmm5, xmm6 *= *alpha; clobbers rax and xmm0 */ \
+    mov( var(alpha), rax ) /* rax = address of alpha */ \
+    vbroadcastss( (rax), xmm0 ) /* xmm0 = alpha in all four lanes */ \
+    vmulps( xmm0, xmm4, xmm4 ) \
+    vmulps( xmm0, xmm5, xmm5 ) \
+    vmulps( xmm0, xmm6, xmm6 )
+
+#define ALPHA_SCALE2 /* xmm4, xmm5 *= *alpha; clobbers rax and xmm0 */ \
+    mov( var(alpha), rax ) /* rax = address of alpha */ \
+    vbroadcastss( (rax), xmm0 ) /* xmm0 = alpha in all four lanes */ \
+    vmulps( xmm0, xmm4, xmm4 ) \
+    vmulps( xmm0, xmm5, xmm5 )
+
+#define ALPHA_SCALE1 /* xmm4 *= *alpha; clobbers rax and xmm0 */ \
+    mov( var(alpha), rax ) /* rax = address of alpha */ \
+    vbroadcastss( (rax), xmm0 ) /* xmm0 = alpha in all four lanes */ \
+    vmulps( xmm0, xmm4, xmm4 )
+
+#define C_STOR /* three rows, beta != 0 path: for xmm4, xmm5, xmm6 do xmm += beta * (rcx), store 4 floats to (rcx), rcx += rs_c bytes; clobbers rax, rdi, xmm0 */ \
+    mov( var( rs_c ), rdi ) /* rdi = rs_c (elements) */ \
+	lea( mem( , rdi, 4 ), rdi ) /* rdi = rs_c * 4 bytes per float */ \
+    mov( var(beta), rax ) /* rax = address of beta */ \
+    vbroadcastss( (rax), xmm0 ) /* xmm0 = beta in all four lanes */ \
+    vfmadd231ps( (rcx), xmm0, xmm4 ) /* xmm4 += beta * 4 floats at (rcx) */ \
+    vmovups( xmm4, (rcx) ) \
+    add( rdi, rcx ) /* advance rcx by one row stride */ \
+    vbroadcastss( (rax), xmm0 ) /* reloads beta; xmm0 was not modified since the previous broadcast */ \
+    vfmadd231ps( (rcx), xmm0, xmm5 ) \
+    vmovups( xmm5, (rcx) ) \
+    add( rdi, rcx ) \
+    vbroadcastss( (rax), xmm0 ) /* reloads beta; xmm0 was not modified since the previous broadcast */ \
+    vfmadd231ps( (rcx), xmm0, xmm6 ) \
+    vmovups( xmm6, (rcx) ) \
+    add( rdi, rcx )
+
+#define C_STOR2 /* two rows, beta != 0 path: for xmm4, xmm5 do xmm += beta * (rcx), store 4 floats to (rcx), rcx += rs_c bytes; clobbers rax, rdi, xmm0 */ \
+    mov( var( rs_c ), rdi ) /* rdi = rs_c (elements) */ \
+	lea( mem( , rdi, 4 ), rdi ) /* rdi = rs_c * 4 bytes per float */ \
+    mov( var(beta), rax ) /* rax = address of beta */ \
+    vbroadcastss( (rax), xmm0 ) /* xmm0 = beta in all four lanes */ \
+    vfmadd231ps( (rcx), xmm0, xmm4 ) /* xmm4 += beta * 4 floats at (rcx) */ \
+    vmovups( xmm4, (rcx) ) \
+    add( rdi, rcx ) /* advance rcx by one row stride */ \
+    vbroadcastss( (rax), xmm0 ) /* reloads beta; xmm0 was not modified since the previous broadcast */ \
+    vfmadd231ps( (rcx), xmm0, xmm5 ) \
+    vmovups( xmm5, (rcx) ) \
+    add( rdi, rcx )
+
+#define C_STOR1 /* one row, beta != 0 path: xmm4 += beta * (rcx), store 4 floats to (rcx), rcx += rs_c bytes; clobbers rax, rdi, xmm0 */ \
+    mov( var( rs_c ), rdi ) /* rdi = rs_c (elements) */ \
+	lea( mem( , rdi, 4 ), rdi ) /* rdi = rs_c * 4 bytes per float */ \
+    mov( var(beta), rax ) /* rax = address of beta */ \
+    vbroadcastss( (rax), xmm0 ) /* xmm0 = beta in all four lanes */ \
+    vfmadd231ps( (rcx), xmm0, xmm4 ) /* xmm4 += beta * 4 floats at (rcx) */ \
+    vmovups( xmm4, (rcx) ) \
+    add( rdi, rcx ) /* advance rcx by one row stride */
+
+#define C_STOR_BZ /* three rows, beta == 0 path: store xmm4, xmm5, xmm6 to (rcx) without reading C, rcx += rs_c bytes after each; clobbers rdi */ \
+    mov( var( rs_c ), rdi ) /* rdi = rs_c (elements) */ \
+	lea( mem( , rdi, 4 ), rdi ) /* rdi = rs_c * 4 bytes per float */ \
+    vmovups( xmm4, mem( rcx ) ) \
+    add( rdi, rcx ) /* advance rcx by one row stride */ \
+    vmovups( xmm5, mem( rcx ) ) \
+    add( rdi, rcx ) \
+    vmovups( xmm6, mem( rcx ) ) \
+    add( rdi, rcx )
+
+#define C_STOR_BZ2 /* two rows, beta == 0 path: store xmm4, xmm5 to (rcx) without reading C, rcx += rs_c bytes after each; clobbers rdi */ \
+    mov( var( rs_c ), rdi ) /* rdi = rs_c (elements) */ \
+	lea( mem( , rdi, 4 ), rdi ) /* rdi = rs_c * 4 bytes per float */ \
+    vmovups( xmm4, mem( rcx ) ) \
+    add( rdi, rcx ) /* advance rcx by one row stride */ \
+    vmovups( xmm5, mem( rcx ) ) \
+    add( rdi, rcx )
+
+#define C_STOR_BZ1 /* one row, beta == 0 path: store xmm4 to (rcx) without reading C, then rcx += rs_c bytes; clobbers rdi */ \
+    mov( var( rs_c ), rdi ) /* rdi = rs_c (elements) */ \
+	lea( mem( , rdi, 4 ), rdi ) /* rdi = rs_c * 4 bytes per float */ \
+    vmovups( xmm4, mem( rcx ) ) \
+    add( rdi, rcx ) /* advance rcx by one row stride */
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c
new file mode 100644
index 0000000000..1e0ce1c4c4
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64m.c
@@ -0,0 +1,1729 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#include "bli_gemmsup_rd_zen_s6x64.h"
+
+#define NR 64   // column count handled by the full 6x64m kernel; n0 % NR selects the narrower edge kernels
+
+
+void bli_sgemmsup_rd_zen_asm_6x64m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{   // Computes C := beta*C + alpha*A*B; each inner step forms a 6x4 tile of C from k-length dot products.
+    uint64_t n_left = n0 % NR;      // nonzero when n0 is not a multiple of NR (n0 is expected to be <= NR)
+
+    // First check whether this is an edge case in the n dimension (n0 % NR != 0).
+    // If so, dispatch narrower 6x?m kernels (and gemv for one column), then return.
+    if ( n_left )
+    {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        if ( 48 <= n_left )
+        {
+            const dim_t nr_cur = 48;
+
+            bli_sgemmsup_rd_zen_asm_6x48m_avx512
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+
+            bli_sgemmsup_rd_zen_asm_6x32m_avx512
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+
+            bli_sgemmsup_rd_zen_asm_6x16m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+
+            bli_sgemmsup_rd_zen_asm_6x8m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+
+            bli_sgemmsup_rd_zen_asm_6x4m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_6x2m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {   // one column left: handled as a matrix-vector product (m0 x k0 times the last column of B)
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, m0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+        return;
+    }
+
+    uint64_t k_iter64 = k0 / 64;            // passes of the 4x-unrolled, 16-float-wide k loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;      // passes of the 2x-unrolled, 16-float-wide k loop (0 or 1)
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;       // passes of the 8-float-wide k loop
+    uint64_t k_left1  = k_left32 % 8;       // remaining k elements, handled one at a time
+
+    uint64_t m_iter = m0 / 6;               // number of full 6-row panels
+    uint64_t m_left = m0 % 6;               // leftover rows, handled after the asm block
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* Produce MR x NR outputs: outer loop over 4-column groups, inner loop over 6-row panels */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a (value unused: r10 is overwritten with 3*rs_a in the ii loop)
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 60 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1 ), rsi )         // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj floats (cs_c is not read here; unit column stride is assumed)
+    
+    lea(mem(   , r15, 1 ), rsi)         // rsi = jj (column offset, a multiple of 4)
+    imul( r9, rsi )                     // rsi *= cs_b (in bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b;
+
+
+    mov( var( m_iter ), r11 )           // ii = m_iter;
+    label( .SLOOP3X4I )                 // LOOP OVER ii = [ m_iter ... 1 ]
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG                            // zero all accumulators for this 6x4 tile
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )             // 4 unrolled steps of 16 floats each (64 k elements per pass)
+
+    // ITER 0
+    // load 16 k-elements from each of the 6 rows of A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load 16 k-elements from each of 4 columns of B in turn (stride cs_b)
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )             // 2 unrolled steps of 16 floats each (32 k elements per pass)
+
+    // ITER 0
+    // load 16 k-elements from each of the 6 rows of A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )        // load k_iter8
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )              // 8 k elements per pass via 256-bit loads
+    // ITER 0
+    // load 8 k-elements from each row of A (VEX ymm loads clear the upper half of the zmm register)
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    vmovups( ( rax, rdi, 1 ), ymm5 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )        // load k_left1
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )              // one k element per pass via scalar loads
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    vmovss( ( rax, rdi, 1 ), xmm5 )
+    add( imm( 1*4 ), rax )
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 1*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )       // beta == 0: store without reading C
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    // The horizontal sum of each ZMM register has the result for a single
+    // element of the C Matrix.
+    // ZMM_TO_YMM adds the upper half of each source ZMM register to its lower
+    // half and writes the 8-float partial sums to the destination YMM register
+    // given in the second group of macro arguments.
+    // ymm(dst) = lo(zmm(src)) + hi(zmm(src)), e.g. for src = 8, dst = 4:
+    // zmm8 = z0 z1 z2 z3 z4 z5 z6 z7 z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm0 = z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm8 = z0 z1  z2  z3  z4  z5  z6  z7
+    // ymm4 = ymm8 + ymm0
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )   // rows 0-2 of the tile: accumulators zmm8-zmm19
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    // Accumulates the results by horizontally adding the YMM registers,
+    // and having the final result (4 columns of one row) in xmm4, xmm5, xmm6.
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE     				// Scaling the result of A*B with alpha
+
+    C_STOR       					// Storing result to C
+
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )   // rows 3-5 of the tile: accumulators zmm20-zmm31
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE     				// Scaling the result of A*B with alpha
+
+    C_STOR       					// Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ ) 
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )   // rows 0-2 of the tile: accumulators zmm8-zmm19
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE     				// Scaling the result of A*B with alpha
+
+    C_STOR_BZ       				// Storing result to C
+
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )   // rows 3-5 of the tile: accumulators zmm20-zmm31
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE     				// Scaling the result of A*B with alpha
+
+    C_STOR_BZ       				// Storing result to C
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 6*rs_c
+
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 6*rs_a
+
+    dec( r11 )
+    jne( .SLOOP3X4I )                   // iterate again if ii != 0.
+
+    add( imm(  4 ), r15 )               // jj += 4 (next group of 4 columns)
+    cmp( imm( 64 ), r15 )               // 64 columns in total
+    jl( .SLOOP3X4J )                    // iterate again if jj < 64.
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25",
+      "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle the m0 % 6 leftover rows, if any, with the matching ?x64 kernel.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 64;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;   // index of the first leftover row
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 5 == m_left )
+        {
+            dim_t mr_cur = 5;
+            bli_sgemmsup_rd_zen_asm_5x64_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }                
+        if ( 4 == m_left )
+        {
+            const dim_t mr_cur = 4;
+
+            bli_sgemmsup_rd_zen_asm_4x64_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 3 == m_left )
+        {
+            const dim_t mr_cur = 3;
+
+            bli_sgemmsup_rd_zen_asm_3x64_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x64_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x64_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_6x48m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * rs_b
+
+
+    mov( imm( 0 ), r15 )                // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 1 ... ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1), rsi )
+    imul( imm( 1*4 ), rsi )
+    lea( mem( r12, rsi, 1 ), r12 )      // c += r15 * cs_c
+    
+    lea( mem(   , r15, 1 ), rsi )       // rsi = r15 = 4*jj;
+    imul( r9, rsi )                     // rsi *= cs_b;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rbx = b + 4*jj*cs_b;
+
+
+    mov( var( m_iter ), r11 )           // ii = m_iter;
+    label( .SLOOP3X4I )                 // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+    lea( mem( r14 ), rax )              // load c to rcx
+    lea( mem( r12 ), rcx )              // load a to rax
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_b
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_b
+    
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    vmovups( ( rax, rdi, 1 ), ymm5 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    vmovss( ( rax, rdi, 1 ), xmm5 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_b = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    // The horizontal sum of each ZMM register has the result for a single
+    // element of the C Matrix.
+    // ZMM_TO_YMM adds the upper half of ZMM registers to the lower half of
+    // the respective ZMM registers, thus having the result in the lower half of
+    // ZMM registers which is equivalent to its respective YMM counterpart.
+    // ymm = lo(zmm) + hi(zmm)
+    // zmm8 = z0 z1 z2 z3 z4 z5 z6 z7 z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm0 = z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm8 = z0 z1  z2  z3  z4  z5  z6  z7
+    // ymm0 = ymm0 + ymm8
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    // Accumulates the results by horizontally adding the YMM registers,
+    // and having the final result in xmm registers.
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 6*rs_c
+
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 6*rs_a
+
+    dec( r11 )
+    jne( .SLOOP3X4I )                    // iterate again if ii != 0.
+
+    add( imm(  4 ), r15 )
+    cmp( imm( 48 ), r15 )
+    jl( .SLOOP3X4J )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25",
+      "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 48;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;
+
+        if ( 5 == m_left )
+        {
+            dim_t mr_cur = 5;
+            bli_sgemmsup_rd_zen_asm_5x48_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }                
+        if ( 4 == m_left )
+        {
+            const dim_t mr_cur = 4;
+
+            bli_sgemmsup_rd_zen_asm_4x48_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 3 == m_left )
+        {
+            const dim_t mr_cur = 3;
+
+            bli_sgemmsup_rd_zen_asm_3x48_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x48_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x48_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_6x32m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // trips of the 64-wide k loop (4 x 16 floats)
+    uint64_t k_left64 = k0 % 64;        // k remainder after the 64-wide loop
+    uint64_t k_iter32 = k_left64 / 32;  // trips of the 32-wide k loop (2 x 16 floats)
+    uint64_t k_left32 = k_left64 % 32;  // k remainder after the 32-wide loop
+    uint64_t k_iter8  = k_left32 / 8;   // trips of the 8-wide (ymm) k loop
+    uint64_t k_left1  = k_left32 % 8;   // trips of the scalar (one float) k loop
+
+    uint64_t m_iter = m0 / 6;           // number of full 6-row panels processed by the asm
+    uint64_t m_left = m0 % 6;           // leftover rows, dispatched to fixed-size kernels below
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+    // Base pointers; the asm reloads these at the top of every jj iteration.
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( m_iter == 0 ) goto consider_edge_cases; // fewer than 6 rows: only the edge kernels run
+
+    /* Produce MR x NR (6 x 32) blocks of C, computed as 6 x 4 tiles */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= 4; r10 is overwritten with 3*rs_a in the ii loop
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+
+    mov( imm(0), r15 )                  // jj = 0;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ 0 4 ... 28 ]
+
+    mov( var( abuf ), r14 )             // load address of a
+    mov( var( bbuf ), rdx )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    lea( mem( , r15, 1), rsi )          // rsi = jj
+    imul( imm( 1*4 ), rsi )             // rsi = jj * sizeof(float)
+    lea( mem( r12, rsi, 1 ), r12 )      // c += jj * 4 bytes (column stride of C hard-coded to 1 float)
+    
+    lea( mem(   , r15, 1 ), rsi )       // rsi = r15 = jj;
+    imul( r9, rsi )                     // rsi *= cs_b (already scaled to bytes);
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = b + jj*cs_b (byte offset);
+
+
+    mov( var( m_iter ), r11 )           // ii = m_iter;
+    label( .SLOOP3X4I )                 // LOOP OVER ii = [ m_iter ... 2 1 ]
+
+    lea( mem( r14 ), rax )              // load a to rax
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( rdx ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+    
+    INIT_REG                            // NOTE(review): macro defined outside this view; presumably zeroes the accumulators -- confirm
+
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )         // no 64-wide trips: try the 32-wide loop
+
+
+    label( .K_LOOP_ITER64 )
+    // NOTE(review): VFMA6 is defined outside this view; presumably it FMAs zmm0-5 (A rows) with zmm6 (B column) into the six listed accumulators -- confirm
+    // ITER 0
+    // load 16 floats from each of the 6 rows of A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )             // a += 16 floats (byte step along k is hard-coded)
+
+    // load 16 floats from each of 4 columns of B (column offsets 0, cs_b, 2*cs_b, 3*cs_b)
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )             // b += 16 floats (byte step along k is hard-coded)
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )             // repeat until all k_iter64 trips are done
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )          // no 32-wide trips: try the 8-wide loop
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load 16 floats from each of the 6 rows of A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load 16 floats from each of 4 columns of B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var( k_iter8 ), rsi )        // load k_iter8
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )          // no 8-wide trips: try the scalar loop
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load 8 floats from each of the 6 rows of A (ymm-width loads)
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    vmovups( ( rax, rdi, 1 ), ymm5 )
+    add( imm( 8*4 ), rax )              // a += 8 floats
+
+    // load 8 floats from each of 4 columns of B (ymm-width loads)
+    vmovups(        ( rbx ), ymm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var( k_left1 ), rsi )        // load k_left1
+    test( rsi, rsi )
+    je( .POST_ACCUM )                 // no scalar remainder: go reduce and store
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )     // one float from each of the 6 rows of A
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    vmovss( ( rax, rdi, 1 ), xmm5 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_a = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )      // one float from each of 4 columns of B
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )   // xmm0 = beta
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )       // beta == 0: take the store path that uses C_STOR_BZ
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    // The horizontal sum of each ZMM register has the result for a single
+    // element of the C Matrix.
+    // ZMM_TO_YMM adds the upper half of ZMM registers to the lower half of
+    // the respective ZMM registers, thus having the result in the lower half of
+    // ZMM registers which is equivalent to its respective YMM counterpart.
+    // ymm = lo(zmm) + hi(zmm)
+    // zmm8 = z0 z1 z2 z3 z4 z5 z6 z7 z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm0 = z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm8 = z0 z1  z2  z3  z4  z5  z6  z7
+    // ymm0 = ymm0 + ymm8
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    // Accumulates the results by horizontally adding the YMM registers,
+    // and having the final result in xmm registers.
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+    // Same reduce/scale/store sequence for the second accumulator set (zmm20-zmm31).
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    jmp( .SDONE )                   // skip the beta == 0 path
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+    // Same reduce/scale/store sequence for the second accumulator set (zmm20-zmm31).
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    label( .SDONE )
+
+    mov( var( rs_c ), rdi )             // load rs_c (rdi is recomputed as 5*rs_a at the top of the ii loop)
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r12, rdi, 2 ), r12 )
+    lea( mem( r12, rdi, 4 ), r12 )      // c_ii = r12 += 6*rs_c
+
+    lea( mem( r14, r8,  2 ), r14 )
+    lea( mem( r14, r8,  4 ), r14 )      // a_ii = r14 += 6*rs_a
+
+    dec( r11 )
+    jne( .SLOOP3X4I )                    // iterate again if ii != 0.
+
+    add( imm(  4 ), r15 )                // jj += 4;
+    cmp( imm( 32 ), r15 )                // compare jj with NR = 32
+    jl( .SLOOP3X4J )                     // iterate again if jj < 32.
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [m_iter]   "m" (m_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25",
+      "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle the m_left (1..5) leftover rows, if any, with a fixed-size ?x32 kernel.
+    if ( m_left )
+    {
+        const dim_t      nr_cur = 32;
+        const dim_t      i_edge = m0 - ( dim_t )m_left;  // index of the first leftover row
+
+        float* restrict cij = c + i_edge*rs_c;           // first leftover row of C
+        float* restrict bj  = b;
+        float* restrict ai  = a + i_edge*rs_a;           // first leftover row of A
+
+        if ( 5 == m_left )                               // the branches below are mutually exclusive
+        {
+            dim_t mr_cur = 5;
+            bli_sgemmsup_rd_zen_asm_5x32_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }                
+        if ( 4 == m_left )
+        {
+            const dim_t mr_cur = 4;
+
+            bli_sgemmsup_rd_zen_asm_4x32_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 3 == m_left )
+        {
+            const dim_t mr_cur = 3;
+
+            bli_sgemmsup_rd_zen_asm_3x32_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 2 == m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x32_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+        if ( 1 == m_left )
+        {
+            const dim_t mr_cur = 1;
+
+            bli_sgemmsup_rd_zen_asm_1x32_avx512
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+        }
+    }
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c
new file mode 100644
index 0000000000..145d3b5201
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rd_zen_s6x64n.c
@@ -0,0 +1,1405 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#include "bli_gemmsup_rd_zen_s6x64.h"
+
+#define MR 6
+
+/*
+   sgemm sup "rd" kernel computing a 6 x n0 tile of C with AVX-512.
+   Each element of C is accumulated as a dot product along k: the k loops
+   step A and B by 4 bytes per element (k is treated as contiguous in both
+   operands) and C is stepped 16 bytes per group of 4 columns.
+   m0 % 6 != 0 is delegated to the 3x64n/2x64n kernels and bli_sgemv_ex;
+   the n0 % 4 trailing columns go to the 6x2 kernel and bli_sgemv_ex.
+*/
+void bli_sgemmsup_rd_zen_asm_6x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t m_left = m0 % 6;      // m0 is expected to be m0<=MR
+
+    // First check whether this is an edge case in the m dimension.
+    // If so, dispatch the smaller ?x64n kernels (and gemv), as needed.
+    if ( m_left )
+    {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        if ( 3 <= m_left )
+        {
+            const dim_t mr_cur = 3;
+
+            bli_sgemmsup_rd_zen_asm_3x64n_avx512
+            (
+              conja, conjb, mr_cur, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr_cur*rs_c0;
+            ai  += mr_cur*rs_a0;
+            m_left -= mr_cur;
+        }
+
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x64n_avx512
+            (
+              conja, conjb, mr_cur, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += mr_cur*rs_c0;
+            ai  += mr_cur*rs_a0;
+            m_left -= mr_cur;
+        }
+
+        if ( 1 == m_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_TRANSPOSE, conja, k0, n0,
+              alpha, bj, rs_b0, cs_b0, ai, cs_a0,
+              beta, cij, cs_c0, cntx, NULL
+            );
+        }
+        return;
+    }
+
+    uint64_t k_iter64 = k0 / 64;        // passes of the 4x16-float loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of the 2x16-float loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of the 8-float (ymm) loop
+    uint64_t k_left1  = k_left32 % 8;   // scalar (vmovss) tail iterations
+
+    uint64_t n_iter = n0 / 4;           // full groups of 4 columns
+    uint64_t n_left = n0 % 4;           // columns left for the edge code
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    // Register roles in the asm block below (strides are in bytes):
+    //   r8 = rs_a, r10 = 3*rs_a, rdi = 5*rs_a  -> row offsets into A
+    //   r9 = cs_b, r13 = 3*cs_b                -> column offsets into B
+    //   rdx/r14/r12 = A/B/C bases, copied to rax/rbx/rcx for each jj
+    //   rsi = k-loop counter, r15 = remaining groups of 4 columns
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= 4 (r10 is reused for 3*rs_a below)
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+    mov( imm( 0 ), r11 )                // ii = 0;
+    label( .SLOOP3X4I )                 // ii entry point; nothing here jumps back to it
+
+    mov( var( abuf ), rdx )             // load address of a
+    mov( var( bbuf ), r14 )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(dt) => rs_c *= 4
+
+    lea( mem( , r11, 1 ), rsi )         // rsi = ii;
+    imul( rdi, rsi )                    // rsi *= rs_c;
+    lea( mem( r12, rsi, 1 ), r12 )      // r12 = c + ii*rs_c;
+
+    lea( mem(   , r11, 1 ), rsi)        // rsi = ii;
+    imul( r8, rsi )                     // rsi *= rs_a;
+    lea( mem( rdx, rsi, 1 ), rdx )      // rdx = a + ii*rs_a;
+
+    mov( var( n_iter ), r15 )           // jj = n_iter;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ n_iter ... 1 0 ]
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( rdx ), rax )              // load a to rax
+    lea( mem( r14 ), rbx )              // load b to rbx
+
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )         // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    vmovups( ( rax, r10, 1 ), zmm3 )
+    vmovups( ( rax,  r8, 4 ), zmm4 )
+    vmovups( ( rax, rdi, 1 ), zmm5 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    label( .CONSIDER_K_ITER_8 )
+    mov( var(k_iter8), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0 (8 floats per row, loaded into ymm registers)
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    vmovups( ( rax, r10, 1 ), ymm3 )
+    vmovups( ( rax,  r8, 4 ), ymm4 )
+    vmovups( ( rax, rdi, 1 ), ymm5 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 8*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var(k_left1), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+    label( .K_LOOP_LEFT1 )
+
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    vmovss( ( rax, r10, 1 ), xmm3 )
+    vmovss( ( rax,  r8, 4 ), xmm4 )
+    vmovss( ( rax, rdi, 1 ), xmm5 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_a = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA6(  8,  9, 10, 20, 21, 22 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA6( 11, 12, 13, 23, 24, 25 )
+
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA6( 14, 15, 16, 26, 27, 28 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA6( 17, 18, 19, 29, 30, 31 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    // The horizontal sum of each ZMM register has the result for a single
+    // element of the C Matrix.
+    // ZMM_TO_YMM adds the upper half of ZMM registers to the lower half of
+    // the respective ZMM registers, thus having the result in the lower half of
+    // ZMM registers which is equivalent to its respective YMM counterpart.
+    // ymm = lo(zmm) + hi(zmm)
+    // zmm8 = z0 z1 z2 z3 z4 z5 z6 z7 z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm0 = z8 z9 z10 z11 z12 z13 z14 z15
+    // ymm8 = z0 z1  z2  z3  z4  z5  z6  z7
+    // ymm0 = ymm0 + ymm8
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    // Accumulates the results by horizontally adding the YMM registers,
+    // and having the final result in xmm registers.
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    jmp( .SDONE )
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    ZMM_TO_YMM( 20, 21, 22, 23,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 24, 25, 26, 27,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 28, 29, 30, 31, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    label( .SDONE )
+
+    add( imm(4*4), r12 )               // c += 4 floats (16 bytes)
+    lea(mem(r14, r9,  4), r14)         // b += 4*cs_b
+
+    dec( r15 )
+    jne( .SLOOP3X4J )                    // iterate again if jj != 0.
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [n_iter]   "m" (n_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "ymm20", "ymm21", "ymm22", "ymm23", "ymm24", "ymm25",
+      "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 6;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_6x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+/*
+   sgemm sup "rd" kernel computing a 3 x n0 tile of C with AVX-512.
+   Each element of C is accumulated as a dot product along k: the k loops
+   step A and B by 4 bytes per element (k is treated as contiguous in both
+   operands) and C is stepped 16 bytes per group of 4 columns.
+   m0 is not used to size the work; three rows are always processed.
+   The n0 % 4 trailing columns go to the 3x2 kernel and bli_sgemv_ex.
+*/
+void bli_sgemmsup_rd_zen_asm_3x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;        // passes of the 4x16-float loop
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;  // passes of the 2x16-float loop
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;   // passes of the 8-float (ymm) loop
+    uint64_t k_left1  = k_left32 % 8;   // scalar (vmovss) tail iterations
+
+    uint64_t n_iter = n0 / 4;           // full groups of 4 columns
+    uint64_t n_left = n0 % 4;           // columns left for the edge code
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    // Register roles in the asm block below (strides are in bytes):
+    //   r8 = rs_a (row offsets into A), r9 = cs_b, r13 = 3*cs_b (columns of B)
+    //   rdx/r14/r12 = A/B/C bases, copied to rax/rbx/rcx for each jj
+    //   rsi = k-loop counter, r15 = remaining groups of 4 columns
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load cs_b
+    lea( mem( , r9, 4 ), r9 )           // cs_b *= sizeof(dt) => cs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= 4 (r10 is reused for 3*rs_a below)
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * cs_b
+
+    mov( var( abuf ), rdx )             // load address of a
+    mov( var( bbuf ), r14 )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+    mov( var( n_iter ), r15 )           // jj = n_iter;
+    label( .SLOOP3X4J )                 // LOOP OVER jj = [ n_iter ... 1 0 ]
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( rdx ), rax )              // load a to rax
+    lea( mem( r14 ), rbx )              // load b to rbx
+
+    // NOTE(review): the 3-row loads below never reference r10/rdi; confirm
+    // that no macro relies on them before dropping these two instructions.
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+    INIT_REG
+    mov( var( k_iter64 ), rsi )       // load k_iter64
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter32
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    vmovups( ( rax,  r8, 2 ), zmm2 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    label( .CONSIDER_K_ITER_8 )
+    mov( var(k_iter8), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0 (8 floats per row, loaded into ymm registers)
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    vmovups( ( rax,  r8, 2 ), ymm2 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 8*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var(k_left1), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+    label( .K_LOOP_LEFT1 )
+
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    vmovss( ( rax,  r8, 2 ), xmm2 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_a = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA3( 8, 9, 10 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA3( 11, 12, 13 )
+
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA3( 14, 15, 16 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA3( 17, 18, 19 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR                          // Storing result to C
+
+    jmp( .SDONE )
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+    ZMM_TO_YMM(  8,  9, 10, 11,  4,  5,  6,  7 )
+    ZMM_TO_YMM( 12, 13, 14, 15,  8,  9, 10, 11 )
+    ZMM_TO_YMM( 16, 17, 18, 19, 12, 13, 14, 15 )
+
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+    ACCUM_YMM( 6, 9, 12, 15, 6 )
+
+    ALPHA_SCALE                     // Scaling the result of A*B with alpha
+
+    C_STOR_BZ                       // Storing result to C
+
+    label( .SDONE )
+
+    add( imm( 4*4 ), r12 )          // c += 4 floats (16 bytes)
+
+    lea( mem( r14, r9,  4 ), r14)   // b += 4*cs_b
+
+    dec( r15 )
+    jne( .SLOOP3X4J )               // iterate again if jj != 0.
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [n_iter]   "m" (n_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm16", "ymm17", "ymm18", "ymm19",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 3;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_3x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
+
+void bli_sgemmsup_rd_zen_asm_2x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter64 = k0 / 64;
+    uint64_t k_left64 = k0 % 64;
+    uint64_t k_iter32 = k_left64 / 32;
+    uint64_t k_left32 = k_left64 % 32;
+    uint64_t k_iter8  = k_left32 / 8;
+    uint64_t k_left1  = k_left32 % 8;
+
+    uint64_t n_iter = n0 / 4;
+    uint64_t n_left = n0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )              // load rs_a
+    lea( mem( , r8, 4 ), r8 )           // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( cs_b ), r9 )              // load rs_b
+    lea( mem( , r9, 4 ), r9 )           // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )             // load cs_a
+    lea( mem( , r10, 4 ), r10 )         // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r9, r9, 2 ), r13 )        // r13 = 3 * rs_b
+
+    mov( var( abuf ), rdx )             // load address of a
+    mov( var( bbuf ), r14 )             // load address of b
+    mov( var( cbuf ), r12 )             // load address of c
+
+
+    mov( var( n_iter ), r15 )           // jj = m_iter;
+    label( .SLOOP3X4J )                 // LOOP OVER ii = [ m_iter ... 1 0 ]
+
+    lea( mem( r12 ), rcx )              // load c to rcx
+    lea( mem( rdx ), rax )              // load a to rax
+    lea( mem( r14 ), rbx )              // load b to rbx
+    
+    lea( mem(  r8, r8, 2 ), r10 )       // r10 = 3 * rs_a
+    lea( mem( r10, r8, 2 ), rdi )       // rdi = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( k_iter64 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_32 )
+
+
+    label( .K_LOOP_ITER64 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 2
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 3
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER64 )
+
+
+    label( .CONSIDER_K_ITER_32 )
+
+    mov( var( k_iter32 ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSIDER_K_ITER_8 )
+
+
+    label( .K_LOOP_ITER32 )
+
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    // ITER 1
+    vmovups(         ( rax ), zmm0 )
+    vmovups( ( rax,  r8, 1 ), zmm1 )
+    add( imm( 16*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), zmm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), zmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), zmm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), zmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 16*4 ), rbx )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER32 )
+
+    
+    label( .CONSIDER_K_ITER_8 )
+    mov( var(k_iter8), rsi )
+    test( rsi, rsi )
+    je( .CONSIDER_K_LEFT_1 )
+
+
+    label( .K_LOOP_ITER8 )
+    // ITER 0
+    // load row from A
+    vmovups(         ( rax ), ymm0 )
+    vmovups( ( rax,  r8, 1 ), ymm1 )
+    add( imm( 8*4 ), rax )
+
+    // load column from B
+    vmovups(        ( rbx ), ymm6 )
+    VFMA2( 8, 9 )
+
+    vmovups( ( rbx, r9, 1 ), ymm6 )
+    VFMA2( 11, 12 )
+    
+    vmovups( ( rbx, r9, 2 ), ymm6 )
+    VFMA2( 14, 15 )
+
+    vmovups( ( rbx, r13, 1 ), ymm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 8*4 ), rbx )    
+
+    dec( rsi )
+    jne( .K_LOOP_ITER8 )
+
+
+    label( .CONSIDER_K_LEFT_1 )
+    mov( var(k_left1), rsi )
+    test( rsi, rsi )
+    je( .POST_ACCUM )
+
+
+    label( .K_LOOP_LEFT1 )
+    
+    vmovss(         ( rax ), xmm0 )
+    vmovss( ( rax,  r8, 1 ), xmm1 )
+    add( imm( 1*4 ), rax )                 // a += 1*cs_b = 1*4;
+
+    vmovss(        ( rbx ), xmm6 )
+    VFMA2( 8, 9 )
+
+    vmovss( ( rbx, r9, 1 ), xmm6 )
+    VFMA2( 11, 12 )
+    
+    vmovss( ( rbx, r9, 2 ), xmm6 )
+    VFMA2( 14, 15 )
+
+    vmovss( ( rbx, r13, 1 ), xmm6 )
+    VFMA2( 17, 18 )
+
+    add( imm( 1*4 ), rbx )                 // b += 1*rs_b = 1*4;
+
+    dec( rsi )
+    jne( .K_LOOP_LEFT1 )
+
+
+    label( .POST_ACCUM )
+
+    mov( var( beta ), rax )         // load address of beta
+    vbroadcastss( ( rax ), xmm0 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm0 )          // check if beta = 0
+    je( .POST_ACCUM_STOR_BZ )
+
+    
+    // Accumulating & storing the results when beta != 0
+    label( .POST_ACCUM_STOR )
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                    // Scaling the result of A*B with alpha
+
+    C_STOR2                         // Storing result to C
+
+    jmp( .SDONE )
+
+
+    // Accumulating & storing the results when beta == 0
+    label( .POST_ACCUM_STOR_BZ )
+
+
+    ZMM_TO_YMM(  8,  9, 11, 12,  4,  5,  7,  8 )
+    ZMM_TO_YMM( 14, 15, 17, 18, 10, 11, 13, 14 )
+    
+    ACCUM_YMM( 4, 7, 10, 13, 4 )
+    ACCUM_YMM( 5, 8, 11, 14, 5 )
+
+    ALPHA_SCALE2                    // Scaling the result of A*B with alpha
+
+    C_STOR_BZ2                      // Storing result to C
+
+
+    label( .SDONE )
+
+    add( imm(4*4), r12 )
+    lea(mem(r14, r9,  4), r14)      // a_ii = r14 += 3*rs_a
+
+    dec( r15 )
+    jne( .SLOOP3X4J )               // iterate again if jj != 0.
+
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter64] "m" (k_iter64),
+      [k_left64] "m" (k_left64),
+      [k_iter32] "m" (k_iter32),
+      [k_left32] "m" (k_left32),
+      [k_iter8]  "m" (k_iter8),
+      [k_left1]  "m" (k_left1),
+      [a]        "m" (a),
+      [rs_a]     "m" (rs_a),
+      [cs_a]     "m" (cs_a),
+      [b]        "m" (b),
+      [rs_b]     "m" (rs_b),
+      [cs_b]     "m" (cs_b),
+      [alpha]    "m" (alpha),
+      [beta]     "m" (beta),
+      [c]        "m" (c),
+      [rs_c]     "m" (rs_c),
+      [cs_c]     "m" (cs_c),
+      [n0]       "m" (n0),
+      [m0]       "m" (m0),
+      [n_iter]   "m" (n_iter),
+      [abuf]     "m" (abuf),
+      [bbuf]     "m" (bbuf),
+      [cbuf]     "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6",
+      "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
+      "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13",
+      "ymm14", "ymm15", "ymm17", "ymm18",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t      mr_cur = 2;
+        const dim_t      j_edge = n0 - ( dim_t )n_left;
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict ai  = a;
+        float* restrict bj  = b + j_edge*cs_b;
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+
+            bli_sgemmsup_rd_zen_asm_2x2
+            (
+              conja, conjb, mr_cur, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+        if ( 1 == n_left )
+        {
+            bli_sgemv_ex
+            (
+              BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+              beta, cij, rs_c0, cntx, NULL
+            );
+        }
+    }
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c
new file mode 100644
index 0000000000..a69d016b38
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.c
@@ -0,0 +1,1857 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#include "bli_gemmsup_rv_zen_s6x64.h"
+
+void bli_sgemmsup_rv_zen_asm_5x48_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // number of passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Produce the MRxNR output tile (5x48, per the kernel name): A*B is     */
+    // accumulated in zmm registers, scaled by alpha, then handed to the C-update macros.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG                        // macro defined elsewhere; presumably zeroes the accumulators - TODO confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // no full 4x passes; go straight to the k_left loop
+
+
+    // Each pass of the k-loop is unrolled 4x: rbx advances by rs_b and rax by cs_a per ITER.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 3 zmm vectors (48 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA3( 5, 24, 25, 26 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 3 zmm vectors (48 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA3( 5, 24, 25, 26 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 3 zmm vectors (48 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA3( 5, 24, 25, 26 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 3 zmm vectors (48 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA3( 5, 24, 25, 26 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 3 zmm vectors (48 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA3( 5, 24, 25, 26 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7), one row of accumulators per macro call.
+    ALPHA_SCALE3( 7,  8,  9, 10 )
+    ALPHA_SCALE3( 7, 12, 13, 14 )
+    ALPHA_SCALE3( 7, 16, 17, 18 )
+    ALPHA_SCALE3( 7, 20, 21, 22 )
+    ALPHA_SCALE3( 7, 24, 25, 26 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    // UPDATE_C3 is defined elsewhere; it is passed beta (zmm4) and one row of accumulators.
+    UPDATE_C3( 4,  8,  9, 10 )
+    UPDATE_C3( 4, 12, 13, 14 )
+    UPDATE_C3( 4, 16, 17, 18 )
+    UPDATE_C3( 4, 20, 21, 22 )
+    UPDATE_C3( 4, 24, 25, 26 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(dt) => cs_c * 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_4X16(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 = 12 * cs_c (in bytes)
+    TRANSPOSE_4X16(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 = 12 * cs_c (in bytes)
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 (rcx is reloaded right below)
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rcx, rdi, 4 ), rcx )      // rcx = c + 4 * rs_c elements (row index 4 of c)
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c *= sizeof(dt) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16( 24 )
+    UPDATE_C_1X16( 25 )
+    UPDATE_C_1X16( 26 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    // Beta-zero variant: the macro is passed only the accumulators (no beta register).
+    UPDATE_C3_BZ(  8,  9, 10 )
+    UPDATE_C3_BZ( 12, 13, 14 )
+    UPDATE_C3_BZ( 16, 17, 18 )
+    UPDATE_C3_BZ( 20, 21, 22 )
+    UPDATE_C3_BZ( 24, 25, 26 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_4X16_BZ(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 = 12 * cs_c (in bytes)
+    TRANSPOSE_4X16_BZ(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 = 12 * cs_c (in bytes)
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 (rcx is reloaded right below)
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rcx, rdi, 4 ), rcx )      // rcx = c + 4 * rs_c elements (row index 4 of c)
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c *= sizeof(dt) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16_BZ( 24 )
+    UPDATE_C_1X16_BZ( 25 )
+    UPDATE_C_1X16_BZ( 26 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_5x32_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // number of passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Produce the MRxNR output tile (5x32, per the kernel name): A*B is     */
+    // accumulated in zmm registers, scaled by alpha, then handed to the C-update macros.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG                        // macro defined elsewhere; presumably zeroes the accumulators - TODO confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // no full 4x passes; go straight to the k_left loop
+
+
+    // Each pass of the k-loop is unrolled 4x: rbx advances by rs_b and rax by cs_a per ITER.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 2 zmm vectors (32 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA2( 5, 24, 25 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 2 zmm vectors (32 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA2( 5, 24, 25 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 2 zmm vectors (32 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA2( 5, 24, 25 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 2 zmm vectors (32 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA2( 5, 24, 25 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 2 zmm vectors (32 contiguous floats) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 5 elements of A (one per row, stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA2( 5, 24, 25 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7), one row of accumulators per macro call.
+    ALPHA_SCALE2( 7,  8,  9 )
+    ALPHA_SCALE2( 7, 12, 13 )
+    ALPHA_SCALE2( 7, 16, 17 )
+    ALPHA_SCALE2( 7, 20, 21 )
+    ALPHA_SCALE2( 7, 24, 25 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C2( 4,  8,  9 )          // macro defined elsewhere; passed beta (zmm4) and one row of accumulators
+    UPDATE_C2( 4, 12, 13 )
+    UPDATE_C2( 4, 16, 17 )
+    UPDATE_C2( 4, 20, 21 )
+    UPDATE_C2( 4, 24, 25 )
+    jmp(.SDONE)                     // jump to the end
+
+    label( .SCOLSTORED )
+
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(dt) => cs_c * 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 = 12 * cs_c (in bytes)
+    TRANSPOSE_4X16( 9, 13, 17, 21 )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rcx, rdi, 4 ), rcx )      // rcx = c + 4 * rs_c elements (row index 4 of c)
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c *= sizeof(dt) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16( 24 )
+    UPDATE_C_1X16( 25 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    // Beta-zero variant: the macro is passed only the accumulators (no beta register).
+    UPDATE_C2_BZ(  8,  9 )
+    UPDATE_C2_BZ( 12, 13 )
+    UPDATE_C2_BZ( 16, 17 )
+    UPDATE_C2_BZ( 20, 21 )
+    UPDATE_C2_BZ( 24, 25 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 = 12 * cs_c (in bytes)
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 (rcx is reloaded right below)
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem(    , rdi, 4 ), rdi )      // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rcx, rdi, 4 ), rcx )      // rcx = c + 4 * rs_c elements (row index 4 of c)
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem(    , rdi, 4 ), rdi )      // rdi = cs_c *= sizeof(dt) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16_BZ( 24 )
+    UPDATE_C_1X16_BZ( 25 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_5x16_avx512
+    (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 1 row from B matrix.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 5 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 1
+    // Load 1 row from B matrix.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 5 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 1 row from B matrix.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 5 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 1 row from B matrix.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 5 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 1 row from B matrix.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 5 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE1( 7,  8 )
+    ALPHA_SCALE1( 7, 12 )
+    ALPHA_SCALE1( 7, 16 )
+    ALPHA_SCALE1( 7, 20 )
+    ALPHA_SCALE1( 7, 24 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C1( 4,  8 )
+    UPDATE_C1( 4, 12 )
+    UPDATE_C1( 4, 16 )
+    UPDATE_C1( 4, 20 )
+    UPDATE_C1( 4, 24 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // rdi += rdi * 2 => rdi = 3 * rs_c
+
+    TRANSPOSE_4X16(  8, 12, 16, 20 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rcx, rdi, 4 ), rcx )
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c *= sizeof(dt) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    UPDATE_C_1X16( 24 )
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C1_BZ( 8 )
+    UPDATE_C1_BZ( 12 )
+    UPDATE_C1_BZ( 16 )
+    UPDATE_C1_BZ( 20 )
+    UPDATE_C1_BZ( 24 )
+    jmp(.SDONE)
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rcx, rdi, 4 ), rcx )
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c *= sizeof(dt) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    UPDATE_C_1X16_BZ( 24 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_3x48_avx512 // single-precision gemmsup kernel for a 3-row by 48-column (3 zmm wide) tile of C
+    (
+       conj_t              conja,          // not referenced in this function
+       conj_t              conjb,          // not referenced in this function
+       dim_t               m0,             // only listed as an asm operand; the asm text never reads it
+       dim_t               n0,             // only listed as an asm operand; the asm text never reads it
+       dim_t               k0,             // inner dimension: split into k0/4 unrolled passes plus k0%4 single steps
+       float*     restrict alpha,          // pointer to the scalar broadcast into zmm7 for the alpha-scaling step
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0, // A and its row/column strides, in elements
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0, // B and its row/column strides, in elements
+       float*     restrict beta,           // pointer to the scalar broadcast into zmm4 before C is updated
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0, // C and its row/column strides, in elements
+       auxinfo_t* restrict data,           // not referenced in this function
+       cntx_t*    restrict cntx            // not referenced in this function
+     )
+{
+    uint64_t k_iter = k0 / 4;   // trip count of the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;   // trip count of the remainder k-loop
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;    // never loaded by the asm below; B is read as contiguous vectors (presumably cs_b == 1 - TODO confirm)
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Accumulate A*B over k0, scale by alpha, then write the tile to C (beta == 0 / beta != 0, row / column storage). */
+    // NOTE(review): INIT_REG, VFMA3, ALPHA_SCALE3, UPDATE_C* and TRANSPOSE_* are macros defined outside this function; notes on them describe intent as named.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // r8 = rs_a * sizeof(float): row stride of A in bytes
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // r9 = rs_b * sizeof(float): row stride of B in bytes
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // r10 = cs_a * sizeof(float): column stride of A in bytes
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rdi = rs_c * sizeof(float): row stride of C in bytes
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes); not used by the instructions spelled out in this kernel
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes); not used by the instructions spelled out in this kernel
+
+    INIT_REG                        // presumably zeroes the zmm registers used as accumulators - macro defined elsewhere
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // k_iter == 0: skip the unrolled loop
+
+
+    // Each pass of the k-loop consumes 4 rows of B and 4 columns of A (ITER 0..3), one per rank-1 update.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 48 consecutive floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 3 elements of the current column of A (rows 0..2, rs_a apart) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+
+    add(  r9, rbx )                 // b += rs_b: next row of B
+    add( r10, rax )                 // a += cs_a: next column of A
+
+    // ITER 1
+    // Load 48 consecutive floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 48 consecutive floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 48 consecutive floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, no remainder steps; go to the post-accumulation stage.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )           // one rank-1 update per pass, k_left passes in total
+
+    // Load 48 consecutive floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7), one call per row of the tile.
+    ALPHA_SCALE3( 7,  8,  9, 10 )
+    ALPHA_SCALE3( 7, 12, 13, 14 )
+    ALPHA_SCALE3( 7, 16, 17, 18 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0f
+    vucomiss( xmm1, xmm4 )          // check if beta = 0 (compares the low lane only)
+    je( .SBETAZERO )                // jump to beta = 0 case; NOTE(review): ZF is also set on an unordered compare, so a NaN beta would take this branch - confirm intended
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )            // beta != 0, rs_c != 1
+
+    UPDATE_C3( 4,  8,  9, 10 )      // presumably row 0 of C updated using beta (zmm4) - macro defined elsewhere
+    UPDATE_C3( 4, 12, 13, 14 )      // row 1
+    UPDATE_C3( 4, 16, 17, 18 )      // row 2
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )            // beta != 0, rs_c == 1
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float): column stride of C in bytes
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_2X16(  8, 12 )            // rows 0-1, first 16-column vector
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes); the macro may also move rcx - TODO confirm against its definition
+    TRANSPOSE_2X16(  9, 13 )            // rows 0-1, second 16-column vector
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16( 10, 14 )            // rows 0-1, third 16-column vector
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx = c + 2 * rs_c (bytes): start of the third row of the tile
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16( 16 )                 // row 2, one 16-element accumulator per call
+    UPDATE_C_1X16( 17 )
+    UPDATE_C_1X16( 18 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )                // beta == 0, rs_c != 1
+
+    UPDATE_C3_BZ(  8,  9, 10 )          // presumably stores row 0 without reading C - macro defined elsewhere
+    UPDATE_C3_BZ( 12, 13, 14 )          // row 1
+    UPDATE_C3_BZ( 16, 17, 18 )          // row 2
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )                // beta == 0, rs_c == 1
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_2X16_BZ(  8, 12 )         // rows 0-1, first 16-column vector
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes); the macro may also move rcx - TODO confirm against its definition
+    TRANSPOSE_2X16_BZ(  9, 13 )         // rows 0-1, second 16-column vector
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16_BZ( 10, 14 )         // rows 0-1, third 16-column vector
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx = c + 2 * rs_c (bytes): start of the third row of the tile
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16_BZ( 16 )              // row 2, one 16-element accumulator per call
+    UPDATE_C_1X16_BZ( 17 )
+    UPDATE_C_1X16_BZ( 18 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (a, b, c, cs_b, n0 and m0 are listed but the asm text only reads abuf/bbuf/cbuf and the other strides)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list (the "memory" entry covers the stores to C)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_3x32_avx512 // single-precision gemmsup kernel for a 3-row by 32-column (2 zmm wide) tile of C
+     (
+       conj_t              conja,          // not referenced in this function
+       conj_t              conjb,          // not referenced in this function
+       dim_t               m0,             // only listed as an asm operand; the asm text never reads it
+       dim_t               n0,             // only listed as an asm operand; the asm text never reads it
+       dim_t               k0,             // inner dimension: split into k0/4 unrolled passes plus k0%4 single steps
+       float*     restrict alpha,          // pointer to the scalar broadcast into zmm7 for the alpha-scaling step
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0, // A and its row/column strides, in elements
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0, // B and its row/column strides, in elements
+       float*     restrict beta,           // pointer to the scalar broadcast into zmm4 before C is updated
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0, // C and its row/column strides, in elements
+       auxinfo_t* restrict data,           // not referenced in this function
+       cntx_t*    restrict cntx            // not referenced in this function
+     )
+{
+    uint64_t k_iter = k0 / 4;   // trip count of the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;   // trip count of the remainder k-loop
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;    // never loaded by the asm below; B is read as contiguous vectors (presumably cs_b == 1 - TODO confirm)
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Accumulate A*B over k0, scale by alpha, then write the tile to C (beta == 0 / beta != 0, row / column storage). */
+    // NOTE(review): INIT_REG, VFMA2, ALPHA_SCALE2, UPDATE_C* and TRANSPOSE_* are macros defined outside this function; notes on them describe intent as named.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // r8 = rs_a * sizeof(float): row stride of A in bytes
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // r9 = rs_b * sizeof(float): row stride of B in bytes
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // r10 = cs_a * sizeof(float): column stride of A in bytes
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rdi = rs_c * sizeof(float): row stride of C in bytes
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes); not used by the instructions spelled out in this kernel
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes); not used by the instructions spelled out in this kernel
+
+    INIT_REG                        // presumably zeroes the zmm registers used as accumulators - macro defined elsewhere
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // k_iter == 0: skip the unrolled loop
+
+
+    // Each pass of the k-loop consumes 4 rows of B and 4 columns of A (ITER 0..3), one per rank-1 update.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 32 consecutive floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 3 elements of the current column of A (rows 0..2, rs_a apart) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+
+    add(  r9, rbx )                 // b += rs_b: next row of B
+    add( r10, rax )                 // a += cs_a: next column of A
+
+    // ITER 1
+    // Load 32 consecutive floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 32 consecutive floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 32 consecutive floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, no remainder steps; go to the post-accumulation stage.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )           // one rank-1 update per pass, k_left passes in total
+
+    // Load 32 consecutive floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7), one call per row of the tile.
+    ALPHA_SCALE2( 7,  8,  9 )
+    ALPHA_SCALE2( 7, 12, 13 )
+    ALPHA_SCALE2( 7, 16, 17 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0f
+    vucomiss( xmm1, xmm4 )          // check if beta = 0 (compares the low lane only)
+    je( .SBETAZERO )                // jump to beta = 0 case; NOTE(review): ZF is also set on an unordered compare, so a NaN beta would take this branch - confirm intended
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )            // beta != 0, rs_c != 1
+
+    UPDATE_C2( 4,  8,  9 )          // presumably row 0 of C updated using beta (zmm4) - macro defined elsewhere
+    UPDATE_C2( 4, 12, 13 )          // row 1
+    UPDATE_C2( 4, 16, 17 )          // row 2
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )            // beta != 0, rs_c == 1
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float): column stride of C in bytes
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_2X16( 8, 12 )             // rows 0-1, first 16-column vector
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes); the macro may also move rcx - TODO confirm against its definition
+    TRANSPOSE_2X16( 9, 13 )             // rows 0-1, second 16-column vector
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx = c + 2 * rs_c (bytes): start of the third row of the tile
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16( 16 )                 // row 2, one 16-element accumulator per call
+    UPDATE_C_1X16( 17 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )                // beta == 0, rs_c != 1
+
+    UPDATE_C2_BZ(  8,  9 )              // presumably stores row 0 without reading C - macro defined elsewhere
+    UPDATE_C2_BZ( 12, 13 )              // row 1
+    UPDATE_C2_BZ( 16, 17 )              // row 2
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )                // beta == 0, rs_c == 1
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_2X16_BZ( 8, 12 )          // rows 0-1, first 16-column vector
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes); the macro may also move rcx - TODO confirm against its definition
+    TRANSPOSE_2X16_BZ( 9, 13 )          // rows 0-1, second 16-column vector
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx = c + 2 * rs_c (bytes): start of the third row of the tile
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16_BZ( 16 )              // row 2, one 16-element accumulator per call
+    UPDATE_C_1X16_BZ( 17 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (a, b, c, cs_b, n0 and m0 are listed but the asm text only reads abuf/bbuf/cbuf and the other strides)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list (the "memory" entry covers the stores to C)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_3x16_avx512 // single-precision gemmsup kernel for a 3-row by 16-column (1 zmm wide) tile of C
+     (
+       conj_t              conja,          // not referenced in this function
+       conj_t              conjb,          // not referenced in this function
+       dim_t               m0,             // only listed as an asm operand; the asm text never reads it
+       dim_t               n0,             // only listed as an asm operand; the asm text never reads it
+       dim_t               k0,             // inner dimension: split into k0/4 unrolled passes plus k0%4 single steps
+       float*     restrict alpha,          // pointer to the scalar broadcast into zmm7 for the alpha-scaling step
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0, // A and its row/column strides, in elements
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0, // B and its row/column strides, in elements
+       float*     restrict beta,           // pointer to the scalar broadcast into zmm4 before C is updated
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0, // C and its row/column strides, in elements
+       auxinfo_t* restrict data,           // not referenced in this function
+       cntx_t*    restrict cntx            // not referenced in this function
+     )
+{
+    uint64_t k_iter = k0 / 4;   // trip count of the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;   // trip count of the remainder k-loop
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;    // never loaded by the asm below; B is read as one contiguous vector (presumably cs_b == 1 - TODO confirm)
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Accumulate A*B over k0, scale by alpha, then write the tile to C (beta == 0 / beta != 0, row / column storage). */
+    // NOTE(review): INIT_REG, VFMA1, ALPHA_SCALE1, UPDATE_C* and TRANSPOSE_* are macros defined outside this function; notes on them describe intent as named.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // r8 = rs_a * sizeof(float): row stride of A in bytes
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // r9 = rs_b * sizeof(float): row stride of B in bytes
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // r10 = cs_a * sizeof(float): column stride of A in bytes
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rdi = rs_c * sizeof(float): row stride of C in bytes
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes); not used by the instructions spelled out in this kernel
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes); not used by the instructions spelled out in this kernel
+
+    INIT_REG                        // presumably zeroes the zmm registers used as accumulators - macro defined elsewhere
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // k_iter == 0: skip the unrolled loop
+
+
+    // Each pass of the k-loop consumes 4 rows of B and 4 columns of A (ITER 0..3), one per rank-1 update.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 16 consecutive floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 3 elements of the current column of A (rows 0..2, rs_a apart) & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+
+    add(  r9, rbx )                 // b += rs_b: next row of B
+    add( r10, rax )                 // a += cs_a: next column of A
+
+    // ITER 1
+    // Load 16 consecutive floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 16 consecutive floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 16 consecutive floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, no remainder steps; go to the post-accumulation stage.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )           // one rank-1 update per pass, k_left passes in total
+
+    // Load 16 consecutive floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 3 elements of the current column of A & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7), one call per row of the tile.
+    ALPHA_SCALE1( 7,  8 )
+    ALPHA_SCALE1( 7, 12 )
+    ALPHA_SCALE1( 7, 16 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0f
+    vucomiss( xmm1, xmm4 )          // check if beta = 0 (compares the low lane only)
+    je( .SBETAZERO )                // jump to beta = 0 case; NOTE(review): ZF is also set on an unordered compare, so a NaN beta would take this branch - confirm intended
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )            // beta != 0, rs_c != 1
+
+    UPDATE_C1( 4,  8 )              // presumably row 0 of C updated using beta (zmm4) - macro defined elsewhere
+    UPDATE_C1( 4, 12 )              // row 1
+    UPDATE_C1( 4, 16 )              // row 2
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )            // beta != 0, rs_c == 1
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float): column stride of C in bytes
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_2X16( 8, 12 )             // rows 0-1 of the tile
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx = c + 2 * rs_c (bytes): start of the third row of the tile
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16( 16 )                 // row 2 of the tile
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )                // beta == 0, rs_c != 1
+
+    UPDATE_C1_BZ(  8 )                  // presumably stores row 0 without reading C - macro defined elsewhere
+    UPDATE_C1_BZ( 12 )                  // row 1
+    UPDATE_C1_BZ( 16 )                  // row 2
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )                // beta == 0, rs_c == 1
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_2X16_BZ( 8, 12 )          // rows 0-1 of the tile
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx = c + 2 * rs_c (bytes): start of the third row of the tile
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16_BZ( 16 )              // row 2 of the tile
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (a, b, c, cs_b, n0 and m0 are listed but the asm text only reads abuf/bbuf/cbuf and the other strides)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list (the "memory" entry covers the stores to C)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.h b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.h
new file mode 100644
index 0000000000..ae5023c400
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64.h
@@ -0,0 +1,357 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define INIT_REG /* zeroes zmm0-zmm31 by XORing each register with itself */ \
+    vxorps( zmm0,zmm0,zmm0 ) \
+    vxorps( zmm1,zmm1,zmm1 ) \
+    vxorps( zmm2,zmm2,zmm2 ) \
+    vxorps( zmm3,zmm3,zmm3 ) \
+    vxorps( zmm4,zmm4,zmm4 ) \
+    vxorps( zmm5,zmm5,zmm5 ) \
+    vxorps( zmm6,zmm6,zmm6 ) \
+    vxorps( zmm7,zmm7,zmm7 ) \
+    vxorps( zmm8,zmm8,zmm8 ) /* zmm8-zmm31 are the accumulators used by the 6x64 kernel */ \
+    vxorps( zmm9,zmm9,zmm9 ) \
+    vxorps( zmm10,zmm10,zmm10 ) \
+    vxorps( zmm11,zmm11,zmm11 ) \
+    vxorps( zmm12,zmm12,zmm12 ) \
+    vxorps( zmm13,zmm13,zmm13 ) \
+    vxorps( zmm14,zmm14,zmm14 ) \
+    vxorps( zmm15,zmm15,zmm15 ) \
+    vxorps( zmm16,zmm16,zmm16 ) \
+    vxorps( zmm17,zmm17,zmm17 ) \
+    vxorps( zmm18,zmm18,zmm18 ) \
+    vxorps( zmm19,zmm19,zmm19 ) \
+    vxorps( zmm20,zmm20,zmm20 ) \
+    vxorps( zmm21,zmm21,zmm21 ) \
+    vxorps( zmm22,zmm22,zmm22 ) \
+    vxorps( zmm23,zmm23,zmm23 ) \
+    vxorps( zmm24,zmm24,zmm24 ) \
+    vxorps( zmm25,zmm25,zmm25 ) \
+    vxorps( zmm26,zmm26,zmm26 ) \
+    vxorps( zmm27,zmm27,zmm27 ) \
+    vxorps( zmm28,zmm28,zmm28 ) \
+    vxorps( zmm29,zmm29,zmm29 ) \
+    vxorps( zmm30,zmm30,zmm30 ) \
+    vxorps( zmm31,zmm31,zmm31 )
+
+/**
+ * VFMAn (n = 1..4) - n vfmadd231ps for the k-loop: zmm(Ri) += zmm(R0) * zmm(i-1), i = 1..n
+ * zmm0-3 - vectors of B loaded by the caller (6x64 kernel: 4 consecutive vectors of one B row)
+ * R0 - index of the ZMM register holding the broadcast element of A
+ * R1-4 - indices of the ZMM accumulators, updated in place
+ */
+#define VFMA4( R0, R1, R2, R3, R4) \
+    vfmadd231ps( zmm0,zmm(R0),zmm(R1) ) \
+    vfmadd231ps( zmm1,zmm(R0),zmm(R2) ) \
+    vfmadd231ps( zmm2,zmm(R0),zmm(R3) ) \
+    vfmadd231ps( zmm3,zmm(R0),zmm(R4) )
+
+#define VFMA3( R0, R1, R2, R3) /* same as VFMA4 but reads only zmm0-zmm2 */ \
+    vfmadd231ps( zmm0,zmm(R0),zmm(R1) ) \
+    vfmadd231ps( zmm1,zmm(R0),zmm(R2) ) \
+    vfmadd231ps( zmm2,zmm(R0),zmm(R3) )
+
+#define VFMA2( R0, R1, R2 ) /* same as VFMA4 but reads only zmm0-zmm1 */ \
+    vfmadd231ps( zmm0,zmm(R0),zmm(R1) ) \
+    vfmadd231ps( zmm1,zmm(R0),zmm(R2) )
+
+#define VFMA1( R0, R1 ) /* same as VFMA4 but reads only zmm0 */ \
+    vfmadd231ps( zmm0,zmm(R0),zmm(R1) )
+
+/**
+ * ALPHA_SCALEn (n = 1..4) - multiplies n ZMM registers by zmm(R0), in place
+ * R0 - index of the ZMM register having alpha
+ * R1-4 - indices of the ZMM registers to be scaled
+ */
+#define ALPHA_SCALE4( R0, R1, R2, R3, R4 ) \
+    vmulps( zmm(R0), zmm(R1), zmm(R1) ) \
+    vmulps( zmm(R0), zmm(R2), zmm(R2) ) \
+    vmulps( zmm(R0), zmm(R3), zmm(R3) ) \
+    vmulps( zmm(R0), zmm(R4), zmm(R4) )
+
+#define ALPHA_SCALE3( R0, R1, R2, R3 ) /* three-register form of ALPHA_SCALE4 */ \
+    vmulps( zmm(R0), zmm(R1), zmm(R1) ) \
+    vmulps( zmm(R0), zmm(R2), zmm(R2) ) \
+    vmulps( zmm(R0), zmm(R3), zmm(R3) )
+
+#define ALPHA_SCALE2( R0, R1, R2 ) /* two-register form of ALPHA_SCALE4 */ \
+    vmulps( zmm(R0), zmm(R1), zmm(R1) ) \
+    vmulps( zmm(R0), zmm(R2), zmm(R2) )
+
+#define ALPHA_SCALE1( R0, R1 ) /* one-register form of ALPHA_SCALE4 */ \
+    vmulps( zmm(R0), zmm(R1), zmm(R1) )
+
+/**
+ * UPDATE_Cn (n = 1..4) - for n consecutive 64-byte vectors at (rcx): C = beta * C + zmm(Ri); then rcx += rdi
+ * R0 -> register having beta
+ * R1-4 -> registers having intermediate results ( alpha * A * B ); zmm1 is scratch, so no argument may be 1
+ */
+#define UPDATE_C4( R0, R1, R2, R3, R4 ) \
+    vmovups( (rcx), zmm1 ) /* load C vector at offset 0x00 */ \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R1) ) /* zmm(R1) += zmm1 * zmm(R0) */ \
+    vmovups( zmm(R1),(rcx) ) /* store the updated vector back */ \
+    vmovups( 0x40(rcx),zmm1 ) \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R2) ) \
+    vmovups( zmm(R2),0x40(rcx) ) \
+    vmovups( 0x80(rcx),zmm1 ) \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R3) ) \
+    vmovups( zmm(R3),0x80(rcx) ) \
+    vmovups( 0xc0(rcx),zmm1 ) \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R4) ) \
+    vmovups( zmm(R4),0xc0(rcx) ) \
+    add( rdi, rcx )
+
+#define UPDATE_C3( R0, R1, R2, R3 ) /* as UPDATE_C4, for offsets 0x00, 0x40, 0x80 only */ \
+    vmovups( (rcx), zmm1 ) \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R1) ) \
+    vmovups( zmm(R1),(rcx) ) \
+    vmovups( 0x40(rcx),zmm1 ) \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R2) ) \
+    vmovups( zmm(R2),0x40(rcx) ) \
+    vmovups( 0x80(rcx),zmm1 ) \
+    vfmadd231ps( zmm(R0),zmm1,zmm(R3) ) \
+    vmovups( zmm(R3),0x80(rcx) ) \
+    add( rdi, rcx )
+
+#define UPDATE_C2( R0, R1, R2 ) /* as UPDATE_C4, for offsets 0x00 and 0x40 only */ \
+    vmovups( (rcx), zmm1 ) \
+    vfmadd231ps( zmm(R0), zmm1, zmm(R1) ) \
+    vmovups( zmm(R1), (rcx) ) \
+    vmovups( 0x40(rcx), zmm1 ) \
+    vfmadd231ps( zmm(R0), zmm1, zmm(R2) ) \
+    vmovups( zmm(R2), 0x40(rcx) ) \
+    add( rdi, rcx )
+
+#define UPDATE_C1( R0, R1 ) /* as UPDATE_C4, for offset 0x00 only */ \
+    vmovups( (rcx), zmm1 ) \
+    vfmadd231ps( zmm(R0), zmm1, zmm(R1) ) \
+    vmovups( zmm(R1), (rcx) ) \
+    add( rdi, rcx )
+
+/**
+ * UPDATE_Cn_BZ (n = 1..4) - stores n vectors at (rcx), 0x40(rcx), ... without reading C; then rcx += rdi
+ * R0-3 -> registers having intermediate results ( alpha * A * B )
+ */
+#define UPDATE_C4_BZ( R0, R1, R2, R3 ) \
+    vmovups( zmm(R0),(rcx) ) \
+    vmovups( zmm(R1),0x40(rcx) ) \
+    vmovups( zmm(R2),0x80(rcx) ) \
+    vmovups( zmm(R3),0xc0(rcx) ) \
+    add( rdi, rcx )
+
+#define UPDATE_C3_BZ( R0, R1, R2 ) /* three-vector form of UPDATE_C4_BZ */ \
+    vmovups( zmm(R0),(rcx) ) \
+    vmovups( zmm(R1),0x40(rcx) ) \
+    vmovups( zmm(R2),0x80(rcx) ) \
+    add( rdi, rcx )
+
+#define UPDATE_C2_BZ( R0, R1 ) /* two-vector form of UPDATE_C4_BZ */ \
+    vmovups( zmm(R0),(rcx) ) \
+    vmovups( zmm(R1),0x40(rcx) ) \
+    add( rdi, rcx )
+
+#define UPDATE_C1_BZ( R0 ) /* one-vector form of UPDATE_C4_BZ */ \
+    vmovups( zmm(R0),(rcx) ) \
+    add( rdi, rcx )
+
+#define TRANSPOSE_4X16( R0, R1, R2, R3 ) /* four passes; each pass updates C at rcx and then does rcx += rdi */ \
+    TRANSPOSE_4X16L( 0X44, R0, R1, R2, R3 ) \
+    TRANSPOSE_4X16L( 0XEE, R0, R1, R2, R3 ) \
+    TRANSPOSE_4X16H( 0X44, R0, R1, R2, R3 ) \
+    TRANSPOSE_4X16H( 0XEE, R0, R1, R2, R3 )
+
+#define TRANSPOSE_4X16L( IMM, R0, R1, R2, R3 ) /* scratch: zmm0, zmm5-zmm7; reads zmm4, rdi, r12 set by the caller */ \
+    vunpcklps( ZMM(R1), ZMM(R0), zmm6 ) \
+    vunpcklps( ZMM(R3), ZMM(R2), zmm7 ) \
+    VSHUFPS( imm(IMM), zmm7, zmm6, zmm5 ) /* zmm5 = elements of zmm6/zmm7 selected by IMM */ \
+    VINSERTF32X4( imm(0x0), mem(rcx), zmm0, zmm0 ) /* build zmm0 from four 16-byte loads of C ... */ \
+    VINSERTF32X4( imm(0x1), mem(rcx, rdi, 4), zmm0, zmm0 ) /* ... at rcx + 4*rdi */ \
+    VINSERTF32X4( imm(0x2), mem(rcx, rdi, 8), zmm0, zmm0 ) /* ... at rcx + 8*rdi */ \
+    VINSERTF32X4( imm(0x3), mem(rcx, r12, 4), zmm0, zmm0 ) /* ... at rcx + 4*r12 */ \
+    VFMADD231PS( zmm0, zmm4, zmm5 ) /* zmm5 += zmm4 * zmm0 */ \
+    VEXTRACTF32X4( imm(0x00), zmm5, mem(rcx) ) /* write the four 16-byte pieces back to the same addresses */ \
+    VEXTRACTF32X4( imm(0x01), zmm5, mem(rcx, rdi, 4) ) \
+    VEXTRACTF32X4( imm(0x02), zmm5, mem(rcx, rdi, 8) ) \
+    VEXTRACTF32X4( imm(0x03), zmm5, mem(rcx, r12, 4) ) \
+    add( rdi, rcx )
+
+#define TRANSPOSE_4X16H( IMM, R0, R1, R2, R3 ) /* as TRANSPOSE_4X16L, but starts from vunpckhps instead of vunpcklps */ \
+    vunpckhps( ZMM(R1), ZMM(R0), zmm6 ) \
+    vunpckhps( ZMM(R3), ZMM(R2), zmm7 ) \
+    VSHUFPS( imm(IMM), zmm7, zmm6, zmm5 ) \
+    VINSERTF32X4( imm(0x0), mem(rcx), zmm0, zmm0 ) \
+    VINSERTF32X4( imm(0x1), mem(rcx, rdi, 4), zmm0, zmm0 ) \
+    VINSERTF32X4( imm(0x2), mem(rcx, rdi, 8), zmm0, zmm0 ) \
+    VINSERTF32X4( imm(0x3), mem(rcx, r12, 4), zmm0, zmm0 ) \
+    VFMADD231PS( zmm0, zmm4, zmm5 ) \
+    VEXTRACTF32X4( imm(0x00), zmm5, mem(rcx) ) \
+    VEXTRACTF32X4( imm(0x01), zmm5, mem(rcx, rdi, 4) ) \
+    VEXTRACTF32X4( imm(0x02), zmm5, mem(rcx, rdi, 8) ) \
+    VEXTRACTF32X4( imm(0x03), zmm5, mem(rcx, r12, 4) ) \
+    add( rdi, rcx )
+
+#define TRANSPOSE_4X16_BZ( R0, R1, R2, R3 ) /* store-only counterpart of TRANSPOSE_4X16: C is written but never read */ \
+    TRANSPOSE_4X16L_BZ( 0X44, R0, R1, R2, R3 ) \
+    TRANSPOSE_4X16L_BZ( 0XEE, R0, R1, R2, R3 ) \
+    TRANSPOSE_4X16H_BZ( 0X44, R0, R1, R2, R3 ) \
+    TRANSPOSE_4X16H_BZ( 0XEE, R0, R1, R2, R3 )
+
+#define TRANSPOSE_4X16L_BZ( IMM, R0, R1, R2, R3 ) /* scratch: zmm5-zmm7; reads rdi and r12 set by the caller */ \
+    vunpcklps( ZMM(R1), ZMM(R0), zmm6 ) \
+    vunpcklps( ZMM(R3), ZMM(R2), zmm7 ) \
+    VSHUFPS( imm(IMM), zmm7, zmm6, zmm5 ) /* zmm5 = elements of zmm6/zmm7 selected by IMM */ \
+    VEXTRACTF32X4( imm(0x00), zmm5, mem(rcx) ) /* four 16-byte stores: rcx, rcx + 4*rdi, rcx + 8*rdi, rcx + 4*r12 */ \
+    VEXTRACTF32X4( imm(0x01), zmm5, mem(rcx, rdi, 4) ) \
+    VEXTRACTF32X4( imm(0x02), zmm5, mem(rcx, rdi, 8) ) \
+    VEXTRACTF32X4( imm(0x03), zmm5, mem(rcx, r12, 4) ) \
+    add( rdi, rcx )
+
+#define TRANSPOSE_4X16H_BZ( IMM, R0, R1, R2, R3 ) /* as TRANSPOSE_4X16L_BZ, but starts from vunpckhps */ \
+    vunpckhps( ZMM(R1), ZMM(R0), zmm6 ) \
+    vunpckhps( ZMM(R3), ZMM(R2), zmm7 ) \
+    VSHUFPS( imm(IMM), zmm7, zmm6, zmm5 ) \
+    VEXTRACTF32X4( imm(0x00), zmm5, mem(rcx) ) \
+    VEXTRACTF32X4( imm(0x01), zmm5, mem(rcx, rdi, 4) ) \
+    VEXTRACTF32X4( imm(0x02), zmm5, mem(rcx, rdi, 8) ) \
+    VEXTRACTF32X4( imm(0x03), zmm5, mem(rcx, r12, 4) ) \
+    add( rdi, rcx )
+
+#define TRANSPOSE_2X16( R0, R1 ) /* overwrites r12: it holds the rcx value at the start of each half */ \
+    MOV( rcx, r12 ) \
+    TRANSPOSE_2X16L( R0, R1 ) \
+    lea( mem(r12, rdi, 2), rcx ) /* rcx = saved start + 2*rdi */ \
+    MOV( rcx, r12 ) \
+    TRANSPOSE_2X16H( R0, R1 )
+
+#define TRANSPOSE_2X16L( R0, R1 ) /* TRANSPOSE_2X16 sets r12 = rcx before this; reads zmm4; scratch zmm0-3, zmm5 */ \
+    VUNPCKLPS( zmm(R1), zmm(R0), zmm5 ) \
+    FETCH_C_2X16 /* loads C into zmm0 and leaves rcx 12*rdi ahead */ \
+    MOV( r12, rcx ) /* rewind rcx to the saved start before storing */ \
+    VFMADD231PS( zmm0, zmm4, zmm5 ) /* zmm5 += zmm4 * zmm0 */ \
+    UPDATE_C_2X16( 5 )
+
+#define TRANSPOSE_2X16H( R0, R1 ) /* as TRANSPOSE_2X16L, but starts from VUNPCKHPS */ \
+    VUNPCKHPS( zmm(R1), zmm(R0), zmm5 ) \
+    FETCH_C_2X16 \
+    MOV( r12, rcx ) \
+    VFMADD231PS( zmm0, zmm4, zmm5 ) \
+    UPDATE_C_2X16( 5 )
+
+#define FETCH_C_2X16 /* gathers eight 8-byte pieces of C into zmm0; xmm1 is scratch; rcx ends 12*rdi ahead */ \
+    VMOVLPD( mem(rcx), xmm0, xmm0 ) /* pieces 0 and 1: rcx and rcx + rdi */ \
+    VMOVHPD( mem(rcx, rdi, 1), xmm0, xmm0 ) \
+    LEA( mem(rcx, rdi, 4), rcx ) /* rcx += 4*rdi */ \
+    VMOVLPD( mem(rcx), xmm1, xmm1 ) \
+    VMOVHPD( mem(rcx, rdi, 1), xmm1, xmm1 ) \
+    VINSERTF32X4( imm(0x1), xmm1, zmm0, zmm0 ) /* place the pair into 128-bit lane 1 of zmm0 */ \
+    LEA( mem(rcx, rdi, 4), rcx ) \
+    VMOVLPD( mem(rcx), xmm1, xmm1 ) \
+    VMOVHPD( mem(rcx, rdi, 1), xmm1, xmm1 ) \
+    VINSERTF32X4( imm(0x2), xmm1, zmm0, zmm0 ) \
+    LEA( mem(rcx, rdi, 4), rcx ) \
+    VMOVLPD( mem(rcx), xmm1, xmm1 ) \
+    VMOVHPD( mem(rcx, rdi, 1), xmm1, xmm1 ) \
+    VINSERTF32X4( imm(0x3), xmm1, zmm0, zmm0 )
+
+#define TRANSPOSE_2X16_BZ( R0, R1 ) /* store-only counterpart of TRANSPOSE_2X16 (C is not read); overwrites r12 */ \
+    mov( rcx, r12 ) /* remember the starting C address */ \
+    TRANSPOSE_2X16L_BZ( R0, R1 ) \
+    lea( mem(r12, rdi, 2), rcx ) /* rcx = saved start + 2*rdi */ \
+    TRANSPOSE_2X16H_BZ( R0, R1 )
+
+#define TRANSPOSE_2X16L_BZ( R0, R1 ) /* interleaves the low halves of R0/R1 into zmm5 and stores it */ \
+    vunpcklps( zmm(R1), zmm(R0), zmm5 ) \
+    UPDATE_C_2X16( 5 ) /* final line of the macro: deliberately no line continuation */
+
+#define TRANSPOSE_2X16H_BZ( R0, R1 ) /* interleaves the high halves of R0/R1 into zmm5 and stores it */ \
+    vunpckhps( zmm(R1), zmm(R0), zmm5 ) \
+    UPDATE_C_2X16( 5 )
+
+#define UPDATE_C_2X16( R0 ) /* stores zmm(R0) as eight 8-byte pieces; clobbers xmm0-xmm3; rcx ends 12*rdi ahead */ \
+    VEXTRACTF32X4( imm(0x0), zmm(R0), xmm0 ) /* 128-bit lane 0 */ \
+    vmovlpd( xmm0, mem(rcx) ) /* low 8 bytes to rcx */ \
+    vmovhpd( xmm0, mem(rcx, rdi, 1) ) /* high 8 bytes to rcx + rdi */ \
+    lea( mem(rcx, rdi, 4), rcx ) /* rcx += 4*rdi */ \
+    VEXTRACTF32X4( imm(0x1), zmm(R0), xmm1 ) \
+    vmovlpd( xmm1, mem(rcx) ) \
+    vmovhpd( xmm1, mem(rcx, rdi, 1) ) \
+    VEXTRACTF32X4( imm(0x2), zmm(R0), xmm2 ) \
+    lea( mem(rcx, rdi, 4), rcx ) \
+    vmovlpd( xmm2, mem(rcx) ) \
+    vmovhpd( xmm2, mem(rcx, rdi, 1) ) \
+    VEXTRACTF32X4( imm(0x3), zmm(R0), xmm3 ) \
+    lea( mem(rcx, rdi, 4), rcx ) \
+    vmovlpd( xmm3, mem(rcx) ) \
+    vmovhpd( xmm3, mem(rcx, rdi, 1) )
+
+#define UPDATE_C_1X16_BZ(R0) /* stores the 16 floats of zmm(R0) one at a time, lane by lane, without reading C */ \
+    UPDATE_C_1X16_BZ_UTIL( 0x00, R0 ) \
+    UPDATE_C_1X16_BZ_UTIL( 0x01, R0 ) \
+    UPDATE_C_1X16_BZ_UTIL( 0x02, R0 ) \
+    UPDATE_C_1X16_BZ_UTIL( 0x03, R0 )
+
+#define UPDATE_C_1X16_BZ_UTIL( IMM, R0 ) /* one 128-bit lane; clobbers xmm0-xmm3; reads rdi, r12; rcx += 4*rdi */ \
+    VEXTRACTF32X4( imm(IMM), zmm(R0), xmm0 ) \
+    vshufps( imm(0x01), xmm0, xmm0, xmm1 ) /* element 1 of the lane moved to position 0 */ \
+    vshufps( imm(0x02), xmm0, xmm0, xmm2 ) /* element 2 moved to position 0 */ \
+    vshufps( imm(0x03), xmm0, xmm0, xmm3 ) /* element 3 moved to position 0 */ \
+    vmovss( xmm0, (rcx) ) \
+    vmovss( xmm1, (rcx, rdi, 1) ) \
+    vmovss( xmm2, (rcx, rdi, 2) ) \
+    vmovss( xmm3, (rcx, r12, 1) ) /* the visible caller sets r12 = 3*rdi before using this macro */ \
+    lea( (rcx, rdi, 4), rcx )
+
+#define UPDATE_C_1X16( R0 ) /* read-modify-write of 16 scalar C elements from zmm(R0), one 128-bit lane per step */ \
+    UPDATE_C_1X16_UTIL( 0x00, R0 ) \
+    UPDATE_C_1X16_UTIL( 0x01, R0 ) \
+    UPDATE_C_1X16_UTIL( 0x02, R0 ) \
+    UPDATE_C_1X16_UTIL( 0x03, R0 )
+
+#define UPDATE_C_1X16_UTIL( IMM, R0 ) /* clobbers xmm0-3, xmm5-7, xmm12; reads xmm4, rdi, r12; rcx += 4*rdi */ \
+    VEXTRACTF32X4( imm(IMM), zmm(R0), xmm0 ) \
+    vshufps( imm(0x01), xmm0, xmm0, xmm6 ) /* element 1 of the lane moved to position 0 */ \
+    vshufps( imm(0x02), xmm0, xmm0, xmm7 ) /* element 2 moved to position 0 */ \
+    vshufps( imm(0x03), xmm0, xmm0, xmm12 ) /* element 3 moved to position 0 */ \
+    vmovss( (rcx), xmm1 ) /* load the four C scalars at rcx, rcx + rdi, rcx + 2*rdi, rcx + r12 */ \
+    vmovss( (rcx, rdi, 1), xmm2 ) \
+    vmovss( (rcx, rdi, 2), xmm3 ) \
+    vmovss( (rcx, r12, 1), xmm5 ) \
+    vfmadd231ps( xmm1, xmm4, xmm0 ) /* result += xmm4 * C; only position 0 is stored below */ \
+    vfmadd231ps( xmm2, xmm4, xmm6 ) \
+    vfmadd231ps( xmm3, xmm4, xmm7 ) \
+    vfmadd231ps( xmm5, xmm4, xmm12 ) \
+    vmovss( xmm0, (rcx) ) \
+    vmovss( xmm6, (rcx, rdi, 1) ) \
+    vmovss( xmm7, (rcx, rdi, 2) ) \
+    vmovss( xmm12, (rcx, r12, 1) ) \
+    lea( (rcx, rdi, 4), rcx )
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c
new file mode 100644
index 0000000000..23f43052e8
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64m.c
@@ -0,0 +1,5248 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#include "bli_gemmsup_rv_zen_s6x64.h"
+
+#define NR 64
+
+/*
+   rrr:
+     --------        ------        --------
+     --------        ------        --------
+     --------   +=   ------ ...    --------
+     --------        ------        --------
+     --------        ------            :
+     --------        ------            :
+   Assumptions:
+   - B is row-stored;
+   - A is row-stored;
+   - m0 and n0 are at most MR (6) and NR (64), respectively.
+   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
+   (v)ector loads on B and single-element broadcasts from A.
+
+   NOTE: When C is column-stored (rs_c == 1) the 6x64 tile is transposed
+   in registers before it is written; see .SCOLSTORED and .SCOLSTORBZ.
+*/
+void bli_sgemmsup_rv_zen_asm_6x64m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t n_left = n0 % NR;      // n0 is expected to be n0<=NR
+
+    // First check whether this is a edge case in the n dimension.
+    // If so, dispatch other 6x?m kernels, as needed.
+    if ( n_left )
+    {
+        float* cij = c;
+        float* bj  = b;
+        float* ai  = a;
+
+        if ( 48 <= n_left )
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_6x48m_avx512
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0, beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += nr_cur * cs_c0;
+            bj  += nr_cur * cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_6x32m_avx512
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0, beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += nr_cur * cs_c0;
+            bj  += nr_cur * cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_6x16m_avx512
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0, beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += nr_cur * cs_c0;
+            bj  += nr_cur * cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_6x8m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0, beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += nr_cur * cs_c0;
+            bj  += nr_cur * cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_6x4m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0, beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += nr_cur * cs_c0;
+            bj  += nr_cur * cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_6x2m
+            (
+              conja, conjb, m0, nr_cur, k0,
+              alpha, ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0, beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += nr_cur * cs_c0;
+            bj  += nr_cur * cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 1 <= n_left )
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 6 * rs_a0 )
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 6;
+        
+                // Since A is packed into row panels,
+                // we must use a loop over gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;
+                dim_t m_left =   m0            % mr;
+        
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+        
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+                
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur * rs_c0;
+                    ai_ii += ps_a0;
+                } 
+            }
+            n_left -= nr_cur;
+        }
+
+        if ( n0 / NR == 0 )
+        {
+            return;
+        }
+    }
+
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    mov( var( m_iter ), r11 )       // load m_iter
+
+
+    label( .M_LOOP_ITER )
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    // C Prefetch
+    cmp( imm( 4 ), rdi )
+    jz( .SPOSTPFETCH )  // haven't added col-prefetch cases
+
+
+    label( .SROWPFETCH )
+    lea( mem( rcx, rdi, 2 ), rdx )
+    lea( mem( rdx, rdi, 1 ), rdx )
+
+    prefetch( 0, mem( rcx,         7*8 ) )
+    prefetch( 0, mem( rcx, rdi, 1, 7*8 ) )
+    prefetch( 0, mem( rcx, rdi, 2, 7*8 ) )
+    prefetch( 0, mem( rdx,         7*8 ) )
+    prefetch( 0, mem( rdx, rdi, 1, 7*8 ) )
+    prefetch( 0, mem( rdx, rdi, 2, 7*8 ) )
+
+
+    label( .SPOSTPFETCH )
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11)
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 1
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11)
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11)
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11)
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )              // i = k_left;
+	  test( rsi, rsi )                     // check i via logical AND.
+	  je( .SPOSTACCUM )                    // if i == 0, we're done; jump to end.
+	                                   // else, we prepare to enter k_left loop.
+
+    label( .K_LEFT_LOOP )
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11)
+    vbroadcastss( mem( rax,  r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax,  r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax,  r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )                           // i -= 1;
+	  jne( .K_LEFT_LOOP )                   // iterate again if i != 0.
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+    ALPHA_SCALE4( 7, 16, 17, 18, 19 )
+    ALPHA_SCALE4( 7, 20, 21, 22, 23 )
+    ALPHA_SCALE4( 7, 24, 25, 26, 27 )
+    ALPHA_SCALE4( 7, 28, 29, 30, 31 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+    UPDATE_C4( 4, 16, 17, 18, 19 )
+    UPDATE_C4( 4, 20, 21, 22, 23 )
+    UPDATE_C4( 4, 24, 25, 26, 27 )
+    UPDATE_C4( 4, 28, 29, 30, 31 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     * 
+     * |-----------------------------------|       |------------------|--------| 
+     * |        |        |        |        |       |                  |        | 
+     * |        |        |        |        |       |       16x4       |  16x2  | 
+     * |  4x16  |  4x16  |  4x16  |  4x16  |       |                  |        | 
+     * |        |        |        |        |       |------------------|--------| 
+     * |        |        |        |        |       |                  |        | 
+     * |-----------------------------------|  ->   |       16x4       |  16x2  | 
+     * |        |        |        |        |       |                  |        | 
+     * |  2x16  |  2x16  |  2x16  |  2x16  |       |------------------|--------| 
+     * |        |        |        |        |       |                  |        | 
+     * |-----------------------------------|       |       16x4       |  16x2  | 
+     *                                             |                  |        |
+     *                                             |------------------|--------|
+     *                                             |                  |        |
+     *                                             |       16x4       |  16x2  |
+     *                                             |                  |        |
+     *                                             |------------------|--------|
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // rdi += rdi * 2 => rdi = 3 * rs_c
+
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 11, 15, 19, 23 )
+    add( rdi, rcx )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    lea( mem( rcx, r10, 4 ), rcx )
+    TRANSPOSE_2X16( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 25, 29 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 26, 30 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 27, 31 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ( 8, 9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+    UPDATE_C4_BZ( 16, 17, 18, 19 )
+    UPDATE_C4_BZ( 20, 21, 22, 23 )
+    UPDATE_C4_BZ( 24, 25, 26, 27 )
+    UPDATE_C4_BZ( 28, 29, 30, 31 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) 
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    lea( mem( rcx, r10, 4 ), rcx )
+    TRANSPOSE_2X16_BZ( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 25, 29 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 26, 30 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 27, 31 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    lea( mem( , r8, 2 ), rdx )          // rdx = rs_a * 2
+    lea( mem( rdx, r8, 4 ), rdx )       // rdx = rs_a * 6
+    mov( var( abuf ), rax )             // load address of a
+    add( rdx, rax )                     // a += rs_a * 6(MR)
+    mov( rax, var( abuf ) )             // store updated a
+
+    mov( var( rs_c ), rdi )         
+    lea( mem(    , rdi, 4 ), rdi )      // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem(    , rdi, 2 ), rdx )      // rdx = rs_c * 2
+    lea( mem( rdx, rdi, 4 ), rdx )      // rdx = rdi * 4 => rdx = rs_c * 6
+    mov( var( cbuf ), rcx )             // load address of c
+    add( rdx, rcx )                     // c += rs_c * 6(MR)
+    mov( rcx, var( cbuf ) )             // store updated c
+
+    dec( r11 )
+    jne( .M_LOOP_ITER )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [m_iter] "m" (m_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( m_left )
+    {
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge * rs_c;
+        float* restrict ai  = a + i_edge * rs_a;
+        float* restrict bj  = b;
+
+        if ( 4 <= m_left )
+        {
+            const dim_t mr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_4x64m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+    
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_2x64m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 1 <= m_left )
+        {
+            const dim_t mr_cur = 1;
+            bli_sgemmsup_rv_zen_asm_1x64m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+    }
+}
+
+/**
+ * Single-precision gemmsup kernel (AVX-512) that works on C in tiles of
+ * 6 rows x 48 columns and loops over the m dimension.
+ *
+ * For every full group of 6 rows the inline assembly accumulates A * B over
+ * k0 (main loop unrolled 4x, followed by a k0 % 4 remainder loop), scales the
+ * accumulators by alpha and hands them to the UPDATE_C3* / TRANSPOSE_* macros
+ * to be merged into C (beta is broadcast into zmm4 for those macros). The
+ * remaining m0 % 6 rows are forwarded to the 4x48m, 2x48m and 1x48m kernels.
+ *
+ * What the assembly reads:
+ *  - Each row of B is loaded as 48 contiguous floats (three zmm loads), so
+ *    cs_b is never consulted; n0 is only forwarded to the edge-case kernels.
+ *  - C takes the column-stored path when rs_c == 1 (i.e. 4 * rs_c == 4 bytes)
+ *    and the row-stored path otherwise.
+ *  - conja, conjb, data and cntx are only passed through to the edge-case
+ *    kernels.
+ *
+ * Accumulator layout: row i of the tile lives in zmm(8 + 4i) .. zmm(10 + 4i).
+ *
+ * NOTE(review): INIT_REG, VFMA3, ALPHA_SCALE3, UPDATE_C3, UPDATE_C3_BZ,
+ * TRANSPOSE_4X16* and TRANSPOSE_2X16* are defined outside this function;
+ * the comments below describe how they are used here, not their internals.
+ */
+void bli_sgemmsup_rv_zen_asm_6x48m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // The k loop is unrolled by 4; k_left holds the remainder iterations.
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    // The m loop consumes 6 rows per pass; m_left rows go to edge kernels.
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Running pointers; the assembly advances abuf and cbuf by 6 rows after
+    // every tile and stores them back.
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    // Fewer than 6 rows: only the edge-case kernels have work to do.
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* Produce MR x NR (6 x 48) outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    // Convert the strides used below from elements to bytes.
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // r8  = rs_a * sizeof(float)
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // r9  = rs_b * sizeof(float)
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // r10 = cs_a * sizeof(float)
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rdi = rs_c * sizeof(float)
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes)
+
+    mov( var( m_iter ), r11 )       // r11 = number of 6-row tiles
+
+
+    label( .M_LOOP_ITER )
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    // C prefetch: rdx = c + 3 * rs_c, so rcx / rdx cover rows 0-2 / 3-5.
+    lea( mem( rcx, rdi, 2 ), rdx )
+    lea( mem( rdx, rdi, 1 ), rdx )
+
+    cmp( imm( 4 ), rdi )            // column-stored C (rs_c == 1)?
+    jz( .SPOSTPFETCH )              // no column-stored prefetch implemented
+
+
+    label( .SROWPFETCH )
+    prefetch( 0, mem( rcx,         7*8 ) )  // row 0
+    prefetch( 0, mem( rcx, rdi, 1, 7*8 ) )  // row 1
+    prefetch( 0, mem( rcx, rdi, 2, 7*8 ) )  // row 2
+    prefetch( 0, mem( rdx,         7*8 ) )  // row 3
+    prefetch( 0, mem( rdx, rdi, 1, 7*8 ) )  // row 4
+    prefetch( 0, mem( rdx, rdi, 2, 7*8 ) )  // row 5
+
+
+    label( .SPOSTPFETCH )
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // k0 < 4: go straight to the remainder
+
+    // Each pass of the unrolled k-loop consumes 4 rows of B and, for each of
+    // them, one element from each of the 6 rows of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 48 floats (3 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA3( 5, 24, 25, 26 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA3( 6, 28, 29, 30 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 48 floats (3 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA3( 5, 24, 25, 26 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA3( 6, 28, 29, 30 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 48 floats (3 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA3( 5, 24, 25, 26 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA3( 6, 28, 29, 30 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 48 floats (3 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA3( 5, 24, 25, 26 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA3( 6, 28, 29, 30 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )             // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+    label( .K_LEFT_LOOP )
+    // Load 48 floats (3 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA3( 4, 20, 21, 22 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA3( 5, 24, 25, 26 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA3( 6, 28, 29, 30 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LEFT_LOOP )             // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE3( 7, 8, 9, 10 )
+    ALPHA_SCALE3( 7, 12, 13, 14 )
+    ALPHA_SCALE3( 7, 16, 17, 18 )
+    ALPHA_SCALE3( 7, 20, 21, 22 )
+    ALPHA_SCALE3( 7, 24, 25, 26 )
+    ALPHA_SCALE3( 7, 28, 29, 30 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm( 4 ), rdi )            // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    // One call per row of the tile; zmm4 carries beta.
+    UPDATE_C3( 4, 8, 9, 10 )
+    UPDATE_C3( 4, 12, 13, 14 )
+    UPDATE_C3( 4, 16, 17, 18 )
+    UPDATE_C3( 4, 20, 21, 22 )
+    UPDATE_C3( 4, 24, 25, 26 )
+    UPDATE_C3( 4, 28, 29, 30 )
+
+    jmp( .SDONE )                   // jump to the end
+
+    label( .SCOLSTORED )
+
+    /**
+     * 6x48 tile is split into 3 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x48 tile and are stored as 48x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles (rows 0-3 of the tile) */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c (rdi no longer holds rs_c)
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c (bytes)
+    TRANSPOSE_4X16( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c (bytes)
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles (rows 4-5 of the tile) */
+    mov( var( cbuf ), rcx )             // load address of c
+    // Step down 4 rows. This path is only taken when rs_c == 1, so the
+    // offset is 4 * sizeof(float) bytes and must not depend on a stride of A.
+    lea( mem( rcx, 4*4 ), rcx )
+    TRANSPOSE_2X16( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16( 25, 29 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16( 26, 30 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    // One call per row of the tile; no beta operand is passed.
+    UPDATE_C3_BZ( 8, 9, 10 )
+    UPDATE_C3_BZ( 12, 13, 14 )
+    UPDATE_C3_BZ( 16, 17, 18 )
+    UPDATE_C3_BZ( 20, 21, 22 )
+    UPDATE_C3_BZ( 24, 25, 26 )
+    UPDATE_C3_BZ( 28, 29, 30 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * 6x48 tile is split into 3 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x48 tile and are stored as 48x6 tile.
+     */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c (rdi no longer holds rs_c)
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    /* Transposing 4x16 tiles to 16x4 tiles (rows 0-3 of the tile) */
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c (bytes)
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c (bytes)
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles (rows 4-5 of the tile) */
+    mov( var( cbuf ), rcx )             // load address of c
+    // Step down 4 rows. This path is only taken when rs_c == 1, so the
+    // offset is 4 * sizeof(float) bytes and must not depend on a stride of A.
+    lea( mem( rcx, 4*4 ), rcx )
+    TRANSPOSE_2X16_BZ( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16_BZ( 25, 29 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16_BZ( 26, 30 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    // Advance A by one 6-row tile and persist the pointer.
+    lea( mem( , r8, 2 ), rdx )          // rdx = rs_a * 2
+    lea( mem( rdx, r8, 4 ), rdx )       // rdx = rs_a * 2 + rs_a * 4 = rs_a * 6
+    mov( var( abuf ), rax )             // load address of a
+    add( rdx, rax )                     // a += rs_a * 6(MR)
+    mov( rax, var( abuf ) )             // store updated a
+
+    // Restore rdi = rs_c in bytes (the column-stored paths replaced it with
+    // cs_c), then advance C by one 6-row tile and persist the pointer.
+    mov( var( rs_c ), rdi )
+    lea( mem(    , rdi, 4 ), rdi )      // rdi = rs_c * sizeof(float)
+    lea( mem(    , rdi, 2 ), rdx )      // rdx = rs_c * 2
+    lea( mem( rdx, rdi, 4 ), rdx )      // rdx = rs_c * 2 + rs_c * 4 = rs_c * 6
+    mov( var( cbuf ), rcx )             // load address of c
+    add( rdx, rcx )                     // c += rs_c * 6(MR)
+    mov( rcx, var( cbuf ) )             // store updated c
+
+    dec( r11 )
+    jne( .M_LOOP_ITER )                 // next 6-row tile, if any
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [m_iter] "m" (m_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist. m_left is in
+    // [1, 5] here, so it decomposes into at most one each of 4, 2 and 1 rows.
+    if (m_left)
+    {
+        // First row not covered by the 6-row tiles above.
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict ai  = a + i_edge*rs_a;
+        float* restrict bj  = b;
+
+        if ( 4 <= m_left )
+        {
+            const dim_t mr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_4x48m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_2x48m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 1 <= m_left )
+        {
+            const dim_t mr_cur = 1;
+            bli_sgemmsup_rv_zen_asm_1x48m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+    }
+}
+
+/**
+ * Single-precision gemmsup kernel (AVX-512) that works on C in tiles of
+ * 6 rows x 32 columns and loops over the m dimension.
+ *
+ * For every full group of 6 rows the inline assembly accumulates A * B over
+ * k0 (main loop unrolled 4x, followed by a k0 % 4 remainder loop), scales the
+ * accumulators by alpha and hands them to the UPDATE_C2* / TRANSPOSE_* macros
+ * to be merged into C (beta is broadcast into zmm4 for those macros). The
+ * remaining m0 % 6 rows are forwarded to the 4x32m, 2x32m and 1x32m kernels.
+ *
+ * What the assembly reads:
+ *  - Each row of B is loaded as 32 contiguous floats (two zmm loads), so
+ *    cs_b is never consulted; n0 is only forwarded to the edge-case kernels.
+ *  - C takes the column-stored path when rs_c == 1 (i.e. 4 * rs_c == 4 bytes)
+ *    and the row-stored path otherwise.
+ *  - conja, conjb, data and cntx are only passed through to the edge-case
+ *    kernels.
+ *
+ * Accumulator layout: row i of the tile lives in zmm(8 + 4i) and zmm(9 + 4i).
+ *
+ * NOTE(review): INIT_REG, VFMA2, ALPHA_SCALE2, UPDATE_C2, UPDATE_C2_BZ,
+ * TRANSPOSE_4X16* and TRANSPOSE_2X16* are defined outside this function;
+ * the comments below describe how they are used here, not their internals.
+ */
+void bli_sgemmsup_rv_zen_asm_6x32m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // The k loop is unrolled by 4; k_left holds the remainder iterations.
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    // The m loop consumes 6 rows per pass; m_left rows go to edge kernels.
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Running pointers; the assembly advances abuf and cbuf by 6 rows after
+    // every tile and stores them back.
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    // Fewer than 6 rows: only the edge-case kernels have work to do.
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /* Produce MR x NR (6 x 32) outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    // Convert the strides used below from elements to bytes.
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // r8  = rs_a * sizeof(float)
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // r9  = rs_b * sizeof(float)
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // r10 = cs_a * sizeof(float)
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rdi = rs_c * sizeof(float)
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes)
+
+    mov( var( m_iter ), r11 )       // r11 = number of 6-row tiles
+
+
+    label( .M_LOOP_ITER )
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    // C prefetch: rdx = c + 3 * rs_c, so rcx / rdx cover rows 0-2 / 3-5.
+    lea( mem( rcx, rdi, 2 ), rdx )
+    lea( mem( rdx, rdi, 1 ), rdx )
+
+    cmp( imm( 4 ), rdi )            // column-stored C (rs_c == 1)?
+    jz( .SPOSTPFETCH )              // no column-stored prefetch implemented
+
+
+    label( .SROWPFETCH )
+    prefetch( 0, mem( rcx,         7*8 ) )  // row 0
+    prefetch( 0, mem( rcx, rdi, 1, 7*8 ) )  // row 1
+    prefetch( 0, mem( rcx, rdi, 2, 7*8 ) )  // row 2
+    prefetch( 0, mem( rdx,         7*8 ) )  // row 3
+    prefetch( 0, mem( rdx, rdi, 1, 7*8 ) )  // row 4
+    prefetch( 0, mem( rdx, rdi, 2, 7*8 ) )  // row 5
+
+
+    label( .SPOSTPFETCH )
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // k0 < 4: go straight to the remainder
+
+    // Each pass of the unrolled k-loop consumes 4 rows of B and, for each of
+    // them, one element from each of the 6 rows of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 32 floats (2 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA2( 5, 24, 25 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA2( 6, 28, 29 )
+
+    add( r9, rbx )                  // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 32 floats (2 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA2( 5, 24, 25 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA2( 6, 28, 29 )
+
+    add( r9, rbx )                  // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 32 floats (2 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA2( 5, 24, 25 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA2( 6, 28, 29 )
+
+    add( r9, rbx )                  // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 32 floats (2 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA2( 5, 24, 25 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA2( 6, 28, 29 )
+
+    add( r9, rbx )                  // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LOOP_ITER )             // if rsi != 0, repeat k-loop
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+    label( .K_LEFT_LOOP )
+    // Load 32 floats (2 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element per row of A and accumulate against the B vectors.
+    vbroadcastss( ( rax ), zmm4 )               // A row 0
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // A row 1
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // A row 2
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // A row 3
+    VFMA2( 4, 20, 21 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // A row 4
+    VFMA2( 5, 24, 25 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )    // A row 5
+    VFMA2( 6, 28, 29 )
+
+    add( r9, rbx )                  // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )             // if rsi != 0, repeat k_left loop
+
+    label( .SPOSTACCUM )
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE2( 7, 8, 9 )
+    ALPHA_SCALE2( 7, 12, 13 )
+    ALPHA_SCALE2( 7, 16, 17 )
+    ALPHA_SCALE2( 7, 20, 21 )
+    ALPHA_SCALE2( 7, 24, 25 )
+    ALPHA_SCALE2( 7, 28, 29 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    // One call per row of the tile; zmm4 carries beta.
+    UPDATE_C2( 4, 8, 9 )
+    UPDATE_C2( 4, 12, 13 )
+    UPDATE_C2( 4, 16, 17 )
+    UPDATE_C2( 4, 20, 21 )
+    UPDATE_C2( 4, 24, 25 )
+    UPDATE_C2( 4, 28, 29 )
+    jmp( .SDONE )                   // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * 6x32 tile is split into 2 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x32 tile and are stored as 32x6 tile.
+     */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c (rdi no longer holds rs_c)
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    /* Transposing 4x16 tiles to 16x4 tiles (rows 0-3 of the tile) */
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c (bytes)
+    TRANSPOSE_4X16( 9, 13, 17, 21 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles (rows 4-5 of the tile) */
+    mov( var( cbuf ), rcx )             // load address of c
+    // Step down 4 rows. This path is only taken when rs_c == 1, so the
+    // offset is 4 * sizeof(float) bytes and must not depend on a stride of A.
+    lea( mem( rcx, 4*4 ), rcx )
+    TRANSPOSE_2X16( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16( 25, 29 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    // One call per row of the tile; no beta operand is passed.
+    UPDATE_C2_BZ( 8, 9 )
+    UPDATE_C2_BZ( 12, 13 )
+    UPDATE_C2_BZ( 16, 17 )
+    UPDATE_C2_BZ( 20, 21 )
+    UPDATE_C2_BZ( 24, 25 )
+    UPDATE_C2_BZ( 28, 29 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+    /**
+     * 6x32 tile is split into 2 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x32 tile and are stored as 32x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles (rows 0-3 of the tile) */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c (rdi no longer holds rs_c)
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c (bytes)
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles (rows 4-5 of the tile) */
+    mov( var( cbuf ), rcx )             // load address of c
+    // Step down 4 rows. This path is only taken when rs_c == 1, so the
+    // offset is 4 * sizeof(float) bytes and must not depend on a stride of A.
+    lea( mem( rcx, 4*4 ), rcx )
+    TRANSPOSE_2X16_BZ( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (bytes)
+    TRANSPOSE_2X16_BZ( 25, 29 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    // Advance A by one 6-row tile and persist the pointer.
+    lea( mem( , r8, 2 ), rdx )          // rdx = rs_a * 2
+    lea( mem( rdx, r8, 4 ), rdx )       // rdx = rs_a * 2 + rs_a * 4 = rs_a * 6
+    mov( var( abuf ), rax )             // load address of a
+    add( rdx, rax )                     // a += rs_a * 6(MR)
+    mov( rax, var( abuf ) )             // store updated a
+
+    // Restore rdi = rs_c in bytes (the column-stored paths replaced it with
+    // cs_c), then advance C by one 6-row tile and persist the pointer.
+    mov( var( rs_c ), rdi )
+    lea( mem(    , rdi, 4 ), rdi )      // rdi = rs_c * sizeof(float)
+    lea( mem(    , rdi, 2 ), rdx )      // rdx = rs_c * 2
+    lea( mem( rdx, rdi, 4 ), rdx )      // rdx = rs_c * 2 + rs_c * 4 = rs_c * 6
+    mov( var( cbuf ), rcx )             // load address of c
+    add( rdx, rcx )                     // c += rs_c * 6(MR)
+    mov( rcx, var( cbuf ) )             // store updated c
+
+    dec( r11 )
+    jne( .M_LOOP_ITER )                 // next 6-row tile, if any
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [m_iter] "m" (m_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist. m_left is in
+    // [1, 5] here, so it decomposes into at most one each of 4, 2 and 1 rows.
+    if (m_left)
+    {
+        // First row not covered by the 6-row tiles above.
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict ai  = a + i_edge*rs_a;
+        float* restrict bj  = b;
+
+        if ( 4 <= m_left )
+        {
+            const dim_t mr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_4x32m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_2x32m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 1 <= m_left )
+        {
+            const dim_t mr_cur = 1;
+            bli_sgemmsup_rv_zen_asm_1x32m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+    }
+}
+
+/**
+ * sgemm "sup" kernel that works on 6-row by 16-column blocks of C and loops
+ * over the m dimension.
+ *
+ * The assembly loop runs m0 / 6 times. Each pass accumulates over k (main
+ * loop unrolled by 4 plus a k0 % 4 remainder loop), scales the accumulators
+ * by alpha and then combines them with C. There are separate paths for
+ * beta == 0 and for row- versus column-stored C; the column-stored paths are
+ * taken when rs_c == 1. After each pass the saved A and C pointers are
+ * advanced by 6 rows; the B pointer is reloaded unchanged.
+ *
+ * The m0 % 6 leftover rows are handed to the 4x16m, 2x16m and 1x16m kernels.
+ *
+ * Parameters:
+ *   conja, conjb  not read here; forwarded to the edge-case kernels.
+ *   m0            number of rows to process.
+ *   n0            not read by the assembly; forwarded to the edge kernels.
+ *   k0            length of the k dimension.
+ *   alpha, beta   pointers to the scalars; each is broadcast from memory.
+ *   a, rs_a0, cs_a0   A and its row/column strides, in elements.
+ *   b, rs_b0, cs_b0   B and its strides; only rs_b is used by the assembly,
+ *                     and each row of B is read as 16 contiguous floats.
+ *   c, rs_c0, cs_c0   C and its strides, in elements.
+ *   data, cntx    not read here; forwarded to the edge-case kernels.
+ *
+ * NOTE(review): INIT_REG, VFMA1, ALPHA_SCALE1, UPDATE_C1*, TRANSPOSE_4X16* and
+ * TRANSPOSE_2X16* are macros defined outside this function. Comments about
+ * their effect follow their names and call sites; confirm against the
+ * definitions.
+ */
+void bli_sgemmsup_rv_zen_asm_6x16m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // k is consumed by a 4x unrolled main loop plus a remainder loop.
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    // Full 6-row blocks go through the assembly; the rest through edge kernels.
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    // 64-bit copies of the strides (in elements) for use as "m" asm operands.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Running pointers; abuf and cbuf are updated by the assembly after every
+    // 6-row block, bbuf is only read.
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( m_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    // Convert the strides from elements to bytes (sizeof(float) == 4).
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // r8  = rs_a * 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // r9  = rs_b * 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // r10 = cs_a * 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rdi = rs_c * 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes)
+
+    mov( var( m_iter ), r11 )       // r11 = number of 6-row blocks left
+
+
+    label( .M_LOOP_ITER )
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    // C Prefetch
+    lea( mem( rcx, rdi, 2 ), rdx )
+    lea( mem( rdx, rdi, 1 ), rdx )  // rdx = c + 3 * rs_c (row 3)
+
+    cmp( imm( 4 ), rdi )            // 4*rs_c == 4, i.e. column-stored C
+    jz( .SPOSTPFETCH )  // haven't added col-prefetch cases
+
+
+    label( .SROWPFETCH )
+    // Prefetch rows 0-2 (based at rcx) and rows 3-5 (based at rdx) of C,
+    // 56 bytes into each row.
+    prefetch( 0, mem( rcx,         7*8 ) )
+    prefetch( 0, mem( rcx, rdi, 1, 7*8 ) )
+    prefetch( 0, mem( rcx, rdi, 2, 7*8 ) )
+    prefetch( 0, mem( rdx,         7*8 ) )
+    prefetch( 0, mem( rdx, rdi, 1, 7*8 ) )
+    prefetch( 0, mem( rdx, rdi, 2, 7*8 ) )
+
+
+    label( .SPOSTPFETCH )
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // no full group of 4; go to the remainder
+
+
+    // Each pass of the main k-loop consumes 4 rows of B. For every row of B,
+    // one element from each of the 6 rows of A is broadcast and fed to VFMA1.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 16 floats from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Rows 0..5 of A at the current k; broadcast registers zmm4-6 are reused.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA1( 6, 28 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 1
+    // Load 16 floats from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA1( 6, 28 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 16 floats from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA1( 6, 28 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 16 floats from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA1( 6, 28 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+    // Remainder loop: same body as one ITER above, one row of B per pass.
+    label( .K_LEFT_LOOP )
+    // Load 16 floats from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA1( 5, 24 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA1( 6, 28 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+    label(.SPOSTACCUM)
+    // Scaling A * B with alpha.
+    ALPHA_SCALE1( 7, 8 )
+    ALPHA_SCALE1( 7, 12 )
+    ALPHA_SCALE1( 7, 16 )
+    ALPHA_SCALE1( 7, 20 )
+    ALPHA_SCALE1( 7, 24 )
+    ALPHA_SCALE1( 7, 28 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    // One call per row of the 6x16 block (accumulators zmm8..zmm28, step 4).
+    UPDATE_C1( 4, 8 )
+    UPDATE_C1( 4, 12 )
+    UPDATE_C1( 4, 16 )
+    UPDATE_C1( 4, 20 )
+    UPDATE_C1( 4, 24 )
+    UPDATE_C1( 4, 28 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * The 6x16 tile is handled as a 4x16 tile (rows 0-3) followed by a
+     * 2x16 tile (rows 4-5).
+     * NOTE(review): going by the macro names, these are transposed to
+     * 16x4 and 16x2 before being written to column-stored C.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    // Rows 4-5 start 4 elements into each column. This path is only reached
+    // when rs_c == 1, so that is a fixed 16-byte offset. It must not be
+    // derived from r10, which holds the A column stride.
+    lea( mem( rcx, 4*4 ), rcx )
+    TRANSPOSE_2X16( 24, 28 )
+
+    jmp( .SDONE )                       // jump to the end
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C1_BZ( 8 )
+    UPDATE_C1_BZ( 12 )
+    UPDATE_C1_BZ( 16 )
+    UPDATE_C1_BZ( 20 )
+    UPDATE_C1_BZ( 24 )
+    UPDATE_C1_BZ( 28 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * Same split as in .SCOLSTORED: a 4x16 tile (rows 0-3) followed by a
+     * 2x16 tile (rows 4-5), using the beta == 0 variants of the macros.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    // rs_c == 1 on this path, so rows 4-5 start 16 bytes into each column.
+    lea( mem( rcx, 4*4 ), rcx )
+    TRANSPOSE_2X16_BZ( 24, 28 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    // Advance the saved A pointer by one 6-row block.
+    lea( mem( , r8, 2 ), rdx )          // rdx = rs_a * 2
+    lea( mem( rdx, r8, 4 ), rdx )       // rdx = rs_a * 2 + rs_a * 4 = rs_a * 6
+    mov( var( abuf ), rax )             // load address of a
+    add( rdx, rax )                     // a += rs_a * 6(MR)
+    mov( rax, var( abuf ) )             // store updated a
+
+    // Advance the saved C pointer by one 6-row block. rdi is reloaded from
+    // rs_c because the column-stored paths overwrite it with cs_c.
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem(    , rdi, 4 ), rdi )      // rdi = rs_c * sizeof(float)
+    lea( mem(    , rdi, 2 ), rdx )      // rdx = rs_c * 2
+    lea( mem( rdx, rdi, 4 ), rdx )      // rdx = rs_c * 2 + rs_c * 4 = rs_c * 6
+    mov( var( cbuf ), rcx )             // load address of c
+    add( rdx, rcx )                     // c += rs_c * 6(MR)
+    mov( rcx, var( cbuf ) )             // store updated c
+
+    dec( r11 )
+    jne( .M_LOOP_ITER )                 // next 6-row block, if any
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [m_iter] "m" (m_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if (m_left)
+    {
+        // First row not covered by the 6-row blocks above.
+        const dim_t i_edge = m0 - ( dim_t )m_left;
+
+        float* restrict cij = c + i_edge*rs_c;
+        float* restrict ai  = a + i_edge*rs_a;
+        float* restrict bj  = b;
+
+        // m_left is in 1..5; peel it off as 4 + 2 + 1 rows, advancing the
+        // A and C pointers after each sub-kernel.
+        if ( 4 <= m_left )
+        {
+            const dim_t mr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_4x16m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 2 <= m_left )
+        {
+            const dim_t mr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_2x16m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+
+        if ( 1 <= m_left )
+        {
+            const dim_t mr_cur = 1;
+            bli_sgemmsup_rv_zen_asm_1x16m_avx512
+            (
+              conja, conjb, mr_cur, n0, k0, alpha,
+              ai, rs_a0, cs_a0,
+              bj, rs_b0, cs_b0,
+              beta,
+              cij, rs_c0, cs_c0,
+              data, cntx
+            );
+            cij += mr_cur * rs_c;
+            ai  += mr_cur * rs_a;
+            m_left -= mr_cur;
+        }
+    }
+}
+
+/**
+ * sgemm "sup" kernel for a single block of C: 4 rows by 64 columns (four
+ * 16-float zmm vectors per row).
+ *
+ * The block is accumulated over k (main loop unrolled by 4 plus a k0 % 4
+ * remainder loop), scaled by alpha and then combined with C. There are
+ * separate paths for beta == 0 and for row- versus column-stored C; the
+ * column-stored paths are taken when rs_c == 1.
+ *
+ * There is no loop over m in this function: the assembly processes exactly
+ * one block starting at a, b and c.
+ *
+ * Parameters:
+ *   conja, conjb, data, cntx   not referenced in the body.
+ *   m0, n0        listed as asm operands, but no instruction written out
+ *                 below reads them.
+ *   k0            length of the k dimension.
+ *   alpha, beta   pointers to the scalars; each is broadcast from memory.
+ *   a, rs_a0, cs_a0   A and its row/column strides, in elements.
+ *   b, rs_b0, cs_b0   B and its strides; only rs_b is used, and each row of
+ *                     B is read as 64 contiguous floats.
+ *   c, rs_c0, cs_c0   C and its strides, in elements.
+ *
+ * NOTE(review): INIT_REG, VFMA4, ALPHA_SCALE4, UPDATE_C4* and TRANSPOSE_4X16*
+ * are macros defined outside this function. Comments about their effect
+ * follow their names and call sites; confirm against the definitions.
+ */
+void bli_sgemmsup_rv_zen_asm_4x64m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // k is consumed by a 4x unrolled main loop plus a remainder loop.
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    // 64-bit copies of the strides (in elements) for use as "m" asm operands.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Base pointers loaded by the assembly; they are only read here.
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    // Convert the strides from elements to bytes (sizeof(float) == 4).
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (no instruction written out below reads r15)
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // no full group of 4; go to the remainder
+
+
+    // Each pass of the main k-loop consumes 4 rows of B. For every row of B,
+    // one element from each of the 4 rows of A is broadcast and fed to VFMA4.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Rows 0..3 of A at the current k; zmm4 is reused for row 3.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 1
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    // Remainder loop: same body as one ITER above, one row of B per pass.
+    label( .K_LEFT_LOOP )
+
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15)
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+    ALPHA_SCALE4( 7, 16, 17, 18, 19 )
+    ALPHA_SCALE4( 7, 20, 21, 22, 23 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    // One call per row of the 4x64 block (four accumulators per row).
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+    UPDATE_C4( 4, 16, 17, 18, 19 )
+    UPDATE_C4( 4, 20, 21, 22, 23 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * The 4x64 tile is handled as four 4x16 tiles, one per group of 16
+     * columns.
+     * NOTE(review): going by the macro name, each 4x16 tile is transposed
+     * to 16x4 before being written to column-stored C.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    // rcx is advanced by 4 * r12 = 12 * cs_c between tiles.
+    // NOTE(review): presumably the macro itself advances rcx by the other
+    // 4 columns of each 16-column tile; confirm in its definition.
+    TRANSPOSE_4X16(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 11, 15, 19, 23 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+    UPDATE_C4_BZ( 16, 17, 18, 19 )
+    UPDATE_C4_BZ( 20, 21, 22, 23 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * Same split as in .SCOLSTORED: four 4x16 tiles, one per group of 16
+     * columns, using the beta == 0 variant of the macro.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) 
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm1", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+/**
+ * sgemm "sup" kernel for a single block of C: 4 rows by 48 columns (three
+ * 16-float zmm vectors per row).
+ *
+ * The block is accumulated over k (main loop unrolled by 4 plus a k0 % 4
+ * remainder loop), scaled by alpha and then combined with C. There are
+ * separate paths for beta == 0 and for row- versus column-stored C; the
+ * column-stored paths are taken when rs_c == 1.
+ *
+ * There is no loop over m in this function: the assembly processes exactly
+ * one block starting at a, b and c.
+ *
+ * Parameters:
+ *   conja, conjb, data, cntx   not referenced in the body.
+ *   m0, n0        listed as asm operands, but no instruction written out
+ *                 below reads them.
+ *   k0            length of the k dimension.
+ *   alpha, beta   pointers to the scalars; each is broadcast from memory.
+ *   a, rs_a0, cs_a0   A and its row/column strides, in elements.
+ *   b, rs_b0, cs_b0   B and its strides; only rs_b is used, and each row of
+ *                     B is read as 48 contiguous floats.
+ *   c, rs_c0, cs_c0   C and its strides, in elements.
+ *
+ * NOTE(review): INIT_REG, VFMA3, ALPHA_SCALE3, UPDATE_C3* and TRANSPOSE_4X16*
+ * are macros defined outside this function. Comments about their effect
+ * follow their names and call sites; confirm against the definitions.
+ */
+void bli_sgemmsup_rv_zen_asm_4x48m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    // k is consumed by a 4x unrolled main loop plus a remainder loop.
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    // 64-bit copies of the strides (in elements) for use as "m" asm operands.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    // Base pointers loaded by the assembly; they are only read here.
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    // Convert the strides from elements to bytes (sizeof(float) == 4).
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (no instruction written out below reads r15)
+
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // no full group of 4; go to the remainder
+
+
+    // Each pass of the main k-loop consumes 4 rows of B. For every row of B,
+    // one element from each of the 4 rows of A is broadcast and fed to VFMA3.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 48 floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Rows 0..3 of A at the current k; zmm4 is reused for row 3.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 1
+    // Load 48 floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 48 floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 48 floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    // Remainder loop: same body as one ITER above, one row of B per pass.
+    label( .K_LEFT_LOOP )
+
+    // Load 48 floats (3 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Rows 0..3 of A at the current k.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA3( 6, 16, 17, 18 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA3( 4, 20, 21, 22 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE3( 7,  8,  9, 10 )
+    ALPHA_SCALE3( 7, 12, 13, 14 )
+    ALPHA_SCALE3( 7, 16, 17, 18 )
+    ALPHA_SCALE3( 7, 20, 21, 22 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    // One call per row of the 4x48 block (three accumulators per row).
+    UPDATE_C3( 4, 8, 9, 10 )
+    UPDATE_C3( 4, 12, 13, 14 )
+    UPDATE_C3( 4, 16, 17, 18 )
+    UPDATE_C3( 4, 20, 21, 22 )
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * The 4x48 tile is handled as three 4x16 tiles, one per group of 16
+     * columns.
+     * NOTE(review): going by the macro name, each 4x16 tile is transposed
+     * to 16x4 before being written to column-stored C.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    // rcx is advanced by 4 * r12 = 12 * cs_c between tiles.
+    // NOTE(review): presumably the macro itself advances rcx by the other
+    // 4 columns of each 16-column tile; confirm in its definition.
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )      // no instruction written out below reads rcx after this
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C3_BZ( 8, 9, 10 )
+    UPDATE_C3_BZ( 12, 13, 14 )
+    UPDATE_C3_BZ( 16, 17, 18 )
+    UPDATE_C3_BZ( 20, 21, 22 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * Same split as in .SCOLSTORED: three 4x16 tiles, one per group of 16
+     * columns, using the beta == 0 variant of the macro.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm1", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_4x32m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k steps, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Produce a 4x32 output tile: 4 broadcasts from A x 2 zmm loads of B per k step. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(float) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(float) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(float) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (byte offset of the 4th row of A)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (not referenced in this kernel's visible code)
+
+    INIT_REG                        // macro defined outside this view; presumably zeroes the accumulators (TODO confirm)
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // if k_iter == 0, skip the unrolled loop
+
+
+    // Main k-loop, unrolled 4x: each ITER consumes one row of B and one column step of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 1
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 2
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 3
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, skip the k_left loop.
+                                    // else, fall through into the k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA2( 6, 16, 17 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA2( 4, 20, 21 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE2( 7,  8,  9 )
+    ALPHA_SCALE2( 7, 12, 13 )
+    ALPHA_SCALE2( 7, 16, 17 )
+    ALPHA_SCALE2( 7, 20, 21 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C2( 4, 8, 9 )
+    UPDATE_C2( 4, 12, 13 )
+    UPDATE_C2( 4, 16, 17 )
+    UPDATE_C2( 4, 20, 21 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * Column-stored C, beta != 0: the 4x32 accumulator tile is handled as
+     * two 4x16 tiles, one TRANSPOSE_4X16 invocation per tile.
+     * The macro is defined outside this view; presumably it transposes a
+     * 4x16 tile to 16x4 and updates C using beta (TODO confirm).
+     * NOTE(review): whether the macro itself advances rcx is not visible here.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c
+
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12
+    TRANSPOSE_4X16( 9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 (rcx is not read again before .SDONE)
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C2_BZ( 8, 9 )
+    UPDATE_C2_BZ( 12, 13 )
+    UPDATE_C2_BZ( 16, 17 )
+    UPDATE_C2_BZ( 20, 21 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * Column-stored C, beta == 0: the 4x32 accumulator tile is handled as
+     * two 4x16 tiles, one TRANSPOSE_4X16_BZ invocation per tile.
+     * The macro is defined outside this view; presumably it transposes a
+     * 4x16 tile to 16x4 and stores it without reading C (TODO confirm).
+     * NOTE(review): whether the macro itself advances rcx is not visible here.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12
+    TRANSPOSE_4X16_BZ( 9, 13, 17, 21 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm1", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_4x16m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k steps, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Produce a 4x16 output tile: 4 broadcasts from A x 1 zmm load of B per k step. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(float) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(float) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(float) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (byte offset of the 4th row of A)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (not referenced in this kernel's visible code)
+
+
+    INIT_REG                        // macro defined outside this view; presumably zeroes the accumulators (TODO confirm)
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // if k_iter == 0, skip the unrolled loop
+
+
+    // Main k-loop, unrolled 4x: each ITER consumes one row of B and one column step of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 1
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 2
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 3
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, skip the k_left loop.
+                                    // else, fall through into the k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast one element from each of 4 rows of A (stride rs_a) & do VFMA with the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA1( 6, 16 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA1( 4, 20 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE1( 7,  8 )
+    ALPHA_SCALE1( 7, 12 )
+    ALPHA_SCALE1( 7, 16 )
+    ALPHA_SCALE1( 7, 20 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C1( 4,  8 )
+    UPDATE_C1( 4, 12 )
+    UPDATE_C1( 4, 16 )
+    UPDATE_C1( 4, 20 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * Column-stored C, beta != 0: the 4x16 accumulator tile is handled by
+     * a single TRANSPOSE_4X16 invocation.
+     * The macro is defined outside this view; presumably it transposes the
+     * 4x16 tile to 16x4 and updates C using beta (TODO confirm).
+     * NOTE(review): whether the macro itself advances rcx is not visible here.
+     */
+    /* Transposing the 4x16 tile to a 16x4 tile */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c
+
+    TRANSPOSE_4X16( 8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 4 * r12 (rcx is not read again before .SDONE)
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C1_BZ( 8 )
+    UPDATE_C1_BZ( 12 )
+    UPDATE_C1_BZ( 16 )
+    UPDATE_C1_BZ( 20 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * Column-stored C, beta == 0: the 4x16 accumulator tile is handled by
+     * a single TRANSPOSE_4X16_BZ invocation.
+     * The macro is defined outside this view; presumably it transposes the
+     * 4x16 tile to 16x4 and stores it without reading C (TODO confirm).
+     * NOTE(review): whether the macro itself advances rcx is not visible here.
+     */
+    /* Transposing the 4x16 tile to a 16x4 tile */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    TRANSPOSE_4X16_BZ( 8, 12, 16, 20 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm1", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_2x64m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k steps, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* Produce a 2x64 output tile: 2 broadcasts from A x 4 zmm loads of B per k step. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(float) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(float) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(float) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (not referenced in this kernel's visible code)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (not referenced in this kernel's visible code)
+
+    INIT_REG                        // macro defined outside this view; presumably zeroes the accumulators (TODO confirm)
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // if k_iter == 0, skip the unrolled loop
+
+
+    // Main k-loop, unrolled 4x: each ITER consumes one row of B and one column step of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 2 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 1
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 2 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 2
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 2 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    // ITER 3
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 2 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, skip the k_left loop.
+                                    // else, fall through into the k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 2 rows of A (stride rs_a) & do VFMA with the B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4,  8,  9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b (in bytes)
+    add( r10, rax )                 // a += cs_a (in bytes)
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * Column-stored C, beta != 0: the 2x64 accumulator tile is handled as
+     * four 2x16 tiles, one TRANSPOSE_2X16 invocation per tile.
+     * The macro is defined outside this view; presumably it transposes a
+     * 2x16 tile to 16x2 and updates C using beta (TODO confirm).
+     * NOTE(review): whether the macro itself advances rcx is not visible here.
+     */
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (not referenced outside macros in this path)
+
+    TRANSPOSE_2X16(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * rdi
+    TRANSPOSE_2X16(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * rdi
+    TRANSPOSE_2X16( 10, 14 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * rdi
+    TRANSPOSE_2X16( 11, 15 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * Column-stored C, beta == 0: the 2x64 accumulator tile is handled as
+     * four 2x16 tiles, one TRANSPOSE_2X16_BZ invocation per tile.
+     * The macro is defined outside this view; presumably it transposes a
+     * 2x16 tile to 16x2 and stores it without reading C (TODO confirm).
+     * NOTE(review): whether the macro itself advances rcx is not visible here.
+     */
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (not referenced outside macros in this path)
+
+    TRANSPOSE_2X16_BZ(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * rdi
+    TRANSPOSE_2X16_BZ(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * rdi
+    TRANSPOSE_2X16_BZ( 10, 14 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * rdi
+    TRANSPOSE_2X16_BZ( 11, 15 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_2x48m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 2 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 1
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 2 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 2 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 2 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 2 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4,  8,  9, 10 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA3( 5, 12, 13, 14 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE3( 7,  8,  9, 10 )
+    ALPHA_SCALE3( 7, 12, 13, 14 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C3( 4,  8,  9, 10 )
+    UPDATE_C3( 4, 12, 13, 14 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // rdi += rdi * 2 => rdi = 3 * rs_c
+
+    TRANSPOSE_2X16(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 10, 14 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C3_BZ(  8, 9, 10 )
+    UPDATE_C3_BZ( 12, 13, 14 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    TRANSPOSE_2X16_BZ(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 10, 14 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_2x32m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 2x32 tile: accumulate over k in zmm8-9 / zmm12-13; alpha/beta scaling and the store to C are done by the macros used below. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG                        // macro defined outside this function; presumably clears the accumulators - TODO confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )                // check k_iter via logical AND.
+    je( .CONSID_K_LEFT )            // if k_iter == 0, skip the unrolled loop.
+
+
+    // The k-loop is unrolled 4x: each ITER consumes one row of B and one column of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 2
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 3
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 32 floats (2 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4,  8,  9 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA2( 5, 12, 13 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE2( 7,  8,  9 )
+    ALPHA_SCALE2( 7, 12, 13 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C2( 4,  8,  9 )
+    UPDATE_C2( 4, 12, 13 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * The 2x32 tile is handled as 2 tiles of 2x16 each
+     * (zmm8/zmm12 and zmm9/zmm13).
+     * Going by the macro name, each 2x16 tile is transposed to a 16x2
+     * tile so that it can be combined with column-stored C.
+     * NOTE(review): TRANSPOSE_2X16 is defined outside this function - confirm.
+     */
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    TRANSPOSE_2X16( 8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // c += 2 * cs_c
+    TRANSPOSE_2X16( 9, 13 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C2_BZ(  8,  9 )
+    UPDATE_C2_BZ( 12, 13 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * The 2x32 tile is handled as 2 tiles of 2x16 each
+     * (zmm8/zmm12 and zmm9/zmm13).
+     * Going by the macro name, each 2x16 tile is transposed to a 16x2
+     * tile so that it can be written to column-stored C (beta == 0 path).
+     * NOTE(review): TRANSPOSE_2X16_BZ is defined outside this function - confirm.
+     */
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    TRANSPOSE_2X16_BZ( 8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // c += 2 * cs_c
+    TRANSPOSE_2X16_BZ( 9, 13 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_2x16m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 2x16 tile: accumulate over k in zmm8 / zmm12; alpha/beta scaling and the store to C are done by the macros used below. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG                        // macro defined outside this function; presumably clears the accumulators - TODO confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )                // check k_iter via logical AND.
+    je( .CONSID_K_LEFT )            // if k_iter == 0, skip the unrolled loop.
+
+
+    // The k-loop is unrolled 4x: each ITER consumes one row of B and one column of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 2
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 3
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 16 floats (1 zmm vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast 2 elements of A (rows 0 and 1 of the current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4,  8 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA1( 5, 12 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE1( 7,  8 )
+    ALPHA_SCALE1( 7, 12 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C1( 4,  8 )
+    UPDATE_C1( 4, 12 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * The 2x16 tile is held in zmm8 (row 0) and zmm12 (row 1).
+     * Going by the macro name, the 2x16 tile is transposed to a 16x2
+     * tile so that it can be combined with column-stored C.
+     * NOTE(review): TRANSPOSE_2X16 is defined outside this function -
+     * confirm its use of rcx, rdi and r12.
+     */
+    /* Transposing the 2x16 tile to a 16x2 tile */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    TRANSPOSE_2X16( 8, 12 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C1_BZ(  8 )
+    UPDATE_C1_BZ( 12 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * The 2x16 tile is held in zmm8 (row 0) and zmm12 (row 1).
+     * Going by the macro name, the 2x16 tile is transposed to a 16x2
+     * tile so that it can be written to column-stored C (beta == 0 path).
+     * NOTE(review): TRANSPOSE_2X16_BZ is defined outside this function -
+     * confirm its use of rcx, rdi and r12.
+     */
+    /* Transposing the 2x16 tile to a 16x2 tile */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    TRANSPOSE_2X16_BZ( 8, 12 )
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_1x64m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, handled one at a time
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 1x64 tile: accumulate over k in zmm8-11; alpha/beta scaling and the store to C are done by the macros used below. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG                        // macro defined outside this function; presumably clears the accumulators - TODO confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )                // check k_iter via logical AND.
+    je( .CONSID_K_LEFT )            // if k_iter == 0, skip the unrolled loop.
+
+
+    // The k-loop is unrolled 4x: each ITER consumes one row of B and one element of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 1 element of A (current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 1 element of A (current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 2
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 1 element of A (current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 3
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 1 element of A (current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+    label( .K_LEFT_LOOP )
+
+    // Load 64 floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 1 element of A (current column) & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7, 8, 9, 10, 11 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4, 8, 9, 10, 11 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float) => cs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    UPDATE_C_1X16(  8 )
+    UPDATE_C_1X16(  9 )
+    UPDATE_C_1X16( 10 )
+    UPDATE_C_1X16( 11 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ( 8, 9, 10, 11 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c
+
+    UPDATE_C_1X16_BZ(  8 )
+    UPDATE_C_1X16_BZ(  9 )
+    UPDATE_C_1X16_BZ( 10 )
+    UPDATE_C_1X16_BZ( 11 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_1x48m_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 1 element from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 1
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 1 element from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 1 element from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 1 element from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+    label( .K_LEFT_LOOP )
+
+    // Load 3 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+
+    // Broadcast 1 element from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA3( 4, 8, 9, 10 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE3( 7, 8, 9, 10 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C3( 4, 8, 9, 10 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // rdi += rdi * 2 => rdi = 3 * rs_c
+
+    UPDATE_C_1X16(  8 )
+    UPDATE_C_1X16(  9 )
+    UPDATE_C_1X16( 10 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C3_BZ( 8, 9, 10 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    UPDATE_C_1X16_BZ(  8 )
+    UPDATE_C_1X16_BZ(  9 )
+    UPDATE_C_1X16_BZ( 10 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_1x32m_avx512
+     (
+       conj_t              conja,      // not referenced in this function body
+       conj_t              conjb,      // not referenced in this function body
+       dim_t               m0,         // only listed as an asm operand; no var( m0 ) appears below
+       dim_t               n0,         // only listed as an asm operand; no var( n0 ) appears below
+       dim_t               k0,         // k dimension; split into k_iter / k_left below
+       float*     restrict alpha,      // pointer to alpha; broadcast into zmm7
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,   // A and its strides (in elements; scaled by 4 in the asm)
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,   // B and its strides (cs_b0 is copied but never loaded below)
+       float*     restrict beta,       // pointer to beta; compared with 0 to select the store path
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,   // C and its strides (in elements)
+       auxinfo_t* restrict data,       // not referenced in this function body
+       cntx_t*    restrict cntx        // not referenced in this function body
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled .K_LOOP_ITER
+    uint64_t k_left = k0 % 4;       // leftover k steps, run one at a time in .K_LEFT_LOOP
+
+    uint64_t rs_a   = rs_a0;        // 64-bit copies: the asm loads these with 64-bit mov( var( ... ), reg )
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;        // no var( cs_b ) load appears in this kernel
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;                // pointer copies loaded by the asm via var( abuf / bbuf / cbuf )
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 1x32 kernel body: walk k, feeding 32 floats of B's row and one broadcast A element to VFMA2, scale by alpha, write C. */
+    // INIT_REG, VFMA2, ALPHA_SCALE2 and UPDATE_C* are macros defined outside this view; roles noted below are inferred - confirm.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes); no visible use in this 1-row kernel
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes); no visible use in this 1-row kernel
+
+    INIT_REG                        // macro (outside this view); presumably clears the accumulators - confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha replicated in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )                // ZF = 1 when k_iter == 0
+    je( .CONSID_K_LEFT )            // no full 4x pass: go straight to the remainder handling
+
+
+    // Each pass of this loop is unrolled 4x over k: ITER 0..3 each consume one row of B and one element of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 32 contiguous floats (two ZMM vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA2 (presumably zmm8/zmm9 += zmm4 * zmm0/zmm1).
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+
+    add(  r9, rbx )                 // b += rs_b bytes: next row of B
+    add( r10, rax )                 // a += cs_a bytes: next element along A's row
+
+    // ITER 1
+    // Load 32 contiguous floats (two ZMM vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA2 on the two B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+
+    // ITER 2
+    // Load 32 contiguous floats (two ZMM vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA2 on the two B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+
+    // ITER 3
+    // Load 32 contiguous floats (two ZMM vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA2 on the two B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+
+    dec( rsi )                      // one 4x-unrolled pass done
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )           // same step as one ITER above, repeated k_left times
+
+    // Load 32 contiguous floats (two ZMM vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA2 on the two B vectors.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA2( 4, 8, 9 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE2( 7, 8, 9 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta replicated in every lane (xmm4 is its low part)
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0f
+    vucomiss( xmm1, xmm4 )          // check if beta = 0 (x86 also sets ZF when the compare is unordered, i.e. NaN)
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )            // reached by fall-through when rs_c != 1
+
+    UPDATE_C2( 4, 8, 9 )            // macro (outside this view); presumably combines beta (zmm4) with C and zmm8/zmm9 - confirm
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float) => cs_c * 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16( 8 )                  // macro (outside this view), applied to the results held in zmm8
+    UPDATE_C_1X16( 9 )                  // same macro, applied to the results held in zmm9
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )                // reached by fall-through when rs_c != 1
+
+    UPDATE_C2_BZ( 8, 9 )                // beta == 0 variant (macro outside this view); takes no beta register argument
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16_BZ( 8 )               // beta == 0 variant of the 1x16 column store, for zmm8
+    UPDATE_C_1X16_BZ( 9 )               // beta == 0 variant of the 1x16 column store, for zmm9
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (all "m"; a, b, c, cs_b, n0, m0 have no var() reference in the instructions above)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list (a superset of the registers named explicitly in this kernel)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
+
+void bli_sgemmsup_rv_zen_asm_1x16m_avx512
+     (
+       conj_t              conja,      // not referenced in this function body
+       conj_t              conjb,      // not referenced in this function body
+       dim_t               m0,         // only listed as an asm operand; no var( m0 ) appears below
+       dim_t               n0,         // only listed as an asm operand; no var( n0 ) appears below
+       dim_t               k0,         // k dimension; split into k_iter / k_left below
+       float*     restrict alpha,      // pointer to alpha; broadcast into zmm7
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,   // A and its strides (in elements; scaled by 4 in the asm)
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,   // B and its strides (cs_b0 is copied but never loaded below)
+       float*     restrict beta,       // pointer to beta; compared with 0 to select the store path
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,   // C and its strides (in elements)
+       auxinfo_t* restrict data,       // not referenced in this function body
+       cntx_t*    restrict cntx        // not referenced in this function body
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled .K_LOOP_ITER
+    uint64_t k_left = k0 % 4;       // leftover k steps, run one at a time in .K_LEFT_LOOP
+
+    uint64_t rs_a   = rs_a0;        // 64-bit copies: the asm loads these with 64-bit mov( var( ... ), reg )
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;        // no var( cs_b ) load appears in this kernel
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;                // pointer copies loaded by the asm via var( abuf / bbuf / cbuf )
+    float *bbuf = b;
+    float *cbuf = c;
+
+    /* 1x16 kernel body: walk k, feeding 16 floats of B's row and one broadcast A element to VFMA1, scale by alpha, write C. */
+    // INIT_REG, VFMA1, ALPHA_SCALE1 and UPDATE_C* are macros defined outside this view; roles noted below are inferred - confirm.
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float) => rs_c *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (bytes); no visible use in this 1-row kernel
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (bytes); no visible use in this 1-row kernel
+
+    INIT_REG                        // macro (outside this view); presumably clears the accumulators - confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha replicated in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )                // ZF = 1 when k_iter == 0
+    je( .CONSID_K_LEFT )            // no full 4x pass: go straight to the remainder handling
+
+
+    // Each pass of this loop is unrolled 4x over k: ITER 0..3 each consume one row of B and one element of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 16 contiguous floats (one ZMM vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA1 (presumably zmm8 += zmm4 * zmm0).
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+
+    add(  r9, rbx )                 // b += rs_b bytes: next row of B
+    add( r10, rax )                 // a += cs_a bytes: next element along A's row
+
+    // ITER 1
+    // Load 16 contiguous floats (one ZMM vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA1 on the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+
+    // ITER 2
+    // Load 16 contiguous floats (one ZMM vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA1 on the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+
+    // ITER 3
+    // Load 16 contiguous floats (one ZMM vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA1 on the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+
+    dec( rsi )                      // one 4x-unrolled pass done
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )           // same step as one ITER above, repeated k_left times
+
+    // Load 16 contiguous floats (one ZMM vector) from the current row of B.
+    vmovups( ( rbx ), zmm0 )
+
+    // Broadcast the current element of A's row into zmm4, then VFMA1 on the B vector.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA1( 4, 8 )
+
+    add(  r9, rbx )                 // next row of B
+    add( r10, rax )                 // next element of A
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE1( 7, 8 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta replicated in every lane (xmm4 is its low part)
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0f
+    vucomiss( xmm1, xmm4 )          // check if beta = 0 (x86 also sets ZF when the compare is unordered, i.e. NaN)
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )            // reached by fall-through when rs_c != 1
+
+    UPDATE_C1( 4, 8 )               // macro (outside this view); presumably combines beta (zmm4) with C and zmm8 - confirm
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float) => cs_c * 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16( 8 )                  // macro (outside this view), applied to the results held in zmm8
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )                // reached by fall-through when rs_c != 1
+
+    UPDATE_C1_BZ( 8 )                   // beta == 0 variant (macro outside this view); takes no beta register argument
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16_BZ( 8 )               // beta == 0 variant of the 1x16 column store, for zmm8
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (all "m"; a, b, c, cs_b, n0, m0 have no var() reference in the instructions above)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list (a superset of the registers named explicitly in this kernel)
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+}
diff --git a/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c
new file mode 100644
index 0000000000..e4ce3d1490
--- /dev/null
+++ b/kernels/zen4/3/sup/bli_gemmsup_rv_zen_s6x64n.c
@@ -0,0 +1,3067 @@
+/*
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "blis.h"
+
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+
+#include "bli_gemmsup_rv_zen_s6x64.h"
+
+#define MR 6
+
+/*
+   rrr:
+     --------        ------        --------  
+     --------        ------        --------  
+     --------   +=   ------ ...    --------  
+     --------        ------        --------  
+     --------        ------            : 
+     --------        ------            : 
+   Assumptions:
+   - B is row-stored;
+   - A is row-stored;
+   - m0 is at most MR (6); n0 may exceed NR (64): the kernel loops over n in NR-wide panels and passes the remainder to narrower kernels.
+   Therefore, this (r)ow-preferential kernel is well-suited for contiguous
+   (v)ector loads on B and single-element broadcasts from A.
+
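+   NOTE(review): an earlier note here said these kernels lack in-register transpose and column-oriented IO; the 6x64n
+   kernel below does have .SCOLSTORED/.SCOLSTORBZ paths built on TRANSPOSE_* macros, so verify which statement is current.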
+*/
+void bli_sgemmsup_rv_zen_asm_6x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t m_left = m0 % MR;      // m0 is expected to be m0<=MR
+
+    if ( m_left ) {
+        float* restrict cij = c;
+        float* restrict bj  = b;
+        float* restrict ai  = a;
+
+        if ( 5 <= m_left ) {
+            bli_sgemmsup_rv_zen_asm_5x64n_avx512(
+              conja, conjb, m_left, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            return;
+        }
+
+        if ( 4 <= m_left ) {
+            bli_sgemmsup_rv_zen_asm_4x64n_avx512(
+              conja, conjb, m_left, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            return;
+        }
+
+        if ( 3 <= m_left ) {
+            bli_sgemmsup_rv_zen_asm_3x64n_avx512(
+              conja, conjb, m_left, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            return;
+        }
+
+        if ( 2 <= m_left ) {
+            bli_sgemmsup_rv_zen_asm_2x64n_avx512(
+              conja, conjb, m_left, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            return;
+        }
+
+        if ( 1 <= m_left ) {
+            bli_sgemmsup_rv_zen_asm_1x64n_avx512(
+              conja, conjb, m_left, n0, k0,
+              alpha, ai, rs_a0, cs_a0, bj, rs_b0, cs_b0,
+              beta, cij, rs_c0, cs_c0, data, cntx
+            );
+            return;
+        }
+    }
+
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t n_iter = n0 / 64;
+    uint64_t n_left = n0 % 64;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    mov( var( n_iter ), r11 )       // load n_iter
+
+    label( .N_LOOP_ITER )
+
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float)
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 1
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 6 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+    vbroadcastss( mem( rax, r15, 1 ), zmm6 )
+    VFMA4( 6, 28, 29, 30, 31 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+    ALPHA_SCALE4( 7, 16, 17, 18, 19 )
+    ALPHA_SCALE4( 7, 20, 21, 22, 23 )
+    ALPHA_SCALE4( 7, 24, 25, 26, 27 )
+    ALPHA_SCALE4( 7, 28, 29, 30, 31 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+    UPDATE_C4( 4, 16, 17, 18, 19 )
+    UPDATE_C4( 4, 20, 21, 22, 23 )
+    UPDATE_C4( 4, 24, 25, 26, 27 )
+    UPDATE_C4( 4, 28, 29, 30, 31 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // rdi += rdi * 2 => rdi = 3 * rs_c
+
+    TRANSPOSE_4X16(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 11, 15, 19, 23 )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    lea( mem( rcx, r10, 4 ), rcx )
+    TRANSPOSE_2X16( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 25, 29 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 26, 30 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16( 27, 31 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+    UPDATE_C4_BZ( 16, 17, 18, 19 )
+    UPDATE_C4_BZ( 20, 21, 22, 23 )
+    UPDATE_C4_BZ( 24, 25, 26, 27 )
+    UPDATE_C4_BZ( 28, 29, 30, 31 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    TRANSPOSE_4X16_BZ(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) 
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    lea( mem( rcx, r10, 4 ), rcx )
+    TRANSPOSE_2X16_BZ( 24, 28 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 25, 29 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 26, 30 )
+    lea( mem( rcx, rdi, 2 ), rcx )
+    TRANSPOSE_2X16_BZ( 27, 31 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    mov( var( cs_b ), rdx )
+    lea( mem( , rdx, 4 ), rdx )
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_b * 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx += cs_b * 8  => rdx = cs_b * 16
+    mov( var( bbuf ), rbx )
+    add( rdx, rbx )
+    mov( rbx, var( bbuf ) )
+
+    mov( var( cs_c ), rdx )
+    lea( mem( , rdx, 4 ), rdx )
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_c * 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = rdx * 8 = cs_c * 8 * 8 => rdx = cs_c * 64
+    mov( var( cbuf ), rcx )              // load address of c
+    add( rdx, rcx )                    // c += rs_c * MR
+    mov( rcx, var( cbuf ) )              // store updated c
+
+    dec( r11 )
+    jne( .N_LOOP_ITER )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [n_iter] "m" (n_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t mr_cur = 6;
+        const dim_t j_edge = n0 - ( dim_t )n_left;
+
+        uint64_t ps_b   = bli_auxinfo_ps_b( data );
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict bj  = b + n_iter * ps_b;
+        float* restrict ai  = a;
+
+        if ( 48 <= n_left )
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_6x48m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_6x32m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_6x16m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_6x8m
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_6x4m
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_6x2m
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 1 <= n_left )
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 6 * rs_a0 )
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 6;
+
+                // Since A is packed into row panels, we must use a loop over
+                // gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;
+                dim_t m_left =   m0            % mr;
+
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur*rs_c0;
+                    ai_ii  += ps_a0;
+                } 
+            }
+            n_left -= nr_cur;
+        }
+    }
+}
+
+void bli_sgemmsup_rv_zen_asm_5x64n_avx512   // sgemmsup kernel: 5 rows of C, n0 columns taken 64 at a time
+     (
+       conj_t              conja,   // not read by the asm; forwarded to the edge kernels
+       conj_t              conjb,   // not read by the asm; forwarded to the edge kernels and gemv
+       dim_t               m0,      // read only on the n_left == 1 gemv path; the asm always handles 5 rows
+       dim_t               n0,      // columns: n0 / 64 blocks go through the asm, n0 % 64 through edge cases
+       dim_t               k0,      // inner dimension: k0 / 4 unrolled passes plus k0 % 4 single steps
+       float*     restrict alpha,   // pointer to alpha; broadcast into zmm7
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,   // A with row/column strides in elements
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,   // B with row/column strides in elements
+       float*     restrict beta,    // pointer to beta; broadcast into zmm4
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,   // C with row/column strides in elements
+       auxinfo_t* restrict data,    // supplies ps_a / ps_b for the edge cases
+       cntx_t*    restrict cntx     // forwarded unchanged to the edge kernels and gemv
+     )
+{
+    uint64_t k_iter = k0 / 4;       // passes through the 4x-unrolled k loop
+    uint64_t k_left = k0 % 4;       // remaining k steps, done one at a time
+
+    uint64_t n_iter = n0 / 64;      // full 64-column blocks handled by the asm
+    uint64_t n_left = n0 % 64;      // trailing columns handled by the edge cases
+
+    uint64_t rs_a   = rs_a0;        // 64-bit copies of the strides, read by the asm via var()
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;                // reloaded at the top of every .N_LOOP_ITER pass
+    float *bbuf = b;                // advanced by the asm at .SDONE
+    float *cbuf = c;                // advanced by the asm at .SDONE
+
+    if ( n_iter == 0 ) goto consider_edge_cases;   // fewer than 64 columns: skip the asm
+
+    /* Produce MR x NR (5 x 64) output tiles, one per .N_LOOP_ITER pass. */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (not referenced again by the instructions visible here)
+
+    mov( var( n_iter ), r11 )       // r11 = n_iter, counter for the 64-column block loop
+
+    label( .N_LOOP_ITER )
+
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float)
+
+    INIT_REG                        // macro defined elsewhere; presumably zeroes the accumulators — confirm
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )            // k_iter == 0: go straight to the leftover loop
+
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 floats (4 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 5 rows of A & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )               // row 0
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )     // row 1
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )     // row 2
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )    // row 3 (r13 = 3 * rs_a)
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )     // row 4
+    VFMA4( 5, 24, 25, 26, 27 )
+  
+    add( r9, rbx )                  // b += rs_b (next row of B)
+    add( r10, rax )                 // a += cs_a (next column of A)
+
+    // ITER 1
+    // Load 64 floats (4 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 5 rows of A & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+  
+    add( r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 64 floats (4 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 5 rows of A & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+  
+    add( r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 64 floats (4 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 5 rows of A & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+  
+    add( r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )             // if rsi != 0, repeat the unrolled k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 64 floats (4 zmm registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of 5 rows of A & do VFMA with the row of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+    vbroadcastss( mem( rax, r8, 4 ), zmm5 )
+    VFMA4( 5, 24, 25, 26, 27 )
+  
+    add( r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )             // if rsi != 0, repeat the leftover loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+    ALPHA_SCALE4( 7, 16, 17, 18, 19 )
+    ALPHA_SCALE4( 7, 20, 21, 22, 23 )
+    ALPHA_SCALE4( 7, 24, 25, 26, 27 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )  // accumulators fed by the row-0 broadcast
+    UPDATE_C4( 4, 12, 13, 14, 15 )  // row 1
+    UPDATE_C4( 4, 16, 17, 18, 19 )  // row 2
+    UPDATE_C4( 4, 20, 21, 22, 23 )  // row 3
+    UPDATE_C4( 4, 24, 25, 26, 27 )  // row 4
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * 5x64 tile is split into 4 equal 5x16 tiles.
+     * Each of these 5x16 tiles is further split into two tiles of
+     * 4x16 & 1x16 each.
+     * The 4x16 tiles go through TRANSPOSE_4X16 and the 1x16 tiles (fifth
+     * row) go through UPDATE_C_1X16; this path is taken when rs_c == 1.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(dt) => cs_c * 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + rdi * 2 => r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_4X16(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c; presumably the macro already stepped 4 columns — confirm
+    TRANSPOSE_4X16(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 11, 15, 19, 23 )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 4 ), rcx )      // rcx = c + 4 * rs_c (fifth row of the tile)
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16( 24 )
+    UPDATE_C_1X16( 25 )
+    UPDATE_C_1X16( 26 )
+    UPDATE_C_1X16( 27 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )      // accumulators fed by the row-0 broadcast
+    UPDATE_C4_BZ( 12, 13, 14, 15 )      // row 1
+    UPDATE_C4_BZ( 16, 17, 18, 19 )      // row 2
+    UPDATE_C4_BZ( 20, 21, 22, 23 )      // row 3
+    UPDATE_C4_BZ( 24, 25, 26, 27 )      // row 4
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * 5x64 tile is split into 4 equal 5x16 tiles.
+     * Each of these 5x16 tiles is further split into two tiles of
+     * 4x16 & 1x16 each.
+     * The 4x16 tiles go through TRANSPOSE_4X16_BZ and the 1x16 tiles (fifth
+     * row) go through UPDATE_C_1X16_BZ; this path is taken when rs_c == 1.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_4X16_BZ(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )      // rcx += 12 * cs_c; presumably the macro already stepped 4 columns — confirm
+    TRANSPOSE_4X16_BZ(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) 
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( rs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c * sizeof(float)
+    lea( mem( rcx, rdi, 4 ), rcx )      // rcx = c + 4 * rs_c (fifth row of the tile)
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16_BZ( 24 )
+    UPDATE_C_1X16_BZ( 25 )
+    UPDATE_C_1X16_BZ( 26 )
+    UPDATE_C_1X16_BZ( 27 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    mov( var( cs_b ), rdx )
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_b * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_b * 8 elements (in bytes)
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_b * 64 elements (256 * cs_b bytes)
+    mov( var( bbuf ), rbx )
+    add( rdx, rbx )                 // b += 64 * cs_b (next block of 64 columns)
+    mov( rbx, var( bbuf ) )         // store updated b
+
+    mov( var( cs_c ), rdx )
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_c * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_c * 8 elements (in bytes)
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_c * 64 elements (256 * cs_c bytes)
+    mov( var( cbuf ), rcx )              // load address of c
+    add( rdx, rcx )                    // c += 64 * cs_c (next block of 64 columns)
+    mov( rcx, var( cbuf ) )              // store updated c
+
+    dec( r11 )                      // one 64-column block finished
+    jne( .N_LOOP_ITER )             // repeat while blocks remain
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [n_iter] "m" (n_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension (n0 % 64 trailing columns), if they exist.
+    if ( n_left )
+    {
+        const dim_t mr_cur = 5;
+        const dim_t j_edge = n0 - ( dim_t )n_left;     // first column not covered by the asm
+
+        uint64_t ps_b   = bli_auxinfo_ps_b( data );
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict bj  = b + n_iter * ps_b;       // NOTE(review): the asm stepped B by 64 * cs_b per block; assumes ps_b matches — confirm
+        float* restrict ai  = a;
+
+        // Peel the remainder in decreasing widths: 48, 32, 16, 8, 4, 2, 1.
+        if ( 48 <= n_left )
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_5x48_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_5x32_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_5x16_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_5x8
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_5x4
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_5x2
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        // A single remaining column is computed with gemv instead of a gemm kernel.
+        if ( 1 <= n_left )
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 5 * rs_a0 )   // panel stride equals 5 rows: one gemv can span all m0 rows
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 5;
+
+                // Since A is packed into row panels, we must use a loop over
+                // gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;    // number of panels, rounding up
+                dim_t m_left =   m0            % mr;    // rows in the final partial panel (0 if none)
+
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    // Note: this mr_cur shadows the const mr_cur declared above.
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur*rs_c0;     // next group of rows in C
+                    ai_ii  += ps_a0;            // next panel of A
+                } 
+            }
+            n_left -= nr_cur;
+        }
+    }
+}
+
+void bli_sgemmsup_rv_zen_asm_4x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;
+    uint64_t k_left = k0 % 4;
+
+    uint64_t n_iter = n0 / 64;
+    uint64_t n_left = n0 % 64;
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;
+    float *bbuf = b;
+    float *cbuf = c;
+
+    if ( n_iter == 0 ) goto consider_edge_cases;
+
+    /*Produce MRXNR outputs */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    mov( var( n_iter ), r11 )       // load n_iter
+
+    label( .N_LOOP_ITER )
+
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float)
+
+    INIT_REG
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+    // The k-loop iterates over 4 rows of B, and broadcasts of each row of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 4 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 1
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 4 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 2
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 4 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    // ITER 3
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 4 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 4 rows from B matrix.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast 4 elements from a row of A & do VFMA with rows of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+    vbroadcastss( mem( rax, r13, 1 ), zmm4 )
+    VFMA4( 4, 20, 21, 22, 23 )
+
+    add(  r9, rbx )
+    add( r10, rax )
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k-loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+    ALPHA_SCALE4( 7, 16, 17, 18, 19 )
+    ALPHA_SCALE4( 7, 20, 21, 22, 23 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+    UPDATE_C4( 4, 16, 17, 18, 19 )
+    UPDATE_C4( 4, 20, 21, 22, 23 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = rs_c *= sizeof(dt) => rs_c *= 4
+    lea( mem( rdi, rdi, 2 ), r12 )      // rdi += rdi * 2 => rdi = 3 * rs_c
+
+    TRANSPOSE_4X16(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16( 11, 15, 19, 23 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+    UPDATE_C4_BZ( 16, 17, 18, 19 )
+    UPDATE_C4_BZ( 20, 21, 22, 23 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /**
+     * 6x64 tile is split into 4 equal 6x16 tiles.
+     * Each of these 6x16 tiles is further split into two tiles of
+     * 4x16 & 2x16 each.
+     * These smaller 4x16 & 2x16 tiles are transposed to 16x4 & 16x2 tiles,
+     * to get the transpose of 6x64 tile and are stored as 64x6 tile.
+     */
+    /* Transposing 4x16 tiles to 16x4 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load rs_c
+    lea( mem( , rdi, 4 ), rdi )         // rs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )
+
+    TRANSPOSE_4X16_BZ(  8, 12, 16, 20 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ(  9, 13, 17, 21 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 10, 14, 18, 22 )
+    lea( mem( rcx, r12, 4 ), rcx )
+    TRANSPOSE_4X16_BZ( 11, 15, 19, 23 ) 
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    mov( var( cs_b ), rdx )
+    lea( mem( , rdx, 4 ), rdx )
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_b * 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx += cs_b * 8  => rdx = cs_b * 16
+    mov( var( bbuf ), rbx )
+    add( rdx, rbx )
+    mov( rbx, var( bbuf ) )
+
+    mov( var( cs_c ), rdx )
+    lea( mem( , rdx, 4 ), rdx )
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = cs_c * 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx  = rdx * 8 = cs_c * 8 * 8 => rdx = cs_c * 64
+    mov( var( cbuf ), rcx )              // load address of c
+    add( rdx, rcx )                    // c += rs_c * MR
+    mov( rcx, var( cbuf ) )              // store updated c
+
+    dec( r11 )
+    jne( .N_LOOP_ITER )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [n_iter] "m" (n_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm1", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t mr_cur = 4;
+        const dim_t j_edge = n0 - ( dim_t )n_left;
+
+        uint64_t ps_b   = bli_auxinfo_ps_b( data );
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict bj  = b + n_iter * ps_b;
+        float* restrict ai  = a;
+
+        if ( 48 <= n_left )
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_4x48m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_4x32m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_4x16m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_4x8
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_4x4
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_4x2
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 1 <= n_left )
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 4 * rs_a0 )
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 4;
+
+                // Since A is packed into row panels, we must use a loop over
+                // gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;
+                dim_t m_left =   m0            % mr;
+
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur*rs_c0;
+                    ai_ii  += ps_a0;
+                } 
+            }
+            n_left -= nr_cur;
+        }
+    }
+}
+
+void bli_sgemmsup_rv_zen_asm_3x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{   // 3-row float kernel: full 64-column panels run in the asm region, remaining columns go to narrower kernels / gemv.
+    uint64_t k_iter = k0 / 4;       // full passes of the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, done one at a time
+
+    uint64_t n_iter = n0 / 64;      // number of full 64-column panels
+    uint64_t n_left = n0 % 64;      // trailing columns, handled after the asm region
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;                // re-read at the start of every panel; never advanced
+    float *bbuf = b;                // advanced by 64 * cs_b after each panel (see .SDONE)
+    float *cbuf = c;                // advanced by 64 * cs_c after each panel (see .SDONE)
+
+    if ( n_iter == 0 ) goto consider_edge_cases;   // no full 64-column panel: skip the asm region
+
+    /* Asm region: one 3x64 tile of C (MR = 3, NR = 64) per .N_LOOP_ITER pass. */
+    // Accumulators passed to VFMA4: zmm8-11 (row 0), zmm12-15 (row 1), zmm16-19 (row 2).
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (NOTE(review): not referenced again in the code visible here)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (NOTE(review): not referenced again in the code visible here)
+
+    mov( var( n_iter ), r11 )       // load n_iter
+
+    label( .N_LOOP_ITER )
+
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float)
+
+    INIT_REG                        // presumably clears the accumulators (macro defined outside this view)
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+    // Each pass of the k-loop consumes 4 successive rows of B and 4 columns of A, one per ITER.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 3 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 1
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 3 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 3 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 3 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 3 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+    vbroadcastss( mem( rax, r8, 2 ), zmm6 )
+    VFMA4( 6, 16, 17, 18, 19 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+    ALPHA_SCALE4( 7, 16, 17, 18, 19 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+    UPDATE_C4( 4, 16, 17, 18, 19 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 2x16 tiles (rows 0 and 1) to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_2X16(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16( 10, 14 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16( 11, 15 )
+
+    /* Transposing 1x16 tiles (row 2) to 16x1 tiles */
+    mov( var( cbuf ), rcx )                // reload address of c
+    mov( var( rs_c ), rdi )                // load rs_c
+    lea( mem( , rdi, 4 ), rdi )            // rs_c *= sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )         // rcx = c + 2 * rs_c (third row of the tile)
+    mov( var( cs_c ), rdi )                // load cs_c
+    lea( mem( , rdi, 4 ), rdi )            // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )         // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16( 16 )
+    UPDATE_C_1X16( 17 )
+    UPDATE_C_1X16( 18 )
+    UPDATE_C_1X16( 19 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+    UPDATE_C4_BZ( 16, 17, 18, 19 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 2x16 tiles (rows 0 and 1) to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_2X16_BZ(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16_BZ(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16_BZ( 10, 14 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16_BZ( 11, 15 )
+
+    /* Transposing 1x16 tiles (row 2) to 16x1 tiles */
+    mov( var( cbuf ), rcx )               // reload address of c
+    mov( var( rs_c ), rdi )               // load rs_c
+    lea( mem( , rdi, 4 ), rdi )           // rs_c *= sizeof(float)
+    lea( mem( rcx, rdi, 2 ), rcx )        // rcx = c + 2 * rs_c (third row of the tile)
+    mov( var( cs_c ), rdi )               // load cs_c
+    lea( mem( , rdi, 4 ), rdi )           // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )        // r12 = 3 * cs_c (in bytes)
+
+    UPDATE_C_1X16_BZ( 16 )
+    UPDATE_C_1X16_BZ( 17 )
+    UPDATE_C_1X16_BZ( 18 )
+    UPDATE_C_1X16_BZ( 19 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    mov( var( cs_b ), rdx )         // load cs_b
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_b * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8  => rdx = cs_b * 64 * sizeof(float)
+    mov( var( bbuf ), rbx )         // load address of b
+    add( rdx, rbx )                 // b += cs_b * 64 (next 64-column panel)
+    mov( rbx, var( bbuf ) )         // store updated b
+
+    mov( var( cs_c ), rdx )         // load cs_c
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_c * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8  => rdx = cs_c * 64 * sizeof(float)
+    mov( var( cbuf ), rcx )              // load address of c
+    add( rdx, rcx )                    // c += cs_c * 64 (next 64-column panel)
+    mov( rcx, var( cbuf ) )              // store updated c
+
+    dec( r11 )
+    jne( .N_LOOP_ITER )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (NOTE(review): bbuf and cbuf are also written by the asm; only the "memory" clobber covers that)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [n_iter] "m" (n_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension (the n0 % 64 trailing columns), if they exist.
+    if ( n_left )
+    {
+        const dim_t mr_cur = 3;
+        const dim_t j_edge = n0 - ( dim_t )n_left;   // index of the first trailing column
+
+        uint64_t ps_b   = bli_auxinfo_ps_b( data );
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict bj  = b + n_iter * ps_b;     // skip the n_iter panels already processed, ps_b elements each
+        float* restrict ai  = a;
+
+        if ( 48 <= n_left )   // n_left < 64, so at most one of the 48/32/16 branches runs
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_3x48_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_3x32_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_3x16_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )    // fewer than 16 columns remain here; 8, 4, 2 and 1 are peeled in turn
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_3x8
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_3x4
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_3x2
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 1 <= n_left )    // a single remaining column is computed with gemv
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 3 * rs_a0 )   // ps_a0 equals mr * rs_a0: one gemv call covers all m0 rows
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 3;
+
+                // Since A is packed into row panels, we must use a loop over
+                // gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;   // number of row panels, including a partial last one
+                dim_t m_left =   m0            % mr;   // rows in the partial last panel (0 if none)
+
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur*rs_c0;   // next mr_cur rows of C
+                    ai_ii  += ps_a0;          // next row panel of A
+                } 
+            }
+            n_left -= nr_cur;
+        }
+    }
+}
+
+void bli_sgemmsup_rv_zen_asm_2x64n_avx512
+     (
+       conj_t              conja,
+       conj_t              conjb,
+       dim_t               m0,
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{   // 2-row float kernel: full 64-column panels run in the asm region, remaining columns go to narrower kernels / gemv.
+    uint64_t k_iter = k0 / 4;       // full passes of the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // leftover k iterations, done one at a time
+
+    uint64_t n_iter = n0 / 64;      // number of full 64-column panels
+    uint64_t n_left = n0 % 64;      // trailing columns, handled after the asm region
+
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;                // re-read at the start of every panel; never advanced
+    float *bbuf = b;                // advanced by 64 * cs_b after each panel (see .SDONE)
+    float *cbuf = c;                // advanced by 64 * cs_c after each panel (see .SDONE)
+
+    if ( n_iter == 0 ) goto consider_edge_cases;   // no full 64-column panel: skip the asm region
+
+    /* Asm region: one 2x64 tile of C (MR = 2, NR = 64) per .N_LOOP_ITER pass. */
+    // Accumulators passed to VFMA4: zmm8-11 (row 0), zmm12-15 (row 1).
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a (NOTE(review): not referenced again in the code visible here)
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a (NOTE(review): not referenced again in the code visible here)
+
+    mov( var( n_iter ), r11 )       // load n_iter
+
+    label( .N_LOOP_ITER )
+
+    mov( var( rs_c ), rdi )         // load rs_c
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float)
+
+    INIT_REG                        // presumably clears the accumulators (macro defined outside this view)
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+    // Each pass of the k-loop consumes 4 successive rows of B and 4 columns of A, one per ITER.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 2 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 1
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 2 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 2 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 2 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 64 contiguous floats (4 ZMM registers) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element from each of the 2 rows of A (stride rs_a) & do VFMA into that row's accumulators.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+    vbroadcastss( mem( rax, r8, 1 ), zmm5 )
+    VFMA4( 5, 12, 13, 14, 15 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha (zmm7).
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+    ALPHA_SCALE4( 7, 12, 13, 14, 15 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )      // xmm1 = 0.0
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+    UPDATE_C4( 4, 12, 13, 14, 15 )
+
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_2X16(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16( 10, 14 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16( 11, 15 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1.
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ(  8,  9, 10, 11 )
+    UPDATE_C4_BZ( 12, 13, 14, 15 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 2x16 tiles to 16x2 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c
+    lea( mem( , rdi, 4 ), rdi )         // cs_c *= sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (in bytes)
+
+    TRANSPOSE_2X16_BZ(  8, 12 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16_BZ(  9, 13 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16_BZ( 10, 14 )
+    lea( mem( rcx, rdi, 2 ), rcx )      // rcx += 2 * cs_c (in bytes)
+    TRANSPOSE_2X16_BZ( 11, 15 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    mov( var( cs_b ), rdx )         // load cs_b
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_b * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8  => rdx = cs_b * 64 * sizeof(float)
+    mov( var( bbuf ), rbx )         // load address of b
+    add( rdx, rbx )                 // b += cs_b * 64 (next 64-column panel)
+    mov( rbx, var( bbuf ) )         // store updated b
+
+    mov( var( cs_c ), rdx )         // load cs_c
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_c * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8  => rdx = cs_c * 64 * sizeof(float)
+    mov( var( cbuf ), rcx )              // load address of c
+    add( rdx, rcx )                    // c += cs_c * 64 (next 64-column panel)
+    mov( rcx, var( cbuf ) )              // store updated c
+
+    dec( r11 )
+    jne( .N_LOOP_ITER )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands (NOTE(review): bbuf and cbuf are also written by the asm; only the "memory" clobber covers that)
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [n_iter] "m" (n_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension (the n0 % 64 trailing columns), if they exist.
+    if ( n_left )
+    {
+        const dim_t mr_cur = 2;
+        const dim_t j_edge = n0 - ( dim_t )n_left;   // index of the first trailing column
+
+        uint64_t ps_b   = bli_auxinfo_ps_b( data );
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict bj  = b + n_iter * ps_b;     // skip the n_iter panels already processed, ps_b elements each
+        float* restrict ai  = a;
+
+        if ( 48 <= n_left )   // n_left < 64, so at most one of the 48/32/16 branches runs
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_2x48m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_2x32m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_2x16m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )    // fewer than 16 columns remain here; 8, 4, 2 and 1 are peeled in turn
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_2x8
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_2x4
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_2x2
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 1 <= n_left )    // a single remaining column is computed with gemv
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 2 * rs_a0 )   // ps_a0 equals mr * rs_a0: one gemv call covers all m0 rows
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 2;
+
+                // Since A is packed into row panels, we must use a loop over
+                // gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;   // number of row panels, including a partial last one
+                dim_t m_left =   m0            % mr;   // rows in the partial last panel (0 if none)
+
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur*rs_c0;   // next mr_cur rows of C
+                    ai_ii  += ps_a0;          // next row panel of A
+                } 
+            }
+            n_left -= nr_cur;
+        }
+    }
+}
+
+void bli_sgemmsup_rv_zen_asm_1x64n_avx512
+     (
+       conj_t              conja,          // not read here; forwarded to the edge-case kernels
+       conj_t              conjb,          // not read here; forwarded to the edge-case kernels and gemv
+       dim_t               m0,             // used only by the single-column gemv fallback below
+       dim_t               n0,
+       dim_t               k0,
+       float*     restrict alpha,
+       float*     restrict a,     inc_t rs_a0, inc_t cs_a0,
+       float*     restrict b,     inc_t rs_b0, inc_t cs_b0,
+       float*     restrict beta,
+       float*     restrict c,     inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    uint64_t k_iter = k0 / 4;       // trips of the 4x-unrolled k-loop
+    uint64_t k_left = k0 % 4;       // remaining k steps, done one at a time
+
+    uint64_t n_iter = n0 / 64;      // number of full 64-column tiles of C
+    uint64_t n_left = n0 % 64;      // leftover columns, handled after the asm block
+
+    uint64_t rs_a   = rs_a0;        // 64-bit copies of the strides for the asm memory operands
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    float *abuf = a;                // A pointer; never advanced by the asm loop
+    float *bbuf = b;                // B pointer; advanced by 64 columns per tile
+    float *cbuf = c;                // C pointer; advanced by 64 columns per tile
+
+    if ( n_iter == 0 ) goto consider_edge_cases;   // fewer than 64 columns: skip the asm loop
+
+    /* Main path: each n_iter step computes one 1x64 tile of C (macro arguments use zmm8-zmm11 for it). */
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov( var( rs_a ), r8 )          // load rs_a
+    lea( mem( , r8, 4 ), r8 )       // rs_a *= sizeof(dt) => rs_a *= 4
+    mov( var( rs_b ), r9 )          // load rs_b
+    lea( mem( , r9, 4 ), r9 )       // rs_b *= sizeof(dt) => rs_b *= 4
+    mov( var( cs_a ), r10 )         // load cs_a
+    lea( mem( , r10, 4 ), r10 )     // cs_a *= sizeof(dt) => cs_a *= 4
+    lea( mem( r8, r8, 2 ), r13 )    // r13 = 3 * rs_a
+    lea( mem( r8, r8, 4 ), r15 )    // r15 = 5 * rs_a
+
+    mov( var( n_iter ), r11 )       // load n_iter
+
+    label( .N_LOOP_ITER )
+
+    mov( var( rs_c ), rdi )         // load rs_c (reloaded per tile: the col-stored paths overwrite rdi)
+    lea( mem( , rdi, 4 ), rdi )     // rs_c *= sizeof(float)
+
+    INIT_REG                        // macro defined elsewhere; presumably zeroes the accumulators
+
+    mov( var( abuf ), rax )         // load address of a
+    mov( var( bbuf ), rbx )         // load address of b
+    mov( var( cbuf ), rcx )         // load address of c
+
+    mov( var( alpha ), rdx )        // load address of alpha
+    vbroadcastss( ( rdx ), zmm7 )   // zmm7 = alpha in every lane
+
+    mov( var( k_iter ), rsi )       // load k_iter
+    test( rsi, rsi )
+    je( .CONSID_K_LEFT )
+
+    // The k-loop is unrolled 4x: each step reads one row of B and one element of A.
+    label( .K_LOOP_ITER )
+    // ITER 0
+    // Load 64 contiguous floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element of A & do VFMA with the 4 vectors of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 1
+    // Load 64 contiguous floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element of A & do VFMA with the 4 vectors of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 2
+    // Load 64 contiguous floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element of A & do VFMA with the 4 vectors of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    // ITER 3
+    // Load 64 contiguous floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element of A & do VFMA with the 4 vectors of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+
+    dec( rsi )
+    jne( .K_LOOP_ITER )     // if rsi != 0, repeat k-loop
+
+
+    label( .CONSID_K_LEFT )
+
+    mov( var( k_left ), rsi )       // i = k_left;
+    test( rsi, rsi )                // check i via logical AND.
+    je( .SPOSTACCUM )               // if i == 0, we're done; jump to end.
+                                    // else, we prepare to enter k_left loop.
+
+
+    label( .K_LEFT_LOOP )
+
+    // Load 64 contiguous floats (4 zmm vectors) from the current row of B.
+    vmovups(     ( rbx ), zmm0 )
+    vmovups( 0x40( rbx ), zmm1 )
+    vmovups( 0x80( rbx ), zmm2 )
+    vmovups( 0xc0( rbx ), zmm3 )
+
+    // Broadcast one element of A & do VFMA with the 4 vectors of B.
+    vbroadcastss( ( rax ), zmm4 )
+    VFMA4( 4, 8, 9, 10, 11 )
+
+    add(  r9, rbx )                 // b += rs_b
+    add( r10, rax )                 // a += cs_a
+    dec( rsi )
+    jne( .K_LEFT_LOOP )     // if rsi != 0, repeat k_left loop
+
+
+    label( .SPOSTACCUM )
+
+    // Scaling A * B with alpha.
+    ALPHA_SCALE4( 7,  8,  9, 10, 11 )
+
+    mov( var( beta ), rdx )         // load address of beta
+    vbroadcastss( ( rdx ), zmm4 )   // zmm4 = beta in every lane
+
+    vxorps( xmm1, xmm1, xmm1 )
+    vucomiss( xmm1, xmm4 )          // check if beta = 0
+    je( .SBETAZERO )                // jump to beta = 0 case
+
+    cmp( imm(4), rdi )              // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORED )               // jump to column storage case
+
+
+    label( .SROWSTORED )
+
+    UPDATE_C4( 4,  8,  9, 10, 11 )
+ 
+    jmp( .SDONE )               // jump to the end
+
+
+    label( .SCOLSTORED )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c (rdi no longer holds rs_c)
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = rdi + 2 * rdi = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16(  8 )
+    UPDATE_C_1X16(  9 )
+    UPDATE_C_1X16( 10 )
+    UPDATE_C_1X16( 11 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SBETAZERO )
+
+    cmp( imm( 4 ), rdi )                // set ZF if (4*rs_c) == 4, i.e. rs_c == 1
+    jz( .SCOLSTORBZ )                   // jump to column storage case
+
+
+    label( .SROWSTORBZ )
+
+    UPDATE_C4_BZ( 8, 9, 10, 11 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SCOLSTORBZ )
+
+    /* Transposing 1x16 tiles to 16x1 tiles */
+    mov( var( cbuf ), rcx )             // load address of c
+    mov( var( cs_c ), rdi )             // load cs_c (rdi no longer holds rs_c)
+    lea( mem( , rdi, 4 ), rdi )         // rdi = cs_c * sizeof(float)
+    lea( mem( rdi, rdi, 2 ), r12 )      // r12 = 3 * cs_c (bytes)
+
+    UPDATE_C_1X16_BZ(  8 )
+    UPDATE_C_1X16_BZ(  9 )
+    UPDATE_C_1X16_BZ( 10 )
+    UPDATE_C_1X16_BZ( 11 )
+
+    jmp( .SDONE )                       // jump to the end
+
+
+    label( .SDONE )
+
+    mov( var( cs_b ), rdx )
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_b * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8  => rdx = 64 * cs_b in bytes
+    mov( var( bbuf ), rbx )
+    add( rdx, rbx )                 // b += 64 * cs_b
+    mov( rbx, var( bbuf ) )         // store updated b
+
+    mov( var( cs_c ), rdx )
+    lea( mem( , rdx, 4 ), rdx )     // rdx  = cs_c * sizeof(float)
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8
+    lea( mem( , rdx, 8 ), rdx )     // rdx *= 8  => rdx = 64 * cs_c in bytes
+    mov( var( cbuf ), rcx )              // load address of c
+    add( rdx, rcx )                    // c += 64 * cs_c
+    mov( rcx, var( cbuf ) )              // store updated c
+
+    dec( r11 )
+    jne( .N_LOOP_ITER )
+
+    end_asm(
+    : // output operands (none)
+    : // input operands
+      [k_iter] "m" (k_iter),
+      [k_left] "m" (k_left),
+      [a]      "m" (a),
+      [rs_a]   "m" (rs_a),
+      [cs_a]   "m" (cs_a),
+      [b]      "m" (b),
+      [rs_b]   "m" (rs_b),
+      [cs_b]   "m" (cs_b),
+      [alpha]  "m" (alpha),
+      [beta]   "m" (beta),
+      [c]      "m" (c),
+      [rs_c]   "m" (rs_c),
+      [cs_c]   "m" (cs_c),
+      [n0]     "m" (n0),
+      [m0]     "m" (m0),
+      [n_iter] "m" (n_iter),
+      [abuf]   "m" (abuf),
+      [bbuf]   "m" (bbuf),
+      [cbuf]   "m" (cbuf)
+    : // register clobber list
+      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm12",
+      "zmm0", "zmm1", "zmm2", "zmm3",
+      "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+      "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+      "zmm16", "zmm17", "zmm18", "zmm19",
+      "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+      "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+      "memory"
+    )
+
+    consider_edge_cases:
+
+    // Handle edge cases in the n dimension, if they exist.
+    if ( n_left )
+    {
+        const dim_t mr_cur = 1;
+        const dim_t j_edge = n0 - ( dim_t )n_left;   // first column not covered by the asm loop
+
+        uint64_t ps_b   = bli_auxinfo_ps_b( data );  // panel stride of B, in elements
+
+        float* restrict cij = c + j_edge*cs_c;
+        float* restrict bj  = b + n_iter * ps_b;     // NOTE(review): asm loop steps B by 64*cs_b; equal only if ps_b == 64*cs_b -- confirm
+        float* restrict ai  = a;
+
+        if ( 48 <= n_left )   // peel 48, 32, 16, 8, 4, 2 and 1 columns in turn; n_left < 64 here
+        {
+            const dim_t nr_cur = 48;
+            bli_sgemmsup_rv_zen_asm_1x48m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+            bli_sgemmsup_rv_zen_asm_1x32m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+            bli_sgemmsup_rv_zen_asm_1x16m_avx512
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+            bli_sgemmsup_rv_zen_asm_1x8
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0; 
+            n_left -= nr_cur;
+        }
+
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+            bli_sgemmsup_rv_zen_asm_1x4
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+            bli_sgemmsup_rv_zen_asm_1x2
+            (
+              conja,conjb,mr_cur,nr_cur,k0,
+              alpha,ai,rs_a0,cs_a0,
+              bj,rs_b0,cs_b0,beta,
+              cij,rs_c0,cs_c0,
+              data,cntx
+            );
+            cij += nr_cur*cs_c0;
+            bj  += nr_cur*cs_b0;
+            n_left -= nr_cur;
+        }
+
+        if ( 1 <= n_left )   // last single column: computed with gemv instead of an asm kernel
+        {
+            const dim_t nr_cur = 1;
+            dim_t ps_a0 = bli_auxinfo_ps_a( data );
+            if ( ps_a0 == 1 * rs_a0 )   // panel stride equals row stride: one gemv covers all m0 rows
+            {
+                bli_sgemv_ex
+                (
+                  BLIS_NO_TRANSPOSE, conjb, m0, k0,
+                  alpha, ai, rs_a0, cs_a0, bj, rs_b0,
+                  beta, cij, rs_c0, cntx, NULL
+                );
+            }
+            else
+            {
+                const dim_t mr = 2;   // NOTE(review): same value as the 2x64n sibling although mr_cur is 1 here -- confirm
+
+                // Since A is packed into row panels, we must use a loop over
+                // gemv.
+                dim_t m_iter = ( m0 + mr - 1 ) / mr;
+                dim_t m_left =   m0            % mr;
+
+                float* restrict ai_ii  = ai;
+                float* restrict cij_ii = cij;
+
+                for ( dim_t ii = 0; ii < m_iter; ii += 1 )
+                {
+                    dim_t mr_cur = ( bli_is_not_edge_f( ii, m_iter, m_left )
+                                     ? mr : m_left );
+
+                    bli_sgemv_ex 
+                    (
+                      BLIS_NO_TRANSPOSE, conjb, mr_cur, k0,
+                      alpha, ai_ii, rs_a0, cs_a0, bj, rs_b0,
+                      beta, cij_ii, rs_c0, cntx, NULL
+                    );
+                    cij_ii += mr_cur*rs_c0;   // next group of rows in C
+                    ai_ii  += ps_a0;          // next panel of A
+                } 
+            }
+            n_left -= nr_cur;
+        }
+    }
+}
diff --git a/kernels/zen4/3/sup/d24x8/CMakeLists.txt b/kernels/zen4/3/sup/d24x8/CMakeLists.txt
new file mode 100644
index 0000000000..004a07c085
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/CMakeLists.txt
@@ -0,0 +1,18 @@
+##Copyright (C) 2020-2023, Advanced Micro Devices, Inc. All rights reserved.##
+
+# Object library with the zen4 dgemm SUP edge kernels (one source file per Mx1..Mx8).
+add_library(zen4_3supd24x8 OBJECT
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx1.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx2.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx3.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx4.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx5.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx6.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx7.c"
+    "${CMAKE_CURRENT_SOURCE_DIR}/bli_dgemmsup_rv_zen4_asm_Mx8.c"
+)
+
+target_compile_options(zen4_3supd24x8 PRIVATE /arch:AVX2 /arch:AVX512)
+if(BUILD_SHARED_LIBS)
+    target_compile_definitions(zen4_3supd24x8 PUBLIC BLIS_IS_BUILDING_LIBRARY)
+endif()
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
new file mode 100644
index 0000000000..d8806362e8
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx1.c
@@ -0,0 +1,2239 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/**
+ * Shuffle 128-bit lanes (pairs of doubles) selected by imm8 from S1 and S2,
+ * and store the results in D1 and D2; S3/S4 -> D3/D4 are handled identically.
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Interleaves the low doubles (into D1) and the high doubles (into D2) of
+ * each 128-bit lane of S1 and S2; S3/S4 -> D3/D4 are handled identically.
+ * Example:
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * Masked (k3) store of 8 result vectors to C, no load of C (_BZ: presumably beta == 0); rcx += r14.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * Masked (k3) store of 7 result vectors to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Masked (k3) store of 6 result vectors to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Masked (k3) store of 5 result vectors to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Masked (k3) store of 4 result vectors to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/**
+ * Masked (k3) store of 3 result vectors to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/**
+ * Masked (k3) store of 2 result vectors to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Masked (k3) store of 1 result vector to C, no load of C (_BZ: presumably beta == 0).
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * Loads 8 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3; then advances rcx by r14.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(3)))\
+    add(r14, rcx)
+
+/**
+ * Loads 7 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Loads 6 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Loads 5 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Loads 4 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * Loads 3 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * Loads 2 vectors of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the matching result register (zmm31 presumably holds beta),
+ * and stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Loads 1 vector of C (lanes whose k3 mask bit is clear are zeroed), adds
+ * zmm31 * C to the result register zmm0 (zmm31 presumably holds beta),
+ * and stores the result back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/
+
+/* These kernels assume that the A matrix is stored in column-major order.
+ * The B matrix can be column- or row-major.
+ * The C matrix can be column- or row-major.
+ * Prefetching of C is done assuming that C is column-stored.
+ * Prefetching of B is done assuming that B is column-stored.
+ * Prefetching of B and C when they are row-stored is yet to be added.
+ * The A matrix is not prefetched in the edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x1
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 1+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(1), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x1 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x1 tile updated
+
+    vunpcklpd( zmm29,  zmm28,  zmm0)
+    vunpckhpd( zmm29,  zmm28,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 8x1 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x1 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x1 tile updated
+
+    vunpcklpd( zmm29,  zmm28,  zmm0)
+    vunpckhpd( zmm29,  zmm28,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x1
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 1+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(1), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x1 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //8x1 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x1 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0] "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x1
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * selects which elements are loaded into / stored from the register.
+     * m0 & 7 (i.e. m0 % 8) is the number of elements that are valid in the
+     * last, partially filled register.
+     * For example, if m0 = 19, m0 & 7 is 3, so 3 elements have to be
+     * loaded/stored: shifting 0xff (11111111) right by (8 - 3) bits gives
+     * the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // When m0 is a multiple of 8, (m0 & 7) is 0 and the shift above yields 0, but all
+    // 8 elements have to be loaded/stored, so the mask is reset to 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // When n0 is a multiple of 8, (n0 & 7) is 0 and the shift above yields 0, but all
+    // 8 elements have to be loaded/stored, so the mask is reset to 0xff (11111111).
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 1+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(1), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //8x1 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
new file mode 100644
index 0000000000..d8b5c73ad8
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx2.c
@@ -0,0 +1,2510 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3 // k_iter iterations kept out of .LOOP1 on top of the 2 run by .LOOP2; presumably consumed by a tail loop - TODO confirm
+
+/**
+ * Shuffles 128-bit lanes (2 doubles each) of S1 and S2 into D1 (imm8 0x88)
+ * and D2 (imm8 0xDD); S3 and S4 are shuffled into D3 and D4 the same way.
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Unpacks and interleaves the low halves (into D1) and the high halves
+ * (into D2) of each 128-bit lane of S1 and S2; S3 and S4 are unpacked
+ * into D3 and D4 the same way.
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * Store-only update of C: writes 8 result rows with k3-masked stores (C is not read), then rcx += r14.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * Store-only update of C: writes 7 result rows with k3-masked stores (C is not read).
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Store-only update of C: writes 6 result rows with k3-masked stores (C is not read).
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Store-only update of C: writes 5 result rows with k3-masked stores (C is not read).
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Store-only update of C: writes 4 result rows with k3-masked stores (C is not read).
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/**
+ * Store-only update of C: writes 3 result rows with k3-masked stores (C is not read).
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/**
+ * Store-only update of C: writes 2 result rows with k3-masked stores (C is not read).
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Store-only update of C: writes 1 result row (zmm0) with a k3-masked store (C is not read).
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * Loads 8 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back under k3 and advances rcx by r14.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(3)))\
+    add(r14, rcx)
+
+/**
+ * Loads 7 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Loads 6 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Loads 5 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Loads 4 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * Loads 3 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * Loads 2 rows of C under mask k3 (zeroing masked-off elements), multiplies each
+ * by zmm31 (expected to hold beta) and accumulates it into the matching result
+ * register, then stores the rows back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Loads 1 row of C under mask k3 (zeroing masked-off elements), multiplies it
+ * by zmm31 (expected to hold beta) and accumulates it into zmm0, then stores
+ * the row back to C under the same mask.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/
+
+/* These kernels assume that the A matrix is stored in col-major order.
+ * B matrix can be col/row-major
+ * C matrix can be col/row-major
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x2
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 2+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(2), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x2 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x2 tile updated
+
+    vunpcklpd( zmm29,  zmm28,  zmm0)
+    vunpckhpd( zmm29,  zmm28,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 8x2 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x2 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x2 tile updated
+
+    vunpcklpd( zmm29,  zmm28,  zmm0)
+    vunpckhpd( zmm29,  zmm28,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x2
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit
+     * mask is used to specify which elements are loaded/stored into/from
+     * the last (partial) register. m0 & 7 (i.e. m_left % 8) gives the number
+     * of elements to be loaded/stored into/from that last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3
+     * elements have to be loaded/stored, so 0xff (11111111) is shifted right
+     * by (8-3) bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // Special case: when m_left is a multiple of 8 (e.g. 24/16/8), m0 & 7 is 0
+    // and the shift by 8 yields a mask of 0, but all 8 elements have to be
+    // loaded/stored. So, the mask is set to 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // Special case: when n_left is a multiple of 8 (e.g. 8), n0 & 7 is 0 and
+    // the shift by 8 yields a mask of 0, but all 8 elements have to be
+    // loaded/stored. So, the mask is set to 0xff (11111111).
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-1 and zmm3-4 are used to load A matrix
+     *                 zmm6-9 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+
+    // K is unrolled by 8 to facilitate prefetch of B.
+    // Assuming B to be col-stored, each unrolled iteration prefetches one
+    // cacheline from each of the two columns of B_next (at r11 and r11 + cs_b),
+    // where b_next = b + (unroll)*rs_b.
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 2+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(2), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x2 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Second 8x2 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x2 tile updated
+
+    vunpcklpd( zmm9,  zmm7,  zmm0)
+    vunpckhpd( zmm9,  zmm7,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x2
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * is used to select which elements of a register are loaded/stored.
+     * m_left % 8 (computed as m0 & 7) is the number of elements that have to be
+     * loaded/stored into/from the last (partial) register.
+     * For example, if m_left = 19, m0 & 7 is 3, which means 3 elements have to be
+     * loaded/stored; shifting 0xff (11111111) to the right by (8-3) bits
+     * gives the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // Special case: when m_left is a multiple of 8 (24/16/8), m0 & 7 is 0 and the
+    // shift above yields 0, but all 8 elements have to be loaded/stored.
+    // So, the mask is reset to 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // Special case: when n_left = 8, n0 & 7 is 0 and the shift above yields 0,
+    // but all 8 elements have to be loaded/stored.
+    // So, the mask is reset to 0xff (11111111).
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 2+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(2), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //8x2 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    vunpcklpd( zmm8,  zmm6,  zmm0)
+    vunpckhpd( zmm8,  zmm6,  zmm1)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
new file mode 100644
index 0000000000..a739183e98
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx3.c
@@ -0,0 +1,2770 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+/* Loop-count adjustment shared by the kernels in this file: the 24x3 kernel
+ * subtracts (3 + TAIL_NITER) from k_iter before entering its first k-loop.
+ * NOTE(review): presumably the number of final unrolled k iterations that
+ * run without prefetching C -- confirm against the later loop sections. */
+#define TAIL_NITER 3
+
+/**
+ * SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4)
+ *
+ * Emits four VSHUFF64X2 instructions that recombine the 128-bit lanes
+ * (two doubles per lane) of two pairs of source registers:
+ *   D1 <- lanes 0 and 2 of each of S1/S2   (imm8 = 0x88)
+ *   D2 <- lanes 1 and 3 of each of S1/S2   (imm8 = 0xDD)
+ *   D3 <- lanes 0 and 2 of each of S3/S4   (imm8 = 0x88)
+ *   D4 <- lanes 1 and 3 of each of S3/S4   (imm8 = 0xDD)
+ * Every argument is a ZMM register number. Operands are passed through in
+ * the order written here (this file selects BLIS_ASM_SYNTAX_ATT, i.e.
+ * sources first, destination last).
+ *
+ * Example from the original author:
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+ * NOTE(review): which of S1/S2 fills the low half of D1/D2 depends on the
+ * AT&T operand order -- confirm the S1/S2 labels in the example above.
+ *
+ * The macro body deliberately has no trailing line continuation, so the
+ * line that follows the definition can never be spliced into it.
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4))
+
+/**
+ * UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4)
+ *
+ * Emits vunpcklpd / vunpckhpd for two pairs of ZMM registers:
+ *   D1 <- unpack-low  of (S1, S2)     D2 <- unpack-high of (S1, S2)
+ *   D3 <- unpack-low  of (S3, S4)     D4 <- unpack-high of (S3, S4)
+ * Within every 128-bit lane, unpack-low interleaves the low doubles of the
+ * two sources and unpack-high interleaves the high doubles.
+ * Every argument is a ZMM register number; operands are written in AT&T
+ * order (sources first, destination last).
+ *
+ * Example from the original author:
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+ * NOTE(review): which source supplies the first element of each pair
+ * depends on the AT&T operand order -- confirm the S1/S2 labels above.
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * UPDATE_MASKED_C_8_BZ: write eight result registers to C without reading C.
+ *
+ * Stores zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3, zmm8 (in that order) to
+ * rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13, rcx+2*r12, rcx+rdx.
+ * Every store is write-masked by k3, so only elements whose bit is set in
+ * k3 are written. Afterwards rcx is advanced by r14.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes, r12 = 3*rsi, r13 = 5*rsi, rdx = 7*rsi and r14 = 8*rsi --
+ * confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * UPDATE_MASKED_C_7_BZ: write seven result registers to C without reading C.
+ *
+ * Stores zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3 (in that order) to
+ * rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13, rcx+2*r12.
+ * Every store is write-masked by k3, so only elements whose bit is set in
+ * k3 are written. rcx is left unchanged.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes, r12 = 3*rsi and r13 = 5*rsi -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_6_BZ: write six result registers to C without reading C.
+ *
+ * Stores zmm0, zmm4, zmm2, zmm6, zmm1, zmm5 (in that order) to
+ * rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13.
+ * Every store is write-masked by k3, so only elements whose bit is set in
+ * k3 are written. rcx is left unchanged.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes, r12 = 3*rsi and r13 = 5*rsi -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_5_BZ: write five result registers to C without reading C.
+ *
+ * Stores zmm0, zmm4, zmm2, zmm6, zmm1 (in that order) to
+ * rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi.
+ * Every store is write-masked by k3, so only elements whose bit is set in
+ * k3 are written. rcx is left unchanged.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes and r12 = 3*rsi -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_4_BZ: write four result registers to C without reading C.
+ *
+ * Stores zmm0, zmm4, zmm2, zmm6 (in that order) to
+ * rcx, rcx+rsi, rcx+2*rsi, rcx+r12.
+ * Every store is write-masked by k3, so only elements whose bit is set in
+ * k3 are written. rcx is left unchanged.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes and r12 = 3*rsi -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/**
+ * UPDATE_MASKED_C_3_BZ: write three result registers to C without reading C.
+ *
+ * Stores zmm0, zmm4, zmm2 (in that order) to rcx, rcx+rsi, rcx+2*rsi.
+ * Every store is write-masked by k3, so only elements whose bit is set in
+ * k3 are written. rcx is left unchanged.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/**
+ * UPDATE_MASKED_C_2_BZ: write two result registers to C without reading C.
+ *
+ * Stores zmm0 to rcx and zmm4 to rcx+rsi. Both stores are write-masked by
+ * k3, so only elements whose bit is set in k3 are written. rcx is left
+ * unchanged.
+ * NOTE(review): presumably the beta == 0 path, with rsi = row stride of C
+ * in bytes -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_1_BZ: write one result register to C without reading C.
+ *
+ * Stores zmm0 to the address in rcx. The store is write-masked by k3, so
+ * only elements whose bit is set in k3 are written. rcx is left unchanged.
+ * NOTE(review): presumably the beta == 0 path -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_8: accumulate eight rows of C into the result registers
+ * and store them back.
+ *
+ * For each of the eight addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r12,
+ * rcx+4*rsi, rcx+r13, rcx+2*r12, rcx+rdx:
+ *   1. load the C row into a scratch register (zmm30/10/12/16/14/18) under
+ *      zero-masking with k3, so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0, zmm4, zmm2, zmm6, zmm1,
+ *      zmm5, zmm3, zmm8 respectively.
+ * After all rows are accumulated, the eight acc registers are stored back
+ * to the same addresses under write mask k3, and rcx is advanced by r14.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes, r12 = 3*rsi, r13 = 5*rsi,
+ * rdx = 7*rsi and r14 = 8*rsi -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3))) \
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3))) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * UPDATE_MASKED_C_7: accumulate seven rows of C into the result registers
+ * and store them back.
+ *
+ * For each of the seven addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r12,
+ * rcx+4*rsi, rcx+r13, rcx+2*r12:
+ *   1. load the C row into a scratch register under zero-masking with k3,
+ *      so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0, zmm4, zmm2, zmm6, zmm1,
+ *      zmm5, zmm3 respectively.
+ * The acc registers are then stored back to the same addresses under write
+ * mask k3. rcx is left unchanged.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes, r12 = 3*rsi and r13 = 5*rsi --
+ * confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3))) \
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3))) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_6: accumulate six rows of C into the result registers
+ * and store them back.
+ *
+ * For each of the six addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r12,
+ * rcx+4*rsi, rcx+r13:
+ *   1. load the C row into a scratch register under zero-masking with k3,
+ *      so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0, zmm4, zmm2, zmm6, zmm1,
+ *      zmm5 respectively.
+ * The acc registers are then stored back to the same addresses under write
+ * mask k3. rcx is left unchanged.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes, r12 = 3*rsi and r13 = 5*rsi --
+ * confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3))) \
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3))) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_5: accumulate five rows of C into the result registers
+ * and store them back.
+ *
+ * For each of the five addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r12,
+ * rcx+4*rsi:
+ *   1. load the C row into a scratch register under zero-masking with k3,
+ *      so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0, zmm4, zmm2, zmm6, zmm1
+ *      respectively.
+ * The acc registers are then stored back to the same addresses under write
+ * mask k3. rcx is left unchanged.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes and r12 = 3*rsi -- confirm at the
+ * call sites.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3))) \
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3))) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_4: accumulate four rows of C into the result registers
+ * and store them back.
+ *
+ * For each of the four addresses rcx, rcx+rsi, rcx+2*rsi, rcx+r12:
+ *   1. load the C row into a scratch register under zero-masking with k3,
+ *      so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0, zmm4, zmm2, zmm6
+ *      respectively.
+ * The acc registers are then stored back to the same addresses under write
+ * mask k3. rcx is left unchanged.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes and r12 = 3*rsi -- confirm at the
+ * call sites.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3))) \
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_3: accumulate three rows of C into the result registers
+ * and store them back.
+ *
+ * For each of the three addresses rcx, rcx+rsi, rcx+2*rsi:
+ *   1. load the C row into a scratch register under zero-masking with k3,
+ *      so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0, zmm4, zmm2 respectively.
+ * The acc registers are then stored back to the same addresses under write
+ * mask k3. rcx is left unchanged.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_2: accumulate two rows of C into the result registers
+ * and store them back.
+ *
+ * For the two addresses rcx and rcx+rsi:
+ *   1. load the C row into a scratch register under zero-masking with k3,
+ *      so elements whose mask bit is clear read as 0;
+ *   2. acc += C_row * zmm31, where acc is zmm0 and zmm4 respectively.
+ * The acc registers are then stored back to the same addresses under write
+ * mask k3. rcx is left unchanged.
+ *
+ * All memory operands go through mem() so that the stores use the same
+ * operand abstraction as the loads and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller,
+ * with rsi = row stride of C in bytes -- confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */ \
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * UPDATE_MASKED_C_1: accumulate one row of C into zmm0 and store it back.
+ *
+ * Loads the row at rcx into zmm30 under zero-masking with k3 (elements
+ * whose mask bit is clear read as 0), computes zmm0 += zmm30 * zmm31, and
+ * stores zmm0 back to rcx under write mask k3. rcx is left unchanged.
+ *
+ * The store goes through mem() so that it uses the same operand
+ * abstraction as the load and as the *_BZ variants.
+ * NOTE(review): zmm31 is expected to hold beta broadcast by the caller --
+ * confirm at the call sites.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))            /* store back to C */
+
+/* These kernels assume that the A matrix is stored in column-major order.
+ * The B matrix can be column- or row-major.
+ * The C matrix can be column- or row-major.
+ * Prefetch of C is done assuming that C is column-stored.
+ * Prefetch of B is done assuming that B is column-stored.
+ * Prefetch of the B and C matrices when row-stored is yet to be added.
+ * Prefetch of the A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x3
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, each pass of the unrolled loop prefetches
+    // one cacheline per column of B_next, where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 3+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(3), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm26,zmm26 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm26)
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x3 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x3 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 8x3 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x3 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x3 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0] "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x3
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements fit in a 512-bit register, so an 8-bit
+     * mask specifies which elements of the last register are loaded/stored.
+     * m0 % 8 (computed as m0 & 7) is the number of elements to be
+     * loaded/stored into/from the last register.
+     * For example, if m0 = 11, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so shift 0xff(11111111)
+     * right by (8-3) bits, which makes the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m0 is a multiple of 8 (16 or 8 for this kernel), the
+    // shift yields 0 but all 8 elements have to be loaded/stored, so mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage (k loops): zmm0-1 and zmm3-4 are used to load A matrix
+     *                           zmm6-11 are used for accumulation
+     *                           zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 3+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(3), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x3 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Second 8x3 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x3 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+/*
+ * bli_dgemmsup_rv_zen4_asm_8x3
+ *
+ * Double-precision "sup" GEMM micro-kernel for a tile of up to 8 rows and
+ * 3 columns:  C := beta * C + alpha * A * B.
+ *
+ *   - One (masked) 8-element column of A is loaded per k step; the three
+ *     B values of that k step are broadcast and accumulated into
+ *     zmm6 / zmm8 / zmm10 (one accumulator per column of C).
+ *   - k is unrolled by 8 (k_iter full groups, k_left remainder steps).
+ *   - The write-back is selected by cs_c: cs_c == 1 takes the row-stored
+ *     path (in-register transpose followed by the UPDATE_MASKED_C_* macros),
+ *     anything else takes the column-stored path with masked vector
+ *     loads/stores. beta == 0 uses variants that never read C.
+ *
+ * Parameters
+ *   conja, conjb   not referenced by this kernel.
+ *   m0, n0         tile dimensions. m0 selects the row mask (k2) and the
+ *                  row-stored update variant; n0 selects the column mask (k3).
+ *                  NOTE(review): the row-stored dispatch only distinguishes
+ *                  m0 = 0..8 and falls through to the 8-row update for any
+ *                  other value - callers presumably guarantee m0 <= 8.
+ *   k0             inner (k) dimension.
+ *   alpha, beta    pointers to the scalars.
+ *   a, rs_a0, cs_a0  A and its strides (only cs_a is used by the visible asm).
+ *   b, rs_b0, cs_b0  B and its strides.
+ *   c, rs_c0, cs_c0  C and its strides.
+ *   data           auxinfo; only the panel stride of A is read from it.
+ *   cntx           not referenced by this kernel.
+ */
+void bli_dgemmsup_rv_zen4_asm_8x3
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Every integer handed to the assembly block is read with a full 64-bit
+    // mov, so keep a 64-bit local copy of each one in case dim_t / inc_t are
+    // a different size than is expected by the load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+    uint64_t m_rows = (uint64_t)m0;
+    uint64_t n_cols = (uint64_t)n0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements fit into a 512-bit register, so an 8-bit
+     * mask selects which elements are loaded/stored. m0 & 7 is the number of
+     * valid elements in the (partial) register.
+     * For example, if m0 & 7 == 3, 0xff (11111111) is shifted right by
+     * (8 - 3) which gives the mask 00000111.
+     *
+     * The masks are stored in 64-bit variables because the assembly below
+     * fetches them with a 64-bit mov before handing them to kmovw; a
+     * narrower variable would make that load read past the object and leave
+     * unspecified bits in the opmask register.
+     */
+    uint64_t mask = 0xff >> (0x8 - (m0 & 7)); // row mask derived from m0
+    // When m0 is a multiple of 8 the shift yields 0: all 8 elements are
+    // valid, so the mask becomes 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    uint64_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // column mask derived from n0
+    // Same special case for n0 being a multiple of 8.
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+    // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load row mask
+    kmovw(edx, k(2))                // k2 = row mask (from m0)
+    mov(var(mask_n0), rdx)          // load column mask
+    kmovw(edx, k(3))                // k3 = column mask (from n0)
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(rcx, 7*8), rdx)         // rdx = c + 7 elements (C prefetch pointer)
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b + 7 elements (B prefetch pointer)
+
+    /* Register usage in this kernel:
+     *   zmm0, zmm3        columns of A (alternating, so the next column is
+     *                     loaded while the previous one is consumed)
+     *   zmm6, zmm8, zmm10 accumulators, one per column of C
+     *   zmm30, zmm31      broadcast elements of B (later alpha / beta)
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+
+    // K is unrolled by 8 to facilitate prefetch of B: each unrolled group
+    // prefetches from b_next = b + 8*rs_b for the three columns in use.
+    // The k_iter groups are split over three loops:
+    //   LOOP1: k_iter - (3 + TAIL_NITER) groups, prefetching B only
+    //   LOOP2: 3 groups, additionally prefetching one column of C each
+    //   LOOP3: TAIL_NITER groups, prefetching B only
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 3+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load next A column for iteration 2
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                        // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                        // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 8
+        // (no A load: the column fetched in iteration 7 is consumed here)
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(3), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load next A column for iteration 2
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                        // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                        // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 8
+        // (no A load: the column fetched in iteration 7 is consumed here)
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C prefetch pointer += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load next A column for iteration 2
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                        // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                        // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+
+        // ---------------------------------- iteration 8
+        // (no A load: the column fetched in iteration 7 is consumed here)
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                // load A with mask k2, zeroing
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        add( r8,rbx )                                      // b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                          // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                          // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                  // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    // Column-stored C: each accumulator is one column; update it in place
+    // under the row mask k2.
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))                     // load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                       // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))               // load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                 // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2))               // load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                            // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    // r14 = 8*rs_c
+    lea(mem(   , rsi, 8), r14)
+    // Rearrange the column accumulators for row-wise stores.
+    // NOTE(review): the macros are also handed zmm12, which this kernel never
+    // initialises - presumably the column mask keeps that lane out of C.
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    // zmm31 was handed to SHUFFLE_DATA above, so beta is broadcast again
+    // (rax still holds the address of beta).
+    vbroadcastsd(mem(rax), zmm31)
+
+    // Dispatch on the number of rows; anything other than 0..7 ends up in
+    // the 8-row update.
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //8x3 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    // beta == 0, column-stored C: store the scaled accumulators directly.
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                     // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))               // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))              // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    // r14 = 8*rs_c
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    // Same row-count dispatch as above, using the beta == 0 update macros.
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    //8x3 tile updated
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n_cols),
+        [m0]     "m" (m_rows),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
new file mode 100644
index 0000000000..e5d70ae5fd
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx4.c
@@ -0,0 +1,3038 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3 /* k_iter iterations kept out of the main k-loop, on top of the 4 run by the C-prefetch loop */
+
+/**
+ * Shuffles 128-bit lanes (2 doubles each) of S1/S2 into D1 (imm 0x88) and D2
+ * (imm 0xDD) with VSHUFF64X2; S3/S4 are shuffled the same way into D3 and D4.
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Interleaves the low doubles of each 128-bit lane of S1 and S2 into D1
+ * (vunpcklpd) and the high doubles into D2 (vunpckhpd); the pair S3/S4 is
+ * unpacked the same way into D3 and D4.
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * Stores the FMA results held in 8 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read. Then rcx += r14.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * Stores the FMA results held in 7 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Stores the FMA results held in 6 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Stores the FMA results held in 5 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Stores the FMA results held in 4 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/**
+ * Stores the FMA results held in 3 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/**
+ * Stores the FMA results held in 2 zmm registers to C (base rcx), writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Stores the FMA result held in zmm0 to C at rcx, writing only the elements whose bit in mask register k3 is set; C is not read.
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * Loads 8 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3 and does rcx += r14.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(3)))\
+    add(r14, rcx)
+
+/**
+ * Loads 7 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Loads 6 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Loads 5 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Loads 4 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * Loads 3 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * Loads 2 vectors of C (base rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies each by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result; then stores the sums back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Loads 1 vector of C (at rcx) under mask k3, zeroing the masked-out elements,
+ * multiplies it by zmm31 (presumably the broadcast beta - confirm at the call site) and adds it
+ * to the FMA result in zmm0; then stores the sum back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/
+
+/* These kernels assume that the A matrix is stored in column-major order.
+ * The B matrix can be column- or row-major.
+ * The C matrix can be column- or row-major.
+ * Prefetch of C is done assuming that C is column-stored.
+ * Prefetch of B is done assuming that B is column-stored.
+ * Prefetch of the B and C matrices when they are row-stored is yet to be added.
+ * Prefetch of the A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x4
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm27,zmm27, zmm27)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 4+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(4), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm26,zmm26 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm27,zmm27 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm26)
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm27)
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x4 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x4 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 8x4 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x4 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x4 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x4
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * is used to specify which elements are loaded/stored into/from a register.
+     * m0 % 8 (computed as m0 & 7) gives the number of elements that are to be
+     * loaded/stored into/from the last, partially filled register.
+     * For example, if m0 = 19, m0 & 7 is 3, which indicates that 3 elements
+     * have to be loaded/stored, so 0xff (11111111) is shifted right by (8 - 3)
+     * bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // When m0 is a multiple of 8 (m0 & 7 == 0), the shift above yields 0, but all
+    // 8 elements have to be loaded/stored, so the mask is set to 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // When n0 is a multiple of 8 (n0 & 7 == 0), the shift above yields 0, but all
+    // 8 elements have to be loaded/stored, so mask_n0 is set to 0xff (11111111).
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-1 and zmm3-4 are used to load A matrix
+     *                 zmm6-13 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+
+    // K is unrolled by 8 to facilitate prefetch of B.
+    // Assuming B to be col-stored, for each iteration of K,
+    // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 4+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(4), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x4 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Second 8x4 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x4 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x4
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * is used to select which elements are loaded into / stored from a register.
+     * m0 & 7 (i.e. m0 % 8) gives the number of elements that have to be
+     * loaded/stored into/from the last register.
+     * For example, if m0 = 19, m0 & 7 is 3, which means that 3 elements have to
+     * be loaded/stored, so 0xff (11111111) is shifted right by (8 - 3) bits,
+     * which gives the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // When m0 is a multiple of 8 (m0 & 7 == 0), the shift above yields 0, but all
+    // 8 elements have to be loaded/stored, so the mask is set to 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 4+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(4), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm12,zmm12 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //8x4 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c
new file mode 100644
index 0000000000..a41cbc4905
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx5.c
@@ -0,0 +1,3439 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/**
+ * Shuffles 128-bit lanes (pairs of doubles) of S1 and S2 into D1 (imm8 0x88)
+ * and D2 (imm8 0xDD); the same two shuffles are applied to S3/S4 -> D3/D4.
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Unpacks and interleaves the low halves (into D1) and the high halves
+ * (into D2) of each 128-bit lane of S1 and S2; S3 and S4 are handled the
+ * same way, producing D3 and D4.
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/** Store-only update: writes zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3 and zmm8 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path), then rcx += r14.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/** Store-only update: writes zmm0, zmm4, zmm2, zmm6, zmm1, zmm5 and zmm3 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/** Store-only update: writes zmm0, zmm4, zmm2, zmm6, zmm1 and zmm5 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/** Store-only update: writes zmm0, zmm4, zmm2, zmm6 and zmm1 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/** Store-only update: writes zmm0, zmm4, zmm2 and zmm6 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/** Store-only update: writes zmm0, zmm4 and zmm2 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/** Store-only update: writes zmm0 and zmm4 to C
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/** Store-only update: writes zmm0 to C at rcx
+ * under write-mask k3 without reading C (presumably the beta == 0 path); rcx is unchanged.
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * Updates 8 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. Ends with rcx += r14.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(3)))\
+    add(r14, rcx)
+
+/**
+ * Updates 7 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Updates 6 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Updates 5 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Updates 4 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * Updates 3 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * Updates 2 rows of C: each row is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to the matching result register, which is stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated rows back to C */\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Updates 1 row of C: the row at rcx is loaded under mask k3 (masked-off elements read as
+ * zero), multiplied by zmm31 (expected to hold beta; it is set outside this macro) and added
+ * to zmm0, which is then stored back under k3. rcx is left unchanged.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /* Store the updated row back to C */
+
+/* These kernels assume that the A matrix is stored in col-major order.
+ * The B matrix can be col/row-major.
+ * The C matrix can be col/row-major.
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x5
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * is used to select which elements are loaded/stored into/from the
+     * register. m_left % 8 (computed as m0 & 7) is the number of elements
+     * to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 is 3, which means 3 elements have
+     * to be loaded/stored, so 0xff (11111111) is shifted right by (8 - 3)
+     * bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm27,zmm27, zmm27)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm24, zmm24, zmm24)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 5+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(5), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm26,zmm26 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm27,zmm27 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm24,zmm24 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm26)
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm27)
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( 0x80(rdx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm24)
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x5 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x5 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm25, zmm24, zmm0)
+    vunpckhpd(zmm25, zmm24, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    // m0 - 16 == 0: the third 8x5 tile has no rows left, so nothing is stored
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x5 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x5 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm25, zmm24, zmm0)
+    vunpckhpd(zmm25, zmm24, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x5
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit
+     * mask is used to specify which elements are loaded/stored into/from
+     * the register. m0 % 8 (computed as m0 & 7) gives the number of elements
+     * to be loaded/stored into/from the last register.
+     * For example, if m0 = 19, m0 & 7 is 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is
+     * shifted right by (8 - 3) bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // Special case: when m0 is a multiple of 8 (24/16/8), m0 & 7 is 0 and the shift
+    // above yields 0; all 8 elements have to be loaded/stored, so mask is set to 0xff (11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // Special case: when n0 is a multiple of 8 (e.g. 8), n0 & 7 is 0 and the shift
+    // above yields 0; all 8 elements have to be loaded/stored, so mask_n0 is set to 0xff (11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+    // is also used to traverse the B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+    // is also used to prefetch from the B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage in the unrolled k loops: zmm0, zmm1, zmm3, zmm4 are used to load A matrix
+     *                 zmm6-15 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+
+    // K is unrolled by 8 to facilitate prefetch of B.
+    // Assuming B to be col-stored, in each of the first 5 unrolled iterations of K,
+    // one cacheline of b_next is prefetched (one per column), where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 5+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(5), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x5 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Second 8x5 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x5 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x5
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * is used to specify which elements are loaded into / stored from the
+     * register. m0 & 7 (i.e. m0 % 8) gives the number of elements to be
+     * loaded/stored into/from the last register.
+     * For example, if m0 = 19, m0 & 7 becomes 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is
+     * shifted right by (8 - 3) bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask_n0 becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm14, zmm14, zmm14)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 5+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(5), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm14,zmm14 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //8x5 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
new file mode 100644
index 0000000000..fe638c320f
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx6.c
@@ -0,0 +1,3707 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the
+      above copyright notice, this list of conditions and
+      the following disclaimer in the documentation
+      and/or other materials provided with the
+      distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3
+
+/**
+ * Shuffles 128-bit lanes (2 doubles each) of S1 and S2 into D1 (imm8 0x88)
+ * and D2 (imm8 0xDD); the pair S3/S4 is shuffled into D3/D4 the same way.
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4)) \
+
+/**
+ * Unpacks and interleaves the low halves (into D1) and the high halves
+ * (into D2) of each 128-bit lane of S1 and S2; S3 and S4 are handled
+ * the same way, producing D3 and D4.
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 8 result registers, then rcx += r14.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 7 result registers to C.
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 6 result registers to C.
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 5 result registers to C.
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 4 result registers to C.
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 3 result registers to C.
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/**
+ * Store-only variant (C is not read): masked (k3) stores of 2 result registers to C.
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Store-only variant (C is not read): masked (k3) store of zmm0 to C at rcx.
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * Loads 8 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into the result
+ * registers via FMA, then stores them to C under k3 and adds r14 to rcx.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(3)))\
+    add(r14, rcx)
+
+/**
+ * Loads 7 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into the result
+ * registers via FMA, then stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Loads 6 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into the result
+ * registers via FMA, then stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Loads 5 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into the result
+ * registers via FMA, then stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Loads 4 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into the result
+ * registers via FMA, then stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * Loads 3 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into the result
+ * registers via FMA, then stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * Loads 2 C vectors under mask k3 (masked-off elements are zeroed), scales
+ * them by zmm31 (expected to hold beta) and accumulates into zmm0 and zmm4
+ * via FMA, then stores the results back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Loads one C vector under mask k3 (masked-off elements are zeroed), scales
+ * it by zmm31 (expected to hold beta) and accumulates into zmm0 via FMA,
+ * then stores zmm0 back to C under mask k3.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/
+
+/* These kernels assume that the A matrix is stored in col-major order.
+ * The B matrix can be col- or row-major.
+ * The C matrix can be col- or row-major.
+ * Prefetch of C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that B is col-stored.
+ * Prefetch of the B and C matrices when row-stored is yet to be added.
+ * Prefetch of the A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x6
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register,
+     * so we use an 8-bit mask to specify which elements are to be loaded/stored
+     * into/from the register. m_left % 8 (computed as m0 & 7) gives the number
+     * of elements to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is shifted
+     * right by (8-3) bits, which makes the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm27,zmm27, zmm27)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm24, zmm24, zmm24)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+    vxorpd(zmm25, zmm25, zmm25)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 6+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(6), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm26,zmm26 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm27,zmm27 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm24,zmm24 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm17,zmm17 )
+    vmulpd( zmm30,zmm25,zmm25 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm26)
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm27)
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( 0x80(rdx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm24)
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( 0x40(rdx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm17)
+    vmovupd( zmm17,0x40(rdx,rdi,1))
+    vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm25)
+    vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x6 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x6 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm25, zmm24, zmm0)
+    vunpckhpd(zmm25, zmm24, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 7x6 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( zmm17,0x40(rdx,rdi,1))
+    vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x6 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x6 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm25, zmm24, zmm0)
+    vunpckhpd(zmm25, zmm24, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x6
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register.
+     * So, we use an 8-bit mask to specify which elements are to be loaded/stored
+     * into/from the register. m_left % 8 (computed as m0 & 7) specifies how many
+     * elements are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is shifted
+     * right by (8-3) bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer (r12), which points to rbx + 4*cs_b,
+    // is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer (r15), which points to r11 + 4*cs_b,
+    // is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-1 and zmm3-4 are used to load A matrix
+     *                 zmm6-17 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+
+    // K is unrolled by 8 to facilitate prefetching of B.
+    // Assuming B is column-stored, each iteration of K prefetches
+    // one cacheline of b_next, where b_next = b + (unroll)*rs_b.
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 6+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(6), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm17,zmm17 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm17)
+    vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x6 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Second 7x6 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x6 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    vunpcklpd(zmm17, zmm15, zmm0)
+    vunpckhpd(zmm17, zmm15, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x6
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register,
+     * so we use an 8-bit mask to specify which elements are to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many elements
+     * are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is shifted
+     * right by (8-3) bits, which makes the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm16, zmm16, zmm16)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 6+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(6), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm16,zmm16 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //7x6 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    vunpcklpd(zmm16, zmm14, zmm0)
+    vunpckhpd(zmm16, zmm14, zmm1)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
new file mode 100644
index 0000000000..610871ab2e
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx7.c
@@ -0,0 +1,3968 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+// BLIS_ASM_SYNTAX_ATT is defined before the assembly macro header is
+// included; presumably it selects AT&T operand order for the macros that
+// header provides -- confirm in bli_x86_asm_macros.h.
+#define BLIS_ASM_SYNTAX_ATT
+#include "bli_x86_asm_macros.h"
+// NOTE(review): TAIL_NITER is not referenced by the macros defined below;
+// presumably it is the number of unrolled k-loop iterations that the kernels
+// in this file peel off into a separate tail loop -- confirm in the kernels.
+#define TAIL_NITER 3
+
+/**
+ * Emits four VSHUFF64X2 instructions. Each one shuffles whole 128-bit
+ * lanes (one lane = 2 double-precision elements) selected by imm8 from
+ * a pair of source registers and stores the result in a destination:
+ *   sources (S1, S2): imm8 = 0x88 -> D1, imm8 = 0xDD -> D2
+ *   sources (S3, S4): imm8 = 0x88 -> D3, imm8 = 0xDD -> D4
+ * imm8 = 0x88 picks lanes 0 and 2 of each source; imm8 = 0xDD picks
+ * lanes 1 and 3 of each source.
+ * Example for one source pair:
+ * S1 : 1  9 3 11 5 13 7 15
+ * S2 : 2 10 4 12 6 14 8 16
+ * D1 : 1  9  5  13  2  10  6  14
+ * D2 : 3 11  7  15  4  12  8  16
+ * NOTE(review): which macro argument supplies the lower half of the
+ * destination depends on the operand order VSHUFF64X2 uses in
+ * bli_x86_asm_macros.h; the S1/S2 labels in the example may be swapped
+ * relative to the macro parameters -- confirm before reusing.
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4))
+
+/**
+ * Emits vunpcklpd/vunpckhpd for two source pairs. For each pair the
+ * low (vunpcklpd) and the high (vunpckhpd) double-precision element of
+ * every 128-bit lane of the two sources are interleaved:
+ *   sources (S1, S2): low halves -> D1, high halves -> D2
+ *   sources (S3, S4): low halves -> D3, high halves -> D4
+ * Example for one source pair:
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9 3 11 5 13 7 15
+ * D2 : 2 10 4 12 6 14 8 16
+ * NOTE(review): which macro argument supplies the first element of each
+ * interleaved pair depends on the operand order used by the assembly
+ * macros; the S1/S2 labels in the example may be swapped relative to the
+ * macro parameters -- confirm before reusing.
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+\
+    vunpcklpd( zmm(S1),  zmm(S2),  zmm(D1)) \
+    vunpckhpd( zmm(S1),  zmm(S2),  zmm(D2)) \
+    vunpcklpd( zmm(S3),  zmm(S4),  zmm(D3)) \
+    vunpckhpd( zmm(S3),  zmm(S4),  zmm(D4))
+
+/**
+ * Stores 8 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written.
+ * Registers stored, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3, zmm8
+ * Addresses written:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13, rcx+2*r12, rcx+rdx
+ * After the stores rcx is advanced by r14.
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes, r12 = 3*rsi, r13 = 5*rsi, rdx = 7*rsi and r14 = 8*rsi, so the
+ * addresses are 8 consecutive rows -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3))) \
+\
+    vmovupd( zmm8, mem(rcx, rdx, 1) MASK_(k(3))) \
+    add(r14, rcx)
+
+/**
+ * Stores 7 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written. rcx is not modified.
+ * Registers stored, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3
+ * Addresses written:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13, rcx+2*r12
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes, r12 = 3*rsi and r13 = 5*rsi, so the addresses are 7
+ * consecutive rows -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm3, mem(rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Stores 6 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written. rcx is not modified.
+ * Registers stored, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1, zmm5
+ * Addresses written:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes, r12 = 3*rsi and r13 = 5*rsi, so the addresses are 6
+ * consecutive rows -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3))) \
+\
+    vmovupd( zmm5, mem(rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Stores 5 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written. rcx is not modified.
+ * Registers stored, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1
+ * Addresses written:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes and r12 = 3*rsi, so the addresses are 5 consecutive rows --
+ * confirm at the call site.
+*/
+#define UPDATE_MASKED_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) ) \
+\
+    vmovupd( zmm1, mem(rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Stores 4 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written. rcx is not modified.
+ * Registers stored, in address order:
+ *   zmm0, zmm4, zmm2, zmm6
+ * Addresses written:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes and r12 = 3*rsi, so the addresses are 4 consecutive rows --
+ * confirm at the call site.
+*/
+#define UPDATE_MASKED_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) ) \
+\
+    vmovupd( zmm6, mem(rcx, r12, 1) MASK_(k(3)) )
+
+/**
+ * Stores 3 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written. rcx is not modified.
+ * Registers stored, in address order: zmm0, zmm4, zmm2
+ * Addresses written: rcx, rcx+rsi, rcx+2*rsi
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3))) \
+\
+    vmovupd( zmm2, mem(rcx, rsi, 2) MASK_(k(3)) )
+
+/**
+ * Stores 2 result vectors to C without reading C first (no scaling of
+ * the existing C values). Every store is masked with k3, so only the
+ * elements whose mask bit is set are written. rcx is not modified.
+ * Registers stored, in address order: zmm0, zmm4
+ * Addresses written: rcx, rcx+rsi
+ * NOTE(review): presumably the caller has set rsi to the row stride of C
+ * in bytes -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) MASK_(k(3))) \
+\
+    vmovupd( zmm4, mem(rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Stores 1 result vector (zmm0) to the address in rcx without reading C
+ * first (no scaling of the existing C values). The store is masked with
+ * k3, so only the elements whose mask bit is set are written.
+ * rcx is not modified.
+*/
+#define UPDATE_MASKED_C_1_BZ \
+\
+    vmovupd( zmm0, mem(rcx) MASK_(k(3)))
+
+/**
+ * Updates 8 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once all 8 loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked stores
+ * and rcx is advanced by r14.
+ * Result registers, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3, zmm8
+ * Addresses used:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13, rcx+2*r12, rcx+rdx
+ * Scratch registers overwritten by the loads:
+ *   zmm30, zmm10, zmm12, zmm16, zmm14, zmm18
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes, r12 = 3*rsi, r13 = 5*rsi,
+ * rdx = 7*rsi and r14 = 8*rsi -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_8 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( mem(rcx, rdx, 1, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm8 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))\
+    vmovupd( zmm8, (rcx, rdx, 1) MASK_(k(3)))\
+    add(r14, rcx)
+
+/**
+ * Updates 7 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once all 7 loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked
+ * stores. rcx is not modified.
+ * Result registers, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1, zmm5, zmm3
+ * Addresses used:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13, rcx+2*r12
+ * Scratch registers overwritten by the loads:
+ *   zmm30, zmm10, zmm12, zmm16, zmm14, zmm18
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes, r12 = 3*rsi and
+ * r13 = 5*rsi -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_7 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( mem(rcx, r12, 2, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm3 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))\
+    vmovupd( zmm3, (rcx, r12, 2) MASK_(k(3)))
+
+/**
+ * Updates 6 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once all 6 loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked
+ * stores. rcx is not modified.
+ * Result registers, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1, zmm5
+ * Addresses used:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi, rcx+r13
+ * Scratch registers overwritten by the loads:
+ *   zmm30, zmm10, zmm12, zmm16, zmm14, zmm18
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes, r12 = 3*rsi and
+ * r13 = 5*rsi -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_6 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( mem(rcx, r13, 1, 0), zmm18 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm18,zmm5 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))\
+    vmovupd( zmm5, (rcx, r13, 1) MASK_(k(3)))
+
+/**
+ * Updates 5 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once all 5 loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked
+ * stores. rcx is not modified.
+ * Result registers, in address order:
+ *   zmm0, zmm4, zmm2, zmm6, zmm1
+ * Addresses used:
+ *   rcx, rcx+rsi, rcx+2*rsi, rcx+r12, rcx+4*rsi
+ * Scratch registers overwritten by the loads:
+ *   zmm30, zmm10, zmm12, zmm16, zmm14
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes and r12 = 3*rsi -- confirm
+ * at the call site.
+*/
+#define UPDATE_MASKED_C_5 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( mem(rcx, rsi, 4, 0), zmm14 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm14,zmm1 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))\
+    vmovupd( zmm1, (rcx, rsi, 4) MASK_(k(3)))
+
+/**
+ * Updates 4 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once all 4 loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked
+ * stores. rcx is not modified.
+ * Result registers, in address order: zmm0, zmm4, zmm2, zmm6
+ * Addresses used: rcx, rcx+rsi, rcx+2*rsi, rcx+r12
+ * Scratch registers overwritten by the loads: zmm30, zmm10, zmm12, zmm16
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes and r12 = 3*rsi -- confirm
+ * at the call site.
+*/
+#define UPDATE_MASKED_C_4 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( mem(rcx, r12, 1, 0), zmm16 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm16,zmm6 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))\
+    vmovupd( zmm6, (rcx, r12, 1) MASK_(k(3)))
+
+/**
+ * Updates 3 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once all 3 loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked
+ * stores. rcx is not modified.
+ * Result registers, in address order: zmm0, zmm4, zmm2
+ * Addresses used: rcx, rcx+rsi, rcx+2*rsi
+ * Scratch registers overwritten by the loads: zmm30, zmm10, zmm12
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_3 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( mem(rcx, rsi, 2, 0), zmm12 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm12,zmm2 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))\
+    vmovupd( zmm2, (rcx, rsi, 2) MASK_(k(3)))
+
+/**
+ * Updates 2 vectors of C. For each vector, the elements whose bit is set
+ * in mask register k3 are loaded from C (the remaining elements are
+ * zeroed), multiplied by zmm31 and added to the matching result register
+ * with vfmadd231pd. Once both loads/FMAs are issued, the result
+ * registers are written back to the same addresses with k3-masked
+ * stores. rcx is not modified.
+ * Result registers, in address order: zmm0, zmm4
+ * Addresses used: rcx, rcx+rsi
+ * Scratch registers overwritten by the loads: zmm30, zmm10
+ * NOTE(review): presumably zmm31 holds the broadcast beta and the caller
+ * has set rsi to the row stride of C in bytes -- confirm at the call site.
+*/
+#define UPDATE_MASKED_C_2 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( mem(rcx, rsi, 1, 0), zmm10 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm10,zmm4 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/\
+    vmovupd( zmm4, (rcx, rsi, 1) MASK_(k(3)))
+
+/**
+ * Updates 1 vector of C at the address in rcx. The elements whose bit is
+ * set in mask register k3 are loaded from C into zmm30 (the remaining
+ * elements are zeroed), multiplied by zmm31 and added to zmm0 with
+ * vfmadd231pd; zmm0 is then written back to the same address with a
+ * k3-masked store. rcx is not modified; zmm30 is overwritten.
+ * NOTE(review): presumably zmm31 holds the broadcast beta -- confirm at
+ * the call site.
+*/
+#define UPDATE_MASKED_C_1 \
+\
+    vmovupd( mem(rcx), zmm30 MASK_KZ(3) ) \
+    vfmadd231pd( zmm31,zmm30,zmm0 ) \
+\
+    vmovupd( zmm0, (rcx) MASK_(k(3)))            /*Stores back to C*/
+
+/* These kernels assume that the A matrix is stored in col-major order.
+ * B matrix can be col/row-major
+ * C matrix can be col/row-major
+ * Prefetch for C is done assuming that C is col-stored.
+ * Prefetch of B is done assuming that the matrix is col-stored.
+ * Prefetch for B and C matrices when row-stored is yet to be added.
+ * Prefetch of A matrix is not done in edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x7
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)           // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm27,zmm27, zmm27)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm24, zmm24, zmm24)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+    vxorpd(zmm25, zmm25, zmm25)
+    vxorpd(zmm18, zmm18, zmm18)
+    vxorpd(zmm19, zmm19, zmm19)
+    vxorpd(zmm22, zmm22, zmm22)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 7+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(7), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm26,zmm26 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm27,zmm27 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm24,zmm24 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm17,zmm17 )
+    vmulpd( zmm30,zmm25,zmm25 )
+    vmulpd( zmm30,zmm18,zmm18 )
+    vmulpd( zmm30,zmm19,zmm19 )
+    vmulpd( zmm30,zmm22,zmm22 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm26)
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm27)
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( 0x80(rdx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm24)
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( 0x40(rdx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm17)
+    vmovupd( zmm17,0x40(rdx,rdi,1))
+    vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm25)
+    vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( 0x40(rdx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm19)
+    vmovupd( zmm19,0x40(rdx,rdi,2))
+    vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm22)
+    vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x7 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8
+    //Second 8x7 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    // m0 - 16 == 0: nothing remains for the third tile, so no store is issued
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( zmm17,0x40(rdx,rdi,1))
+    vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( zmm19,0x40(rdx,rdi,2))
+    vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    lea(mem(rsi,  rsi,  2), r12)
+    lea(mem(r12, rsi,  2), r13)
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x7 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //Second 8x7 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x7
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit mask
+     * selects which elements of a register are loaded/stored. m0 & 7 (i.e.
+     * m0 % 8) is the number of elements handled by the last, partially filled
+     * register.
+     * For example, if m0 = 19, m0 & 7 is 3, meaning 3 elements must be
+     * loaded/stored; shifting 0xff (11111111) right by (8 - 3) bits yields the
+     * mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // When m0 is a multiple of 8, (m0 & 7) is 0 and the shift above yields 0;
+    // all 8 elements must then be loaded/stored, so the mask is set to 0xff (11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // When n0 is a multiple of 8, (n0 & 7) is 0 and the shift above yields 0;
+    // all 8 elements must then be loaded/stored, so mask_n0 is set to 0xff (11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-19 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+    vxorpd(zmm18, zmm18, zmm18)
+    vxorpd(zmm19, zmm19, zmm19)
+
+    // K is unrolled by 8 to facilitate prefetching of B.
+    // Assuming B is column-stored, for each iteration of K
+    // one cacheline of B_next is prefetched, where b_next = b + (unroll)*rs_b.
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 7+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(7), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm17,zmm17 )
+    vmulpd( zmm30,zmm18,zmm18 )
+    vmulpd( zmm30,zmm19,zmm19 )
+    vmulpd( zmm30,zmm22,zmm22 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm17)
+    vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm19)
+    vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_MASKED_C_8
+    //First 8x7 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 7x8 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_MASKED_C_8_BZ
+    //First 8x7 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x7
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register.
+     * So, we use an 8-bit mask to specify which elements are to be loaded/stored
+     * into/from the register. m0 % 8 (computed as m0 & 7) specifies how many
+     * elements are to be loaded/stored into/from the last register.
+     * For example, if m0 = 19, m0 & 7 becomes 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is
+     * shifted right by (8-3) bits, which makes the mask (00000111).
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+    uint8_t mask_n0 = 0xff >> (0x8 - (n0 & 7)); // calculate mask based on n_left
+    // For special cases where n_left = 8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask_n0 == 0) mask_n0 = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    mov(var(mask_n0), rdx)          // load mask
+    kmovw(edx, k(3))                // move mask to k3 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm18, zmm18, zmm18)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 7+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 8
+
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(7), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+
+        // ---------------------------------- iteration 8
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm18,zmm18 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_MASKED_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_MASKED_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_MASKED_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_MASKED_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_MASKED_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_MASKED_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_MASKED_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_MASKED_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 7x7 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_MASKED_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_MASKED_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_MASKED_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_MASKED_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_MASKED_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_MASKED_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_MASKED_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_MASKED_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask),
+        [mask_n0]   "m" (mask_n0)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "k3", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
new file mode 100644
index 0000000000..8cf46b43c5
--- /dev/null
+++ b/kernels/zen4/3/sup/d24x8/bli_dgemmsup_rv_zen4_asm_Mx8.c
@@ -0,0 +1,4182 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include "blis.h"
+#define BLIS_ASM_SYNTAX_ATT /* defined ahead of the asm macro header; presumably selects AT&T operand order - confirm */
+#include "bli_x86_asm_macros.h"
+#define TAIL_NITER 3 /* loop trips withheld, on top of 8, from .LOOP1 of the 24x8 kernel: sub(imm(8+TAIL_NITER), rsi) */
+
+/**
+ * VSHUFF64X2 lane shuffle for (S1, S2) -> (D1, D2) and (S3, S4) -> (D3, D4):
+ * imm8 0x88 picks 128-bit lanes 0 and 2 of each source, 0xDD lanes 1 and 3.
+ * Example (from the original note; operand order not re-verified):
+ * S1 : 1  9  3 11  5 13  7 15
+ * S2 : 2 10  4 12  6 14  8 16
+ * D1 : 1  9  5 13  2 10  6 14
+ * D2 : 3 11  7 15  4 12  8 16
+*/
+#define SHUFFLE_DATA(S1, S2, D1, D2, S3, S4, D3, D4) \
+    VSHUFF64X2(IMM(0x88), ZMM(S1), ZMM(S2), ZMM(D1)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S1), ZMM(S2), ZMM(D2)) \
+    VSHUFF64X2(IMM(0x88), ZMM(S3), ZMM(S4), ZMM(D3)) \
+    VSHUFF64X2(IMM(0xDD), ZMM(S3), ZMM(S4), ZMM(D4))
+
+/**
+ * Interleaves doubles within each 128-bit lane of two registers:
+ * vunpcklpd pairs up the low double of every lane, vunpckhpd the high one.
+ * Applied to (S1, S2) -> (D1, D2) and again to (S3, S4) -> (D3, D4).
+ * Example (from the original note; operand order not re-verified):
+ * S1 : 1  2  3  4  5  6  7  8
+ * S2 : 9 10 11 12 13 14 15 16
+ * D1 : 1  9  3 11  5 13  7 15
+ * D2 : 2 10  4 12  6 14  8 16
+*/
+#define UNPACK_LO_HIGH(S1, S2, D1, D2, S3, S4, D3, D4) \
+    vunpcklpd( ZMM(S1), ZMM(S2), ZMM(D1) ) \
+    vunpckhpd( ZMM(S1), ZMM(S2), ZMM(D2) ) \
+    vunpcklpd( ZMM(S3), ZMM(S4), ZMM(D3) ) \
+    vunpckhpd( ZMM(S3), ZMM(S4), ZMM(D4) )
+
+/**
+ * UPDATE_C_8_BZ: write eight accumulated vectors straight to C.
+ *
+ * C is not read here, so nothing is scaled by beta ("BZ" presumably means
+ * beta == 0 - confirm at the call site). The sources zmm0, zmm4, zmm2,
+ * zmm6, zmm1, zmm5, zmm3 and zmm8 go, in that order, to rcx plus 0, rsi,
+ * 2*rsi, r12, 4*rsi, r13, 2*r12 and rdx. The caller must have loaded
+ * r12, r13 and rdx (presumably 3x, 5x and 7x the stride in rsi - confirm).
+ *
+ * Side effect: rcx is advanced by r14 after the last store.
+*/
+#define UPDATE_C_8_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) ) \
+    vmovupd( zmm5, mem(rcx, r13, 1) ) \
+    vmovupd( zmm3, mem(rcx, r12, 2) ) \
+    vmovupd( zmm8, mem(rcx, rdx, 1) ) \
+    add(r14, rcx)
+
+/**
+ * UPDATE_C_7_BZ: write seven accumulated vectors straight to C.
+ *
+ * C is not read here, so nothing is scaled by beta ("BZ" presumably means
+ * beta == 0 - confirm at the call site). The sources zmm0, zmm4, zmm2,
+ * zmm6, zmm1, zmm5 and zmm3 go, in that order, to rcx plus 0, rsi, 2*rsi,
+ * r12, 4*rsi, r13 and 2*r12. The caller must have loaded r12 and r13
+ * (presumably 3x and 5x the stride in rsi - confirm).
+ * Unlike UPDATE_C_8_BZ, rcx is left unchanged.
+*/
+#define UPDATE_C_7_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) ) \
+    vmovupd( zmm5, mem(rcx, r13, 1) ) \
+    vmovupd( zmm3, mem(rcx, r12, 2) )
+
+/**
+ * UPDATE_C_6_BZ: write six accumulated vectors straight to C.
+ *
+ * C is not read here, so nothing is scaled by beta ("BZ" presumably means
+ * beta == 0 - confirm at the call site). The sources zmm0, zmm4, zmm2,
+ * zmm6, zmm1 and zmm5 go, in that order, to rcx plus 0, rsi, 2*rsi, r12,
+ * 4*rsi and r13 (r12/r13 presumably hold 3x/5x the stride in rsi - confirm).
+ * rcx is left unchanged.
+*/
+#define UPDATE_C_6_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) ) \
+    vmovupd( zmm5, mem(rcx, r13, 1) )
+
+/**
+ * UPDATE_C_5_BZ: write five accumulated vectors straight to C.
+ *
+ * C is not read here, so nothing is scaled by beta ("BZ" presumably means
+ * beta == 0 - confirm at the call site). zmm0, zmm4, zmm2, zmm6 and zmm1
+ * go, in that order, to rcx plus 0, rsi, 2*rsi, r12 and 4*rsi (r12
+ * presumably holds 3x the stride in rsi - confirm). rcx is left unchanged.
+*/
+#define UPDATE_C_5_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) )
+
+/**
+ * UPDATE_C_4_BZ: write four accumulated vectors straight to C.
+ * C is not read here, so nothing is scaled by beta ("BZ" presumably means
+ * beta == 0 - confirm at the call site). zmm0, zmm4, zmm2 and zmm6 go, in
+ * that order, to rcx plus 0, rsi, 2*rsi and r12 (r12 presumably holds 3x
+ * the stride in rsi - confirm). rcx is left unchanged.
+*/
+#define UPDATE_C_4_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) )
+
+/**
+ * UPDATE_C_3_BZ: write three accumulated vectors straight to C.
+ * C is not read here, so nothing is scaled by beta ("BZ" presumably means
+ * beta == 0 - confirm at the call site). zmm0, zmm4 and zmm2 go, in that
+ * order, to rcx, rcx + rsi and rcx + 2*rsi. rcx is left unchanged.
+*/
+#define UPDATE_C_3_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) )
+
+/**
+ * UPDATE_C_2_BZ: write two accumulated vectors straight to C without reading
+ * it ("BZ" presumably means beta == 0 - confirm at the call site). zmm0 goes
+ * to rcx and zmm4 to rcx + rsi. rcx is left unchanged.
+*/
+#define UPDATE_C_2_BZ \
+    vmovupd( zmm0, mem(rcx) ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) )
+
+/**
+ * UPDATE_C_1_BZ: write zmm0 to the address in rcx without reading C first
+ * ("BZ" presumably means beta == 0 - confirm). rcx is left unchanged.
+*/
+#define UPDATE_C_1_BZ \
+    vmovupd( zmm0, mem(rcx) )
+
+/**
+ * UPDATE_C_8: in-place C = beta*C + result for eight vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0, zmm4,
+ * zmm2, zmm6, zmm1, zmm5, zmm3 and zmm8 are modified; rcx += r14 at the end.
+*/
+#define UPDATE_C_8 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 2), zmm31, zmm2 ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 1), zmm31, zmm6 ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 4), zmm31, zmm1 ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) ) \
+\
+    vfmadd231pd( mem(rcx, r13, 1), zmm31, zmm5 ) \
+    vmovupd( zmm5, mem(rcx, r13, 1) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 2), zmm31, zmm3 ) \
+    vmovupd( zmm3, mem(rcx, r12, 2) ) \
+\
+    vfmadd231pd( mem(rcx, rdx, 1), zmm31, zmm8 ) \
+    vmovupd( zmm8, mem(rcx, rdx, 1) ) \
+    add(r14, rcx)
+
+/**
+ * UPDATE_C_7: in-place C = beta*C + result for seven vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0, zmm4,
+ * zmm2, zmm6, zmm1, zmm5 and zmm3 are modified; rcx is left unchanged.
+*/
+#define UPDATE_C_7 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 2), zmm31, zmm2 ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 1), zmm31, zmm6 ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 4), zmm31, zmm1 ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) ) \
+\
+    vfmadd231pd( mem(rcx, r13, 1), zmm31, zmm5 ) \
+    vmovupd( zmm5, mem(rcx, r13, 1) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 2), zmm31, zmm3 ) \
+    vmovupd( zmm3, mem(rcx, r12, 2) )
+
+/**
+ * UPDATE_C_6: in-place C = beta*C + result for six vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0, zmm4,
+ * zmm2, zmm6, zmm1 and zmm5 are modified; rcx is left unchanged.
+*/
+#define UPDATE_C_6 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 2), zmm31, zmm2 ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 1), zmm31, zmm6 ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 4), zmm31, zmm1 ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) ) \
+\
+    vfmadd231pd( mem(rcx, r13, 1), zmm31, zmm5 ) \
+    vmovupd( zmm5, mem(rcx, r13, 1) )
+
+/**
+ * UPDATE_C_5: in-place C = beta*C + result for five vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0, zmm4,
+ * zmm2, zmm6 and zmm1 are modified; rcx is left unchanged.
+*/
+#define UPDATE_C_5 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 2), zmm31, zmm2 ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 1), zmm31, zmm6 ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 4), zmm31, zmm1 ) \
+    vmovupd( zmm1, mem(rcx, rsi, 4) )
+
+/**
+ * UPDATE_C_4: in-place C = beta*C + result for four vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0, zmm4,
+ * zmm2 and zmm6 are modified; rcx is left unchanged.
+*/
+#define UPDATE_C_4 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 2), zmm31, zmm2 ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) ) \
+\
+    vfmadd231pd( mem(rcx, r12, 1), zmm31, zmm6 ) \
+    vmovupd( zmm6, mem(rcx, r12, 1) )
+
+/**
+ * UPDATE_C_3: in-place C = beta*C + result for three vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0, zmm4
+ * and zmm2 are modified; rcx is left unchanged.
+*/
+#define UPDATE_C_3 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 2), zmm31, zmm2 ) \
+    vmovupd( zmm2, mem(rcx, rsi, 2) )
+
+/**
+ * UPDATE_C_2: in-place C = beta*C + result for two vectors. Every FMA adds
+ * zmm31 (beta) times the C vector in memory to a result register, and the
+ * following store writes that register over the same C vector. zmm0 and
+ * zmm4 are modified; rcx is left unchanged.
+*/
+#define UPDATE_C_2 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) ) \
+\
+    vfmadd231pd( mem(rcx, rsi, 1), zmm31, zmm4 ) \
+    vmovupd( zmm4, mem(rcx, rsi, 1) )
+
+/**
+ * UPDATE_C_1: in-place C = beta*C + result for a single vector. The FMA adds
+ * zmm31 (beta) times the C vector at rcx to zmm0, and the store then writes
+ * zmm0 over that same C vector.
+ * zmm0 is modified; rcx is left unchanged.
+*/
+#define UPDATE_C_1 \
+    vfmadd231pd( mem(rcx), zmm31, zmm0 ) \
+    vmovupd( zmm0, mem(rcx) )
+
+/* These kernels assume that the A matrix is stored in column-major order.
+ * The B matrix can be column- or row-major.
+ * The C matrix can be column- or row-major.
+ * Prefetch of C is done assuming that C is column-stored.
+ * Prefetch of B is done assuming that B is column-stored.
+ * Prefetch of B and C when they are row-stored is yet to be added.
+ * The A matrix is not prefetched in the edge-case kernels.
+ */
+
+void bli_dgemmsup_rv_zen4_asm_24x8
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register
+     * So, we use an 8-bit mask to specify which elements to be loaded/stored
+     * into/from the register. m_left % 8 specifies how many number of elements
+     *  are to be loaded/stored into/from the last register.
+     * For example, if m_left = 19, m0 & 7 becomes 3 which indicates that 3 elements
+     * have to be loaded/stored into/from register, so shift 0xff(11111111) by (8-3)
+     *  times to the right which makes the mask to be (00000111)
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm28, zmm28, zmm28)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm29, zmm29, zmm29)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm26, zmm26, zmm26)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm27,zmm27, zmm27)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm24, zmm24, zmm24)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+    vxorpd(zmm25, zmm25, zmm25)
+    vxorpd(zmm18, zmm18, zmm18)
+    vxorpd(zmm19, zmm19, zmm19)
+    vxorpd(zmm22, zmm22, zmm22)
+    vxorpd(zmm20, zmm20, zmm20)
+    vxorpd(zmm21,zmm21, zmm21)
+    vxorpd(zmm23, zmm23, zmm23)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 8+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 8
+
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(8), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 3
+        prefetchw0( mem(rdx, 128))                        // prefetch C
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 8
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 )
+        vmovupd( 0x80(rax),zmm5 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+
+        // ---------------------------------- iteration 8
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vfmadd231pd( zmm5,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vfmadd231pd( zmm5,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vfmadd231pd( zmm5,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vfmadd231pd( zmm5,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vfmadd231pd( zmm5,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vfmadd231pd( zmm5,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        vfmadd231pd( zmm5,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        vfmadd231pd( zmm5,zmm31,zmm23 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 )
+        vmovupd( 0x80(rax),zmm2 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vfmadd231pd( zmm2,zmm30,zmm28 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vfmadd231pd( zmm2,zmm31,zmm29 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vfmadd231pd( zmm2,zmm30,zmm26 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vfmadd231pd( zmm2,zmm31,zmm27 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vfmadd231pd( zmm2,zmm30,zmm24 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vfmadd231pd( zmm2,zmm31,zmm25 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        vfmadd231pd( zmm2,zmm30,zmm22 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+        vfmadd231pd( zmm2,zmm31,zmm23 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm28,zmm28 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm29,zmm29 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm26,zmm26 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm27,zmm27 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm24,zmm24 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm17,zmm17 )
+    vmulpd( zmm30,zmm25,zmm25 )
+    vmulpd( zmm30,zmm18,zmm18 )
+    vmulpd( zmm30,zmm19,zmm19 )
+    vmulpd( zmm30,zmm22,zmm22 )
+    vmulpd( zmm30,zmm20,zmm20 )
+    vmulpd( zmm30,zmm21,zmm21 )
+    vmulpd( zmm30,zmm23,zmm23 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( 0x80(rcx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm28)
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( 0x80(rcx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm29)
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( 0x80(rcx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm26)
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( 0x80(rcx,r13,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm27)
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( 0x80(rdx),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm24)
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( 0x40(rdx,rdi,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm17)
+    vmovupd( zmm17,0x40(rdx,rdi,1))
+    vmovupd( 0x80(rdx,rdi,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm25)
+    vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( 0x40(rdx,rdi,2),zmm1)
+    vfmadd231pd( zmm1,zmm31,zmm19)
+    vmovupd( zmm19,0x40(rdx,rdi,2))
+    vmovupd( 0x80(rdx,rdi,2),zmm2 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm2,zmm31,zmm22)
+    vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm20)
+    vmovupd( zmm20,(rdx,r13,1))
+    vmovupd( 0x40(rdx,r13,1),zmm4)
+    vfmadd231pd( zmm4,zmm31,zmm21)
+    vmovupd( zmm21,0x40(rdx,r13,1))
+    vmovupd( 0x80(rdx,r13,1),zmm5 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm5,zmm31,zmm23)
+    vmovupd( zmm23,0x80(rdx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+    UPDATE_C_8
+    //First 8x8 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_C_8
+    //Second 8x8 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //Third 7x8 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx))
+    vmovupd( zmm28,0x80(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1))
+    vmovupd( zmm29,0x80(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2))
+    vmovupd( zmm26,0x80(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1))
+    vmovupd( zmm27,0x80(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx))
+    vmovupd( zmm24,0x80(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( zmm17,0x40(rdx,rdi,1))
+    vmovupd( zmm25,0x80(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( zmm19,0x40(rdx,rdi,2))
+    vmovupd( zmm22,0x80(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm20,(rdx,r13,1))
+    vmovupd( zmm21,0x40(rdx,r13,1))
+    vmovupd( zmm23,0x80(rdx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_C_8_BZ
+    //First 8x8 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    UPDATE_C_8_BZ
+    //Second 8x8 tile updated
+
+    UNPACK_LO_HIGH(29, 28, 0, 1, 27, 26, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(25, 24, 0, 1, 23, 22, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(16), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_16x8
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* A 512-bit register holds 8 double-precision elements, so an 8-bit
+     * mask is used to specify which elements are loaded/stored into/from
+     * the register. m0 & 7 (i.e. m0 % 8) gives the number of elements that
+     * are to be loaded/stored into/from the last register.
+     * For example, if m0 = 19, m0 & 7 becomes 3, which indicates that 3
+     * elements have to be loaded/stored, so 0xff (11111111) is shifted right
+     * by (8 - 3) bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // For special cases where m_left = 24/16/8, all 8 elements have to be loaded/stored
+    // So, mask becomes 0xff(11111111)
+    if (mask == 0) mask = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm7, zmm7, zmm7)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm9, zmm9, zmm9)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm11, zmm11, zmm11)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm13, zmm13, zmm13)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm15, zmm15, zmm15)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm17, zmm17, zmm17)
+    vxorpd(zmm18, zmm18, zmm18)
+    vxorpd(zmm19, zmm19, zmm19)
+    vxorpd(zmm20, zmm20, zmm20)
+    vxorpd(zmm21,zmm21, zmm21)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 8+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 8
+
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(8), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 2
+        prefetchw0( mem(rdx, 64))                          // prefetch C
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 8
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 )                           // load A
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 )                           // load A
+        vmovupd( 0x40(rax),zmm4 MASK_KZ(2) )     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vfmadd231pd( zmm1,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vfmadd231pd( zmm1,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vfmadd231pd( zmm1,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vfmadd231pd( zmm1,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vfmadd231pd( zmm1,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vfmadd231pd( zmm1,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        vfmadd231pd( zmm1,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        vfmadd231pd( zmm1,zmm31,zmm21 )
+
+        // ---------------------------------- iteration 8
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vfmadd231pd( zmm4,zmm30,zmm7 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vfmadd231pd( zmm4,zmm31,zmm9 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vfmadd231pd( zmm4,zmm30,zmm11 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vfmadd231pd( zmm4,zmm31,zmm13 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vfmadd231pd( zmm4,zmm30,zmm15 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vfmadd231pd( zmm4,zmm31,zmm17 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        vfmadd231pd( zmm4,zmm30,zmm19 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        vfmadd231pd( zmm4,zmm31,zmm21 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)                                           // remainder: process the k_left leftover iterations one at a time
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 )                           // load 8 doubles of A at rax
+        vmovupd( 0x40(rax),zmm1 MASK_KZ(2) )     // load the following A elements under mask k2; masked-off lanes are zeroed
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )                     // zmm30 = B value at rbx
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )                // zmm31 = B value at rbx + r9
+        vfmadd231pd( zmm0,zmm30,zmm6 )                     // zmm6  += zmm0 * B[rbx]
+        vfmadd231pd( zmm1,zmm30,zmm7 )                     // zmm7  += zmm1 * B[rbx]
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )                // zmm30 = B value at rbx + 2*r9
+        vfmadd231pd( zmm0,zmm31,zmm8 )                     // zmm8  += zmm0 * B[rbx + r9]
+        vfmadd231pd( zmm1,zmm31,zmm9 )                     // zmm9  += zmm1 * B[rbx + r9]
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )               // zmm31 = B value at rbx + r13
+        vfmadd231pd( zmm0,zmm30,zmm10 )                    // zmm10 += zmm0 * B[rbx + 2*r9]
+        vfmadd231pd( zmm1,zmm30,zmm11 )                    // zmm11 += zmm1 * B[rbx + 2*r9]
+        vbroadcastsd( mem(r12),zmm30 )                     // zmm30 = B value at r12 (second B pointer)
+        add( r8,rbx )                                     // b += rs_b (all loads through rbx for this iteration are done)
+        vfmadd231pd( zmm0,zmm31,zmm12 )                    // zmm12 += zmm0 * B[rbx + r13] (value loaded before rbx advanced)
+        vfmadd231pd( zmm1,zmm31,zmm13 )                    // zmm13 += zmm1 * B[rbx + r13]
+        vbroadcastsd( mem(r12,r9,1),zmm31 )                // zmm31 = B value at r12 + r9
+        vfmadd231pd( zmm0,zmm30,zmm14 )                    // zmm14 += zmm0 * B[r12]
+        vfmadd231pd( zmm1,zmm30,zmm15 )                    // zmm15 += zmm1 * B[r12]
+        vbroadcastsd( mem(r12,r9,2),zmm30 )                // zmm30 = B value at r12 + 2*r9
+        vfmadd231pd( zmm0,zmm31,zmm16 )                    // zmm16 += zmm0 * B[r12 + r9]
+        vfmadd231pd( zmm1,zmm31,zmm17 )                    // zmm17 += zmm1 * B[r12 + r9]
+        vbroadcastsd( mem(r12,r13,1),zmm31 )               // zmm31 = B value at r12 + r13
+        vfmadd231pd( zmm0,zmm30,zmm18 )                    // zmm18 += zmm0 * B[r12 + 2*r9]
+        vfmadd231pd( zmm1,zmm30,zmm19 )                    // zmm19 += zmm1 * B[r12 + 2*r9]
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )                    // zmm20 += zmm0 * B[r12 + r13] (value loaded before r12 advanced)
+        vfmadd231pd( zmm1,zmm31,zmm21 )                    // zmm21 += zmm1 * B[r12 + r13]
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // rdx = address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // zmm30 = alpha in every lane
+    mov(var(beta), rax)                                    // rax = address of beta (read again in .DROWSTORED to re-broadcast beta)
+    vbroadcastsd(mem(rax), zmm31)                           // zmm31 = beta in every lane
+
+    // scale the accumulators by alpha (the k-loops above accumulate into zmm6-zmm21)
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm7,zmm7 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm9,zmm9 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm11,zmm11 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm13,zmm13 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm15,zmm15 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm17,zmm17 )
+    vmulpd( zmm30,zmm18,zmm18 )
+    vmulpd( zmm30,zmm19,zmm19 )
+    vmulpd( zmm30,zmm22,zmm22 )                            // NOTE(review): zmm22 is not accumulated into or stored by the nearby loops/stores - confirm this is needed
+    vmulpd( zmm30,zmm20,zmm20 )
+    vmulpd( zmm30,zmm21,zmm21 )
+    vmulpd( zmm30,zmm23,zmm23 )                            // NOTE(review): same question as zmm22 - looks like a leftover from a wider kernel; confirm
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4*cs_c (rdi presumably holds cs_c in bytes; see the cmp below)
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)                               // ymm2 = 0.0, used as the comparand for beta
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0 (ZF is also set when beta is NaN, i.e. unordered)
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8, i.e. cs_c == 1
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case; otherwise fall through to .DCOLSTORED
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx))
+    vmovupd( 0x40(rcx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm7)
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( 0x40(rcx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm9)
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( 0x40(rcx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm11)
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( 0x40(rcx,r13,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm13)
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx))
+    vmovupd( 0x40(rdx),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm15)
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( 0x40(rdx,rdi,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm17)
+    vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0)
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( 0x40(rdx,rdi,2),zmm1 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm1,zmm31,zmm19)
+    vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,r13,1),zmm3)
+    vfmadd231pd( zmm3,zmm31,zmm20)
+    vmovupd( zmm20,(rdx,r13,1))
+    vmovupd( 0x40(rdx,r13,1),zmm4 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm4,zmm31,zmm21)
+    vmovupd( zmm21,0x40(rdx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c (rsi already holds rs_c * sizeof(double))
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c (overwrites the 3*cs_c value computed in .DPOSTACCUM)
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)                           // r14 = 8*rs_c
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)             // first tile: operates on accumulators zmm6/8/10/12
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)               // zmm30/zmm31 (alpha/beta) are passed here as registers - presumably overwritten
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)           // first tile: operates on accumulators zmm14/16/18/20
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)                        // re-broadcast beta; rax still holds the beta address from .DPOSTACCUM
+    UPDATE_C_8
+    // First 8x8 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)             // second tile: operates on the mask-loaded accumulators zmm7/9/11/13
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)           // second tile: operates on accumulators zmm15/17/19/21
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)                                    // rdi = m0
+    sub(imm(8), rdi)                                     // rdi = m0 - 8; selects one of the .UPDATEn paths below
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)                                         // any value outside 0..8 falls through to .UPDATE8
+
+    LABEL(.UPDATE8)
+    UPDATE_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //7x8 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx))
+    vmovupd( zmm7,0x40(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1))
+    vmovupd( zmm9,0x40(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2))
+    vmovupd( zmm11,0x40(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1))
+    vmovupd( zmm13,0x40(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx))
+    vmovupd( zmm15,0x40(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1))
+    vmovupd( zmm17,0x40(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2))
+    vmovupd( zmm19,0x40(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm20,(rdx,r13,1))
+    vmovupd( zmm21,0x40(rdx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    UPDATE_C_8_BZ
+    //First 8x8 tile updated
+
+    UNPACK_LO_HIGH(9, 7, 0, 1, 13, 11, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 7, 9)
+
+    UNPACK_LO_HIGH(17, 15, 0, 1, 21, 19, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 7, 4, 5, 12, 9, 6, 8)
+
+    mov(var(m0), rdi)
+    sub(imm(8), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
+
+
+void bli_dgemmsup_rv_zen4_asm_8x8
+(
+       conj_t    conja,
+       conj_t    conjb,
+       dim_t     m0,
+       dim_t     n0,
+       dim_t     k0,
+       double*    restrict alpha,
+       double*    restrict a, inc_t rs_a0, inc_t cs_a0,
+       double*    restrict b, inc_t rs_b0, inc_t cs_b0,
+       double*    restrict beta,
+       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
+       auxinfo_t* restrict data,
+       cntx_t*    restrict cntx
+     )
+{
+    AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_7);
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t rs_a   = rs_a0;
+    uint64_t cs_a   = cs_a0;
+    uint64_t rs_b   = rs_b0;
+    uint64_t cs_b   = cs_b0;
+    uint64_t rs_c   = rs_c0;
+    uint64_t cs_c   = cs_c0;
+
+    uint64_t ps_a = bli_auxinfo_ps_a( data );
+    uint64_t ps_a8  = ps_a * sizeof( double );
+
+    uint64_t k_iter = (uint64_t)k0 / 8;
+    uint64_t k_left = (uint64_t)k0 % 8;
+
+    /* 8 double precision elements can be loaded into a 512-bit register,
+     * so an 8-bit mask is used to specify which elements are loaded/stored
+     * into/from the register. m0 % 8 (computed as m0 & 7) gives the number of
+     * elements to be loaded/stored into/from the last register.
+     * For example, if m0 = 19, m0 & 7 is 3, which indicates that 3 elements
+     * have to be loaded/stored into/from the register, so 0xff (11111111) is
+     * shifted right by (8-3) bits, which makes the mask 00000111.
+     */
+    uint8_t mask = 0xff >> (0x8 - (m0 & 7)); // calculate mask based on m_left
+    // Special case: when m0 is a multiple of 8 (m0 & 7 == 0) the shift above yields 0,
+    // but all 8 elements have to be loaded/stored, so the mask becomes 0xff (11111111).
+    if (mask == 0) mask = 0xff;
+
+        // -------------------------------------------------------------------------
+    begin_asm()
+
+    mov(var(a), rax)                // load address of a
+    mov(var(cs_a), r10)             // load cs_a
+    mov(var(b), rbx)                // load address of b
+    mov(var(rs_b), r8)              // load rs_b
+    mov(var(cs_b), r9)              // load cs_b
+    mov(var(c), rcx)                // load address of c
+    mov(var(cs_c), rdi)             // load cs_c
+    mov(var(mask), rdx)             // load mask
+    kmovw(edx, k(2))                // move mask to k2 register
+    lea(mem(, r8, 8), r8)           // rs_b *= sizeof(double)
+    lea(mem(, r9, 8), r9)           // cs_b *= sizeof(double)
+    lea(mem(, r10, 8), r10)         // cs_a *= sizeof(double)
+    lea(mem(, rdi, 8), rdi)         // cs_c *= sizeof(double)
+    lea(mem(r9, r9, 2 ), r13)       // r13 = 3*cs_b
+    // if n > 4, a second pointer(r12) which points to rbx + 4*cs_b
+    //is also used to traverse B matrix
+    lea(mem(rbx, r9, 4), r12)       // r12 = rbx + 4*cs_b
+    lea(mem(rcx, 7*8), rdx)         // C for prefetching
+    lea(mem(rbx, r8, 8, 7*8), r11)  // r11 = rbx + 8*rs_b(B for prefetching)
+    // if n > 4, a second pointer which point to r11 + 4*cs_b
+    //is also used to prefetch from B matrix
+    lea(mem(r11, r9, 4), r15)       // r15 = r11 + 4* cs_b(B for prefetching)
+
+    /* Register usage: zmm0-5 are used to load A matrix
+     *                 zmm6-29 are used for accumulation
+     *                 zmm30-31 are used for broadcasting B matrix
+     */
+
+    // zero out all accumulation registers
+    vxorpd(zmm6, zmm6, zmm6)
+    vxorpd(zmm8, zmm8, zmm8)
+    vxorpd(zmm10, zmm10, zmm10)
+    vxorpd(zmm12, zmm12, zmm12)
+    vxorpd(zmm14, zmm14, zmm14)
+    vxorpd(zmm16, zmm16, zmm16)
+    vxorpd(zmm18, zmm18, zmm18)
+    vxorpd(zmm20, zmm20, zmm20)
+
+    // K is unrolled by 8 to facilitate prefetch of B
+    // Assuming B to be col-stored, for each iteration of K,
+    //one cacheline of B_next is prefetched where b_next = b + (unroll)*rs_b
+    label(.DLOOPKITER)                                     // main loop
+    mov(var(k_iter), rsi)                                  // i = k_iter
+    sub(imm( 8+TAIL_NITER), rsi)                           // i -= NR + TAIL_NITER
+    jle(.PREFETCHLOOP)                                     // jump if i <= 0
+
+    label(.LOOP1)
+
+        // ---------------------------------- iteration 1
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 2
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 3
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 4
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 5
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 6
+
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 7
+
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 8
+
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer to b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP1)                                            // iterate again if i != 0.
+
+    label(.PREFETCHLOOP)
+    add(imm(8), rsi)                                       // i += NR
+    jle(.TAILITER)                                         // jump if i <= 0.
+
+    label(.LOOP2)
+
+        // ---------------------------------- iteration 1
+        prefetchw0( mem(rdx))                              // prefetch C
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 8
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        lea(mem(rdx, rdi, 1), rdx)                         // C += cs_c
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // second pointer of b_next += 8*rs_b
+        sub(imm(1), rsi)                                   // i -= 1
+    jnz(.LOOP2)                                            // iterate again if i != 0.
+    label(.TAILITER)
+    add(imm(TAIL_NITER), rsi)                              // i += TAIL_NITER
+    jle(.TAIL)                                             // jump if i <= 0
+
+    label(.LOOP3)
+
+        // ---------------------------------- iteration 1
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 2
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 3
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 4
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r11,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 5
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 6
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 7
+        vmovupd( mem(rax),zmm3 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        prefetch( 0,mem(r15,r9,2) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+
+        // ---------------------------------- iteration 8
+        prefetch( 0,mem(r15,r13,1) )                             // prefetch B
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm3,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm3,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm3,zmm31,zmm20 )
+        lea(mem(r11,r8,8), r11)                            // b_next += 8*rs_b
+        lea(mem(r15,r8,8), r15)                            // Second pointer of b_next += 8*rs_b
+        dec(rsi)                                           // i -= 1
+    jnz(.LOOP3)                                            // iterate again if i != 0.
+
+
+    label(.TAIL)
+    mov(var(k_left), rsi)                                  // i = k_left
+    test(rsi, rsi)                                         // check i via logical AND
+    je(.DPOSTACCUM)                                        // if i == 0, jump to post-accumulation
+
+    label(.DLOOPKLEFT)                                     // k_left loop
+        vmovupd( mem(rax),zmm0 MASK_KZ(2) )                           // load A     // Load A with mask and zero hint
+        add( r10,rax )                                     // a += cs_a
+        vbroadcastsd( mem(rbx),zmm30 )
+        vbroadcastsd( mem(rbx,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm6 )
+        vbroadcastsd( mem(rbx,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm8 )
+        vbroadcastsd( mem(rbx,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm10 )
+        vbroadcastsd( mem(r12),zmm30 )
+        add( r8,rbx )                                     // b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm12 )
+        vbroadcastsd( mem(r12,r9,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm14 )
+        vbroadcastsd( mem(r12,r9,2),zmm30 )
+        vfmadd231pd( zmm0,zmm31,zmm16 )
+        vbroadcastsd( mem(r12,r13,1),zmm31 )
+        vfmadd231pd( zmm0,zmm30,zmm18 )
+        add( r8,r12 )                                     // second pointer of b += rs_b
+        vfmadd231pd( zmm0,zmm31,zmm20 )
+        dec(rsi)                                           // i -= 1
+    jne(.DLOOPKLEFT)                                       // iterate again if i != 0.
+
+
+    label(.DPOSTACCUM)
+    mov(var(alpha), rdx)                                   // load address of alpha
+    vbroadcastsd(mem(rdx), zmm30)                           // broadcast alpha
+    mov(var(beta), rax)                                    // load address of beta
+    vbroadcastsd(mem(rax), zmm31)                           // broadcast beta
+
+    // scale by alpha
+    vmulpd( zmm30,zmm6,zmm6 )
+    vmulpd( zmm30,zmm8,zmm8 )
+    vmulpd( zmm30,zmm10,zmm10 )
+    vmulpd( zmm30,zmm12,zmm12 )
+    vmulpd( zmm30,zmm14,zmm14 )
+    vmulpd( zmm30,zmm16,zmm16 )
+    vmulpd( zmm30,zmm18,zmm18 )
+    vmulpd( zmm30,zmm20,zmm20 )
+
+
+    mov(var(rs_c), rsi)                                    // load rs_c
+    lea(mem(, rsi, 8), rsi)                                // rsi = rs_c * sizeof(double)
+    lea(mem(rcx, rdi, 4), rdx)                             // rdx = rcx + 4 * cs_c
+    lea(mem(rdi, rdi, 2), r13)                             // r13 = 3*cs_c
+    vxorpd(ymm2, ymm2, ymm2)
+    vucomisd(xmm2, xmm31)                                   // set ZF if beta == 0
+    je(.DBETAZERO)                                         // if ZF == 1, jump to beta == 0 case
+
+
+    cmp(imm(8), rdi)                                       // set ZF if (8*cs_c) == 8
+
+
+    jz(.DROWSTORED)                                        // jump to row storage case
+
+    label(.DCOLSTORED)
+    vmovupd( mem(rcx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm6)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm8)
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm10)
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rcx,r13,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm12)
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm14)
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm16)
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,rdi,2),zmm0 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm0,zmm31,zmm18)
+    vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( mem(rdx,r13,1),zmm3 MASK_KZ(2))        // Load C using mask and zero hint
+    vfmadd231pd( zmm3,zmm31,zmm20)
+    vmovupd( zmm20,(rdx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                           // jump to end.
+
+    label(.DROWSTORED)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    vbroadcastsd(mem(rax), zmm31)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0)
+
+    LABEL(.UPDATE8)
+    UPDATE_C_8
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7)
+    UPDATE_C_7
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6)
+    UPDATE_C_6
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5)
+    UPDATE_C_5
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4)
+    UPDATE_C_4
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3)
+    UPDATE_C_3
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2)
+    UPDATE_C_2
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1)
+    UPDATE_C_1
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0)
+    //7x8 tile updated
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DBETAZERO)
+    cmp(imm(8), rdi)                                     // set ZF if (8*cs_c) == 8
+
+    jz(.DROWSTORBZ)                                      // jump to row storage case
+    label(.DCOLSTORBZ)
+    vmovupd( zmm6,(rcx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm8,(rcx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm10,(rcx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm12,(rcx,r13,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm14,(rdx) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm16,(rdx,rdi,1) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm18,(rdx,rdi,2) MASK_(k(2)))                // store to C with mask
+    vmovupd( zmm20,(rdx,r13,1) MASK_(k(2)))                // store to C with mask
+
+    jmp(.DDONE)                                          // jump to end.
+
+
+    label(.DROWSTORBZ)
+    // r12 = 3*rs_c
+    lea(mem(rsi,  rsi,  2), r12)
+    // r13 = 5*rs_c
+    lea(mem(r12, rsi,  2), r13)
+    // rdx = 7*rs_c
+    lea(mem(r12, rsi,  4), rdx)
+    lea(mem(   , rsi, 8), r14)
+    UNPACK_LO_HIGH(8, 6, 0, 1, 12, 10, 2, 3)
+    SHUFFLE_DATA(2, 0, 4, 5, 3, 1, 30, 31)
+
+    UNPACK_LO_HIGH(16, 14, 0, 1, 20, 18, 2, 3)
+    SHUFFLE_DATA(2, 0, 6, 8, 3, 1, 10, 12)
+
+    SHUFFLE_DATA(6, 4, 0, 1, 8, 5, 2, 3)
+    SHUFFLE_DATA(10, 30, 4, 5, 12, 31, 6, 8)
+
+    mov(var(m0), rdi)
+    cmp(imm(8), rdi)
+    JZ(.UPDATE8BZ)
+    cmp(imm(7), rdi)
+    JZ(.UPDATE7BZ)
+    cmp(imm(6), rdi)
+    JZ(.UPDATE6BZ)
+    cmp(imm(5), rdi)
+    JZ(.UPDATE5BZ)
+    cmp(imm(4), rdi)
+    JZ(.UPDATE4BZ)
+    cmp(imm(3), rdi)
+    JZ(.UPDATE3BZ)
+    cmp(imm(2), rdi)
+    JZ(.UPDATE2BZ)
+    cmp(imm(1), rdi)
+    JZ(.UPDATE1BZ)
+    cmp(imm(0), rdi)
+    JZ(.UPDATE0BZ)
+
+    LABEL(.UPDATE8BZ)
+    UPDATE_C_8_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE7BZ)
+    UPDATE_C_7_BZ
+    jmp(.DDONE)
+
+    LABEL(.UPDATE6BZ)
+    UPDATE_C_6_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE5BZ)
+    UPDATE_C_5_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE4BZ)
+    UPDATE_C_4_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE3BZ)
+    UPDATE_C_3_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE2BZ)
+    UPDATE_C_2_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE1BZ)
+    UPDATE_C_1_BZ
+    jmp(.DDONE)                                              // jump to end.
+
+    LABEL(.UPDATE0BZ)
+    label(.DDONE)
+
+
+    vzeroupper()
+
+    end_asm(
+      : // output operands (none)
+      : // input operands
+        [k_iter] "m" (k_iter),
+        [k_left] "m" (k_left),
+        [a]      "m" (a),
+        [rs_a]   "m" (rs_a),
+        [cs_a]   "m" (cs_a),
+        [ps_a8]  "m" (ps_a8),
+        [b]      "m" (b),
+        [rs_b]   "m" (rs_b),
+        [cs_b]   "m" (cs_b),
+        [alpha]  "m" (alpha),
+        [beta]   "m" (beta),
+        [c]      "m" (c),
+        [rs_c]   "m" (rs_c),
+        [cs_c]   "m" (cs_c),
+        [n0]     "m" (n0),
+        [m0]     "m" (m0),
+        [mask]   "m" (mask)
+      : // register clobber list
+        "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
+        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+        "xmm2", "xmm31",
+        "ymm2",
+        "zmm0", "zmm1", "zmm2", "zmm3",
+        "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
+        "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
+        "zmm16", "zmm17", "zmm18", "zmm19",
+        "zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26",
+        "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
+        "k2", "memory"
+    )
+    AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_7);
+}
diff --git a/kernels/zen4/CMakeLists.txt b/kernels/zen4/CMakeLists.txt
index c22c5ba143..7878918053 100644
--- a/kernels/zen4/CMakeLists.txt
+++ b/kernels/zen4/CMakeLists.txt
@@ -1,6 +1,7 @@
-##Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.##
+##Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.##
+remove_definitions(/arch:AVX2)  # strip the /arch:AVX2 flag here; presumably so the zen4 (AVX-512) kernels are not built with it -- TODO confirm
 
 add_subdirectory(1)
+add_subdirectory(1m)  # additional kernel subdirectory pulled into the zen4 build
 add_subdirectory(3)
-
-
+add_subdirectory(aocl_smart)  # its CMakeLists.txt adds bli_aocl_smart.c to the project target
\ No newline at end of file
diff --git a/kernels/zen4/README b/kernels/zen4/README
deleted file mode 100644
index c9e16c2735..0000000000
--- a/kernels/zen4/README
+++ /dev/null
@@ -1 +0,0 @@
-Currently there are no zen4 specific kernels, however, this folder is required for the the build system.
diff --git a/kernels/zen4/aocl_smart/CMakeLists.txt b/kernels/zen4/aocl_smart/CMakeLists.txt
new file mode 100644
index 0000000000..ef10975d24
--- /dev/null
+++ b/kernels/zen4/aocl_smart/CMakeLists.txt
@@ -0,0 +1,6 @@
+##Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.##
+
+# Add the zen4 SUP/native threshold source to the library target.
+target_sources("${PROJECT_NAME}"
+    PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/bli_aocl_smart.c"
+)
diff --git a/kernels/zen4/aocl_smart/bli_aocl_smart.c b/kernels/zen4/aocl_smart/bli_aocl_smart.c
new file mode 100644
index 0000000000..96e45b7139
--- /dev/null
+++ b/kernels/zen4/aocl_smart/bli_aocl_smart.c
@@ -0,0 +1,71 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "blis.h"
+
+/* Decides whether a gemm problem takes the SUP path or the native
+   path under the zen4 configuration, based on the operand sizes.
+   * TRUE  : the dimensions fall in the SUP range
+   * FALSE : the dimensions fall in the native range
+*/
+bool bli_cntx_gemmsup_thresh_is_met_zen4( obj_t* a, obj_t* b, obj_t* c, cntx_t* cntx )
+{
+	// Only double precision uses the zen4-specific limits below; every
+	// other datatype is forwarded to the generic context-based check.
+	if ( bli_obj_dt( c ) != BLIS_DOUBLE )
+		return bli_cntx_l3_sup_thresh_is_met( a, b, c, cntx );
+
+	const dim_t   k_dim   = bli_obj_width_after_trans( a );
+	const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b );
+
+	// When the context reports that the SUP kernel dislikes this storage
+	// combination, the length and width of C exchange roles as m and n.
+	const bool swap_mn =
+		bli_cntx_l3_sup_ker_dislikes_storage_of( c, stor_id, cntx );
+
+	const dim_t m_dim = swap_mn ? bli_obj_width( c )  : bli_obj_length( c );
+	const dim_t n_dim = swap_mn ? bli_obj_length( c ) : bli_obj_width( c );
+
+	// Skinny problems: m or n is below 1000.
+	const bool is_skinny = ( m_dim < 1000 ) || ( n_dim < 1000 );
+
+	// Small problems: m, n and k are all below 5000.
+	const bool is_small = ( m_dim < 5000 ) &&
+	                      ( n_dim < 5000 ) &&
+	                      ( k_dim < 5000 );
+
+	if ( is_skinny || is_small )
+		return TRUE;
+	return FALSE;
+}
diff --git a/kernels/zen4/bli_kernels_zen4.h b/kernels/zen4/bli_kernels_zen4.h
index e518a86047..701e2ecb49 100644
--- a/kernels/zen4/bli_kernels_zen4.h
+++ b/kernels/zen4/bli_kernels_zen4.h
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -38,5 +38,172 @@
 AMAXV_KER_PROT( float,    s, amaxv_zen_int_avx512 )
 AMAXV_KER_PROT( double,   d, amaxv_zen_int_avx512 )
 
+// scalv (AVX512 intrinsics)
+SCALV_KER_PROT( float,   s, scalv_zen_int_avx512 )
+SCALV_KER_PROT( double,  d, scalv_zen_int_avx512 )
+
+// dotv (AVX512 intrinsics)
+DOTV_KER_PROT( float,    s, dotv_zen_int_avx512 )
+DOTV_KER_PROT( double,   d, dotv_zen_int_avx512 )
+
+// axpyv (AVX512 intrinsics)
+AXPYV_KER_PROT( float,    s, axpyv_zen_int_avx512 )
+AXPYV_KER_PROT( double,   d, axpyv_zen_int_avx512 )
+
 GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_l_zen_asm_16x14)
-GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_u_zen_asm_16x14)
\ No newline at end of file
+GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_u_zen_asm_16x14)
+GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_l_zen4_asm_8x24)
+GEMMTRSM_UKR_PROT( double,   d, gemmtrsm_u_zen4_asm_8x24)
+
+// packing kernels
+PACKM_KER_PROT( double,   d, packm_zen4_asm_16xk )
+PACKM_KER_PROT( double,   d, packm_zen4_asm_8xk )
+PACKM_KER_PROT( double,   d, packm_zen4_asm_24xk )
+PACKM_KER_PROT( double,   d, packm_zen4_asm_32xk )
+PACKM_KER_PROT( double,   d, packm_32xk_zen4_ref )
+PACKM_KER_PROT( dcomplex, z, packm_zen4_asm_12xk )
+PACKM_KER_PROT( dcomplex, z, packm_zen4_asm_4xk )
+
+// native gemm micro-kernels (double and dcomplex)
+GEMM_UKR_PROT( double,   d, gemm_zen4_asm_32x6 )
+GEMM_UKR_PROT( double,   d, gemm_zen4_asm_8x24 )
+GEMM_UKR_PROT( dcomplex, z, gemm_zen4_asm_12x4 )
+
+// sgemm rv sup
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x64m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x48m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x32m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x16m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x64m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x48m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x32m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x16m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x64m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x48m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x32m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x16m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x64m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x48m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x32m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x16m_avx512 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_6x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_4x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_2x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_1x64n_avx512 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x32_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_5x16_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x32_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rv_zen_asm_3x16_avx512 )
+
+// sgemm rd sup
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x64m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x48m_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x32m_avx512 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_3x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x64n_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_6x64n_avx512 )
+
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_5x64_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_4x64_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_3x64_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x64_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x64_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_5x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_4x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_3x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x48_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_5x32_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_4x32_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_3x32_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_2x32_avx512 )
+GEMMSUP_KER_PROT( float,   s, gemmsup_rd_zen_asm_1x32_avx512 )
+
+TRSMSMALL_PROT(trsm_small_AVX512)
+TRSMSMALL_KER_PROT( d, trsm_small_AutXB_AlXB_AVX512 )
+TRSMSMALL_KER_PROT( d, trsm_small_XAltB_XAuB_AVX512 )
+TRSMSMALL_KER_PROT( d, trsm_small_XAutB_XAlB_AVX512 )
+TRSMSMALL_KER_PROT( d, trsm_small_AltXB_AuXB_AVX512 )
+
+#ifdef BLIS_ENABLE_OPENMP
+TRSMSMALL_PROT(trsm_small_mt_AVX512)
+#endif
+
+// Dgemm sup RV kernels
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x8m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x7m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x6m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x5m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x4m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x3m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x2m)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x1m)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x8)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x8)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x8)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x7)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x7)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x7)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x6)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x6)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x6)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x5)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x5)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x5)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x4)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x4)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x4)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x3)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x3)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x3)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x2)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x2)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x2)
+
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_24x1)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_16x1)
+GEMMSUP_KER_PROT( double,  d, gemmsup_rv_zen4_asm_8x1)
+
+// Zgemm sup CV kernels
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_12x4m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_12x3m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_12x2m )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_12x1m )
+
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_8x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_8x3 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_8x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_8x1 )
+
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_4x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_4x3 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_4x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_4x1 )
+
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_2x4 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_2x3 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_2x2 )
+GEMMSUP_KER_PROT( dcomplex,   z, gemmsup_cv_zen4_asm_2x1 )
+
+// threshold functions: TRUE when a gemm problem should take the SUP path on zen4, FALSE for the native path
+bool bli_cntx_gemmsup_thresh_is_met_zen4
+	 (
+		obj_t*  a,    // matrix A; its width after transposition is used as k
+		obj_t*  b,    // matrix B; used with A and C to derive the storage combination
+		obj_t*  c,    // matrix C; supplies the datatype and the m and n dimensions
+		cntx_t* cntx  // context queried for kernel storage preference and the generic threshold
+	 );
diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c
new file mode 100644
index 0000000000..592af7f042
--- /dev/null
+++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_6x64rowmajor_bf16_amd512vnni.c
@@ -0,0 +1,1609 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_f32_kern_macros.h"
+
+#ifdef LPGEMM_BF16_NOT_SUPPORTED
+
+// BF16 ISA is not supported by gcc < 10. When LPGEMM_BF16_NOT_SUPPORTED is defined, emit a dummy kernel with an empty (no-op) body in place of the real 6x64 kernel.
+LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64)
+{}
+
+#else
+
+// 6x64 bf16 kernel
+LPGEMM_MAIN_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x64_DISABLE,
+						  &&POST_OPS_BIAS_6x64,
+						  &&POST_OPS_RELU_6x64,
+						  &&POST_OPS_RELU_SCALE_6x64,
+						  &&POST_OPS_GELU_TANH_6x64,
+						  &&POST_OPS_GELU_ERF_6x64,
+						  &&POST_OPS_CLIP_6x64,
+						  &&POST_OPS_DOWNSCALE_6x64
+						};
+	dim_t MR = 6;
+	dim_t NR = 64;
+
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	if ( n0 < NR )
+	{
+		dim_t n0_rem = n0 % 16;
+
+		// Split into multiple smaller fringe kernels, so as to maximize
+		// vectorization. Any n0 < NR(64) can be expressed as n0 = 48 + n`
+		// or n0 = 32 + n` or n0 = 16 + n`, where n` < 16.
+		dim_t n0_48 = n0 / 48;
+		dim_t n0_32 = n0 / 32;
+		dim_t n0_16 = n0 / 16;
+
+		// KC when not multiple of 2 will have padding to make it multiple of
+		// 2 in packed buffer. Also the k0 cannot be passed as the updated
+		// value since A matrix is not packed and requires original k0.
+		dim_t k0_updated = k0;
+		k0_updated += (k0_updated & 0x1);
+
+		if ( n0_48 == 1 )
+		{
+			lpgemm_rowvar_bf16bf16f32of32_6x48
+				(
+				 m0, k0,
+				 a, rs_a, cs_a, ps_a,
+				 b, ( ( rs_b / 4 ) * 3 ), cs_b,
+				 c, rs_c,
+				 alpha, beta,
+			     post_ops_list, post_ops_attr
+				);
+
+			b = b + ( 48 * k0_updated ); // k0x48 packed contiguosly.
+			c = c + 48;
+			post_ops_attr.post_op_c_j += 48;
+		}
+
+		else if ( n0_32 == 1 )
+		{
+			lpgemm_rowvar_bf16bf16f32of32_6x32
+				(
+				 m0, k0,
+				 a, rs_a, cs_a, ps_a,
+				 b, ( ( rs_b / 4 ) * 2 ), cs_b,
+				 c, rs_c,
+				 alpha, beta,
+			     post_ops_list, post_ops_attr
+				);
+
+			b = b + ( 32 * k0_updated ); // k0x32 packed contiguosly.
+			c = c + 32;
+			post_ops_attr.post_op_c_j += 32;
+		}
+
+		else if ( n0_16 == 1 )
+		{
+			lpgemm_rowvar_bf16bf16f32of32_6x16
+				(
+				 m0, k0,
+				 a, rs_a, cs_a, ps_a,
+				 b, ( ( rs_b / 4 ) * 1 ), cs_b,
+				 c, rs_c,
+				 alpha, beta,
+			     post_ops_list, post_ops_attr
+				);
+
+			b = b + ( 16 * k0_updated ); // k0x16 packed contiguosly.
+			c = c + 16;
+			post_ops_attr.post_op_c_j += 16;
+		}
+
+		if ( n0_rem > 0 )
+		{
+			lpgemm_rowvar_bf16bf16f32of32_6xlt16
+				(
+				 m0, k0,
+				 a, rs_a, cs_a, ps_a,
+				 b, ( ( rs_b / 4 ) * 1 ), cs_b,
+				 c, rs_c,
+				 alpha, beta, n0_rem,
+			     post_ops_list, post_ops_attr
+				);
+
+			// No leftover fringe after this point.
+		}
+		return;
+	}
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+	__m512bh b3;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+	__m512bh a_bf16_1;
+
+	dim_t value;
+
+	if(k_full_pieces > 40)
+	{
+		value = 40;
+	}
+	else
+	{
+		value = 0;
+	}
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512 c_float_0p0 = _mm512_setzero_ps();
+		__m512 c_float_0p1 = _mm512_setzero_ps();
+		__m512 c_float_0p2 = _mm512_setzero_ps();
+		__m512 c_float_0p3 = _mm512_setzero_ps();
+
+		__m512 c_float_1p0 = _mm512_setzero_ps();
+		__m512 c_float_1p1 = _mm512_setzero_ps();
+		__m512 c_float_1p2 = _mm512_setzero_ps();
+		__m512 c_float_1p3 = _mm512_setzero_ps();
+
+		__m512 c_float_2p0 = _mm512_setzero_ps();
+		__m512 c_float_2p1 = _mm512_setzero_ps();
+		__m512 c_float_2p2 = _mm512_setzero_ps();
+		__m512 c_float_2p3 = _mm512_setzero_ps();
+
+		__m512 c_float_3p0 = _mm512_setzero_ps();
+		__m512 c_float_3p1 = _mm512_setzero_ps();
+		__m512 c_float_3p2 = _mm512_setzero_ps();
+		__m512 c_float_3p3 = _mm512_setzero_ps();
+
+		__m512 c_float_4p0 = _mm512_setzero_ps();
+		__m512 c_float_4p1 = _mm512_setzero_ps();
+		__m512 c_float_4p2 = _mm512_setzero_ps();
+		__m512 c_float_4p3 = _mm512_setzero_ps();
+
+		__m512 c_float_5p0 = _mm512_setzero_ps();
+		__m512 c_float_5p1 = _mm512_setzero_ps();
+		__m512 c_float_5p2 = _mm512_setzero_ps();
+		__m512 c_float_5p3 = _mm512_setzero_ps();
+
+		for ( dim_t kr = 0; kr < k_full_pieces - value; kr += 1 )
+		{
+			// The instructions are arranged in a mixed way to reduce data
+			// chain dependencies.
+
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+2]
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )(a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+			b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_1 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+			c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+			c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_1 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+			c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
+			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
+			c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_1 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+			c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-63] = a[5,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_1, b0 );
+			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_1, b1 );
+			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_1, b2 );
+			c_float_5p3 = _mm512_dpbf16_ps( c_float_5p3, a_bf16_1, b3 );
+		}
+
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (2 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (3 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (2 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (3 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (2 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (3 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (2 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (3 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (2 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (3 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (2 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (3 * 16), _MM_HINT_T1);
+
+		for (dim_t kr = k_full_pieces - value; kr < k_full_pieces; kr += 1)
+		{
+			// The instructions are arranged in a mixed way to reduce data
+			// chain dependencies.
+
+			b0 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 0));
+
+			// Broadcast a[0,kr:kr+2]
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 0) + (cs_a * kr)));
+
+			b1 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 1));
+			b2 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 2));
+			b3 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 3));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_0p0 = _mm512_dpbf16_ps(c_float_0p0, a_bf16_0, b0);
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_1 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 1) + (cs_a * kr)));
+
+			c_float_0p1 = _mm512_dpbf16_ps(c_float_0p1, a_bf16_0, b1);
+			c_float_0p2 = _mm512_dpbf16_ps(c_float_0p2, a_bf16_0, b2);
+			c_float_0p3 = _mm512_dpbf16_ps(c_float_0p3, a_bf16_0, b3);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_1p0 = _mm512_dpbf16_ps(c_float_1p0, a_bf16_1, b0);
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 2) + (cs_a * kr)));
+
+			c_float_1p1 = _mm512_dpbf16_ps(c_float_1p1, a_bf16_1, b1);
+			c_float_1p2 = _mm512_dpbf16_ps(c_float_1p2, a_bf16_1, b2);
+			c_float_1p3 = _mm512_dpbf16_ps(c_float_1p3, a_bf16_1, b3);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_2p0 = _mm512_dpbf16_ps(c_float_2p0, a_bf16_0, b0);
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_1 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 3) + (cs_a * kr)));
+
+			c_float_2p1 = _mm512_dpbf16_ps(c_float_2p1, a_bf16_0, b1);
+			c_float_2p2 = _mm512_dpbf16_ps(c_float_2p2, a_bf16_0, b2);
+			c_float_2p3 = _mm512_dpbf16_ps(c_float_2p3, a_bf16_0, b3);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_3p0 = _mm512_dpbf16_ps(c_float_3p0, a_bf16_1, b0);
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 4) + (cs_a * kr)));
+
+			c_float_3p1 = _mm512_dpbf16_ps(c_float_3p1, a_bf16_1, b1);
+			c_float_3p2 = _mm512_dpbf16_ps(c_float_3p2, a_bf16_1, b2);
+			c_float_3p3 = _mm512_dpbf16_ps(c_float_3p3, a_bf16_1, b3);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_4p0 = _mm512_dpbf16_ps(c_float_4p0, a_bf16_0, b0);
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_1 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 5) + (cs_a * kr)));
+
+			c_float_4p1 = _mm512_dpbf16_ps(c_float_4p1, a_bf16_0, b1);
+			c_float_4p2 = _mm512_dpbf16_ps(c_float_4p2, a_bf16_0, b2);
+			c_float_4p3 = _mm512_dpbf16_ps(c_float_4p3, a_bf16_0, b3);
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-63] = a[5,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_5p0 = _mm512_dpbf16_ps(c_float_5p0, a_bf16_1, b0);
+			c_float_5p1 = _mm512_dpbf16_ps(c_float_5p1, a_bf16_1, b1);
+			c_float_5p2 = _mm512_dpbf16_ps(c_float_5p2, a_bf16_1, b2);
+			c_float_5p3 = _mm512_dpbf16_ps(c_float_5p3, a_bf16_1, b3);
+		}
+
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+			b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+			a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+			c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+			c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+			a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+			c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
+			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
+			c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces )));
+			a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+			c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-63] = a[5,kr:kr+2]*b[kr:kr+2,0-63]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_1, b0 );
+			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_1, b1 );
+			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_1, b2 );
+			c_float_5p3 = _mm512_dpbf16_ps( c_float_5p3, a_bf16_1, b3 );
+		}
+
+		// Load alpha and beta
+		__m512 selector1 = _mm512_set1_ps ( alpha );
+		__m512 selector2 = _mm512_set1_ps ( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+			c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+			c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+			c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
+
+			c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+			c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+			c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+			c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
+
+			c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+			c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+			c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+			c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
+
+			c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+			c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+			c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
+			c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 );
+
+			c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+			c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
+			c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
+			c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 );
+
+			c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
+			c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 );
+			c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 );
+			c_float_5p3 = _mm512_mul_ps( selector1, c_float_5p3 );
+
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			// For the downscaled api (C-bf16), the output C matrix values
+			// needs to be upscaled to float to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2)
+
+				// c[0,48-63]
+				BF16_F32_BETA_OP(c_float_0p3,ir,0,3,selector1,selector2)
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				BF16_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				BF16_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2)
+
+				// c[1,48-63]
+				BF16_F32_BETA_OP(c_float_1p3,ir,1,3,selector1,selector2)
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				BF16_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				BF16_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2)
+
+				// c[2,48-63]
+				BF16_F32_BETA_OP(c_float_2p3,ir,2,3,selector1,selector2)
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				BF16_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				BF16_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2)
+
+				// c[3,48-63]
+				BF16_F32_BETA_OP(c_float_3p3,ir,3,3,selector1,selector2)
+
+				// c[4,0-15]
+				BF16_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2)
+
+				// c[4,16-31]
+				BF16_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2)
+
+				// c[4,32-47]
+				BF16_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2)
+
+				// c[4,48-63]
+				BF16_F32_BETA_OP(c_float_4p3,ir,4,3,selector1,selector2)
+
+				// c[5,0-15]
+				BF16_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2)
+
+				// c[5,16-31]
+				BF16_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2)
+
+				// c[5,32-47]
+				BF16_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2)
+
+				// c[5,48-63]
+				BF16_F32_BETA_OP(c_float_5p3,ir,5,3,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2)
+
+				// c[0,48-63]
+				F32_F32_BETA_OP(c_float_0p3,ir,0,3,selector1,selector2)
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				F32_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				F32_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2)
+
+				// c[1,48-63]
+				F32_F32_BETA_OP(c_float_1p3,ir,1,3,selector1,selector2)
+
+				// c[2,0-15]
+				F32_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				F32_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				F32_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2)
+
+				// c[2,48-63]
+				F32_F32_BETA_OP(c_float_2p3,ir,2,3,selector1,selector2)
+
+				// c[3,0-15]
+				F32_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				F32_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				F32_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2)
+
+				// c[3,48-63]
+				F32_F32_BETA_OP(c_float_3p3,ir,3,3,selector1,selector2)
+
+				// c[4,0-15]
+				F32_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2)
+
+				// c[4,16-31]
+				F32_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2)
+
+				// c[4,32-47]
+				F32_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2)
+
+				// c[4,48-63]
+				F32_F32_BETA_OP(c_float_4p3,ir,4,3,selector1,selector2)
+
+				// c[5,0-15]
+				F32_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2)
+
+				// c[5,16-31]
+				F32_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2)
+
+				// c[5,32-47]
+				F32_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2)
+
+				// c[5,48-63]
+				F32_F32_BETA_OP(c_float_5p3,ir,5,3,selector1,selector2)
+
+			}
+
+		}
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x64:
+		{
+			__m512 selector3;
+			__m512 selector4;
+
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				selector1 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+				selector2 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+				selector3 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+				selector4 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[0, 16-31]
+				c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+				// c[0,32-47]
+				c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+				// c[0,48-63]
+				c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[1, 16-31]
+				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+				// c[1,32-47]
+				c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+				// c[1,48-63]
+				c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[2, 16-31]
+				c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+				// c[2,32-47]
+				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+				// c[2,48-63]
+				c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+				// c[3, 16-31]
+				c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+				// c[3,32-47]
+				c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
+
+				// c[3,48-63]
+				c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+				// c[4, 16-31]
+				c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
+
+				// c[4,32-47]
+				c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
+
+				// c[4,48-63]
+				c_float_4p3 = _mm512_add_ps( selector4, c_float_4p3 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
+
+				// c[5, 16-31]
+				c_float_5p1 = _mm512_add_ps( selector2, c_float_5p1 );
+
+				// c[5,32-47]
+				c_float_5p2 = _mm512_add_ps( selector3, c_float_5p2 );
+
+				// c[5,48-63]
+				c_float_5p3 = _mm512_add_ps( selector4, c_float_5p3 );
+			}
+			else
+			{
+				// If original output was columns major, then by the time
+				// kernel sees it, the matrix would be accessed as if it were
+				// transposed. Due to this the bias array will be accessed by
+				// the ic index, and each bias element corresponds to an
+				// entire row of the transposed output array, instead of an
+				// entire column.
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 1 ) );
+				selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 2 ) );
+				selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 3 ) );
+				__m512 selector5 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 4 ) );
+				__m512 selector6 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 5 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[0, 16-31]
+				c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+				// c[0,32-47]
+				c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+				// c[0,48-63]
+				c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[1, 16-31]
+				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+				// c[1,32-47]
+				c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+				// c[1,48-63]
+				c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[2, 16-31]
+				c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+				// c[2,32-47]
+				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+				// c[2,48-63]
+				c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+				// c[3, 16-31]
+				c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+				// c[3,32-47]
+				c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
+
+				// c[3,48-63]
+				c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+				// c[4, 16-31]
+				c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
+
+				// c[4,32-47]
+				c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
+
+				// c[4,48-63]
+				c_float_4p3 = _mm512_add_ps( selector5, c_float_4p3 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
+
+				// c[5, 16-31]
+				c_float_5p1 = _mm512_add_ps( selector6, c_float_5p1 );
+
+				// c[5,32-47]
+				c_float_5p2 = _mm512_add_ps( selector6, c_float_5p2 );
+
+				// c[5,48-63]
+				c_float_5p3 = _mm512_add_ps( selector6, c_float_5p3 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x64:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[1,16-31]
+			c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[2,16-31]
+			c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			// c[3,16-31]
+			c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
+
+			// c[3,48-63]
+			c_float_3p3 = _mm512_max_ps( selector1, c_float_3p3 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+			// c[4,16-31]
+			c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
+
+			// c[4,32-47]
+			c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
+
+			// c[4,48-63]
+			c_float_4p3 = _mm512_max_ps( selector1, c_float_4p3 );
+
+			// c[5,0-15]
+			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
+
+			// c[5,16-31]
+			c_float_5p1 = _mm512_max_ps( selector1, c_float_5p1 );
+
+			// c[5,32-47]
+			c_float_5p2 = _mm512_max_ps( selector1, c_float_5p2 );
+
+			// c[5,48-63]
+			c_float_5p3 = _mm512_max_ps( selector1, c_float_5p3 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x64:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+			// c[0, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+			// c[0, 48-63]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p3)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+			// c[1, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+			// c[1, 48-63]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p3)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+			// c[2, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+			// c[2, 48-63]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p3)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+			// c[3, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p2)
+
+			// c[3, 48-63]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p3)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p1)
+
+			// c[4, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p2)
+
+			// c[4, 48-63]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p3)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p1)
+
+			// c[5, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p2)
+
+			// c[5, 48-63]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p3)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x64:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 32-47]
+			GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 48-63]
+			GELU_TANH_F32_AVX512(c_float_0p3, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 32-47]
+			GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 48-63]
+			GELU_TANH_F32_AVX512(c_float_1p3, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 32-47]
+			GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 48-63]
+			GELU_TANH_F32_AVX512(c_float_2p3, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 32-47]
+			GELU_TANH_F32_AVX512(c_float_3p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 48-63]
+			GELU_TANH_F32_AVX512(c_float_3p3, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_F32_AVX512(c_float_4p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 32-47]
+			GELU_TANH_F32_AVX512(c_float_4p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 48-63]
+			GELU_TANH_F32_AVX512(c_float_4p3, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_F32_AVX512(c_float_5p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_F32_AVX512(c_float_5p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 32-47]
+			GELU_TANH_F32_AVX512(c_float_5p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 48-63]
+			GELU_TANH_F32_AVX512(c_float_5p3, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x64:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+			// c[0, 32-47]
+			GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+			// c[0, 48-63]
+			GELU_ERF_F32_AVX512(c_float_0p3, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+			// c[1, 32-47]
+			GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+			// c[1, 48-63]
+			GELU_ERF_F32_AVX512(c_float_1p3, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+			// c[2, 32-47]
+			GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+			// c[2, 48-63]
+			GELU_ERF_F32_AVX512(c_float_2p3, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+			// c[3, 32-47]
+			GELU_ERF_F32_AVX512(c_float_3p2, r, x, x_erf)
+
+			// c[3, 48-63]
+			GELU_ERF_F32_AVX512(c_float_3p3, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_F32_AVX512(c_float_4p1, r, x, x_erf)
+
+			// c[4, 32-47]
+			GELU_ERF_F32_AVX512(c_float_4p2, r, x, x_erf)
+
+			// c[4, 48-63]
+			GELU_ERF_F32_AVX512(c_float_4p3, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_F32_AVX512(c_float_5p0, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_F32_AVX512(c_float_5p1, r, x, x_erf)
+
+			// c[5, 32-47]
+			GELU_ERF_F32_AVX512(c_float_5p2, r, x, x_erf)
+
+			// c[5, 48-63]
+			GELU_ERF_F32_AVX512(c_float_5p3, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_CLIP_6x64:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_F32_AVX512(c_float_0p1, min, max)
+
+			// c[0, 32-47]
+			CLIP_F32_AVX512(c_float_0p2, min, max)
+
+			// c[0, 48-63]
+			CLIP_F32_AVX512(c_float_0p3, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_F32_AVX512(c_float_1p1, min, max)
+
+			// c[1, 32-47]
+			CLIP_F32_AVX512(c_float_1p2, min, max)
+
+			// c[1, 48-63]
+			CLIP_F32_AVX512(c_float_1p3, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_F32_AVX512(c_float_2p1, min, max)
+
+			// c[2, 32-47]
+			CLIP_F32_AVX512(c_float_2p2, min, max)
+
+			// c[2, 48-63]
+			CLIP_F32_AVX512(c_float_2p3, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_F32_AVX512(c_float_3p1, min, max)
+
+			// c[3, 32-47]
+			CLIP_F32_AVX512(c_float_3p2, min, max)
+
+			// c[3, 48-63]
+			CLIP_F32_AVX512(c_float_3p3, min, max)
+
+			// c[4, 0-15]
+			CLIP_F32_AVX512(c_float_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_F32_AVX512(c_float_4p1, min, max)
+
+			// c[4, 32-47]
+			CLIP_F32_AVX512(c_float_4p2, min, max)
+
+			// c[4, 48-63]
+			CLIP_F32_AVX512(c_float_4p3, min, max)
+
+			// c[5, 0-15]
+			CLIP_F32_AVX512(c_float_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_F32_AVX512(c_float_5p1, min, max)
+
+			// c[5, 32-47]
+			CLIP_F32_AVX512(c_float_5p2, min, max)
+
+			// c[5, 48-63]
+			CLIP_F32_AVX512(c_float_5p3, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x64:
+{
+	        // c[0, 0-15]
+			MULRND_F32(c_float_0p0,0,0);
+
+			// c[0, 16-31]
+			MULRND_F32(c_float_0p1,0,1);
+
+			// c[0, 32-47]
+			MULRND_F32(c_float_0p2,0,2);
+
+			// c[0, 48-63]
+			MULRND_F32(c_float_0p3,0,3);
+
+			// c[1, 0-15]
+			MULRND_F32(c_float_1p0,1,0);
+
+			// c[1, 16-31]
+			MULRND_F32(c_float_1p1,1,1);
+
+			// c[1, 32-47]
+			MULRND_F32(c_float_1p2,1,2);
+
+			// c[1, 48-63]
+			MULRND_F32(c_float_1p3,1,3);
+
+			// c[2, 0-15]
+			MULRND_F32(c_float_2p0,2,0);
+
+			// c[2, 16-31]
+			MULRND_F32(c_float_2p1,2,1);
+
+			// c[2, 32-47]
+			MULRND_F32(c_float_2p2,2,2);
+
+			// c[2, 48-63]
+			MULRND_F32(c_float_2p3,2,3);
+
+			// c[3, 0-15]
+			MULRND_F32(c_float_3p0,3,0);
+
+			// c[3, 16-31]
+			MULRND_F32(c_float_3p1,3,1);
+
+			// c[3, 32-47]
+			MULRND_F32(c_float_3p2,3,2);
+
+			// c[3, 48-63]
+			MULRND_F32(c_float_3p3,3,3);
+
+			// c[4, 0-15]
+			MULRND_F32(c_float_4p0,4,0);
+
+			// c[4, 16-31]
+			MULRND_F32(c_float_4p1,4,1);
+
+			// c[4, 32-47]
+			MULRND_F32(c_float_4p2,4,2);
+
+			// c[4, 48-63]
+			MULRND_F32(c_float_4p3,4,3);
+
+			// c[5, 0-15]
+			MULRND_F32(c_float_5p0,5,0);
+
+			// c[5, 16-31]
+			MULRND_F32(c_float_5p1,5,1);
+
+			// c[5, 32-47]
+			MULRND_F32(c_float_5p2,5,2);
+
+			// c[5, 48-63]
+			MULRND_F32(c_float_5p3,5,3);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+}
+
+POST_OPS_6x64_DISABLE:
+		;
+
+		// Case where the output C matrix is bf16 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			__m512i selector_a = _mm512_setzero_epi32();
+			__m512i selector_b = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+			// Store the results in downscaled type (bf16 instead of float).
+
+			// c[0, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+			// c[0, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+			// c[0, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+			// c[0, 48-63]
+			CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3);
+
+			// c[1, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+			// c[1, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+			// c[1, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+			// c[1, 48-63]
+			CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3);
+
+			// c[2, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+			// c[2, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+			// c[2, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+			// c[2, 48-63]
+			CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3);
+
+			// c[3, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+			// c[3, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+			// c[3, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2);
+
+			// c[3, 48-63]
+			CVT_STORE_F32_BF16_MASK(c_float_3p3,3,3);
+
+			// c[4, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+			// c[4, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1);
+
+			// c[4, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2);
+
+			// c[4, 48-63]
+			CVT_STORE_F32_BF16_MASK(c_float_4p3,4,3);
+
+			// c[5, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0);
+
+			// c[5, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_5p1,5,1);
+
+			// c[5, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_5p2,5,2);
+
+			// c[5, 48-63]
+			CVT_STORE_F32_BF16_MASK(c_float_5p3,5,3);
+
+		}
+
+		// Case where the output C matrix is float 
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 );
+
+			// c[0,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 );
+
+			// c[0,48-63]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_float_0p3 );
+
+			// c[1,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 );
+
+			// c[1,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 );
+
+			// c[1,48-63]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_float_1p3 );
+
+			// c[2,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 );
+
+			// c[2,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 );
+
+			// c[2,48-63]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_float_2p3 );
+
+			// c[3,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 );
+
+			// c[3,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 );
+
+			// c[3,48-63]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_float_3p3 );
+
+			// c[4,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 );
+
+			// c[4,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 );
+
+			// c[4,48-63]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_float_4p3 );
+
+			// c[5,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 );
+
+			// c[5,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 );
+
+			// c[5,48-63]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_float_5p3 );
+
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			// In cases where A matrix is packed cs_a is set to 12, since the
+			// next column in a given row is accessed after 2*6 elements, where
+			// 6 is MR and 2 elements are broadcasted each time from A (bf16).
+			// In fringe case, where m < MR, the next column will be after m'*2
+			// elements, and subsequently following adjustment of cs_a is
+			// required before calling m fringe kernels.
+			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_bf16bf16f32of32_5x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_bf16bf16f32of32_4x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_bf16bf16f32of32_3x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_bf16bf16f32of32_2x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_bf16bf16f32of32_1x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+#endif //LPGEMM_BF16_NOT_SUPPORTED
+#endif
diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h
new file mode 100644
index 0000000000..f3875647eb
--- /dev/null
+++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_f32_kern_macros.h
@@ -0,0 +1,122 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_F32_KERN_MACROS_H
+#define LPGEMM_F32_KERN_MACROS_H
+
+#include "../gelu_avx512.h"
+#include "../math_utils_avx512.h"
+
+// Flag the BF16 kernels as unsupported when building with GCC older than 10:
+// such compilers may support other AVX-512 features but not the BF16 ISA.
+#if defined( BLIS_GCC ) && ( __GNUC__ < 10 )
+#define LPGEMM_BF16_NOT_SUPPORTED
+#endif
+
+/* ReLU scale (Parametric ReLU): f(x) = x when x > 0, f(x) = a*x when x <= 0. Uses relu_cmp_mask, selector1 (compare value) and selector2 (scale a) from the expansion site. */
+#define RELU_SCALE_OP_F32_AVX512(reg) \
+	/* Generate mask of the elements that are <= selector1.*/ \
+	relu_cmp_mask = _mm512_cmple_ps_mask( reg, selector1 ); \
+ \
+	/* Multiply only the masked elements by selector2; others keep their value.*/ \
+	reg = _mm512_mask_mul_ps( reg, relu_cmp_mask, reg, selector2 ); \
+
+// F32 multiply-add macro: reg += scratch2 * scratch1 (separate mul and add, not a fused FMA; overwrites scratch1).
+#define F32_BETA_FMA(reg,scratch1,scratch2) \
+	scratch1 = _mm512_mul_ps( scratch2, scratch1 ); \
+	reg = _mm512_add_ps( scratch1, reg ); \
+
+// Beta scale macro, scratch2=beta. Loads 16 floats from c (row m_ir + m_ind, column offset n_ind*16; c and rs_c come from the expansion site) into scratch1, then reg += beta * scratch1.
+#define F32_F32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = \
+	_mm512_loadu_ps \
+	( \
+	  ( c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ) \
+	); \
+	F32_BETA_FMA(reg,scratch1,scratch2) \
+
+// Downscale beta scale macro, scratch2=beta. Loads 16 bf16 values from post_ops_attr.buf_downscale (row post_op_c_i + m_ind), widens each to 32 bits and shifts left by 16 to form f32, then reg += beta * scratch1. The m_ir argument is not used by the body.
+#define BF16_F32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = \
+	  (__m512)( _mm512_sllv_epi32( _mm512_cvtepi16_epi32( (__m256i)_mm256_loadu_epi16 \
+	  ( \
+	    ( ( bfloat16* )post_ops_attr.buf_downscale + \
+	    ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	    post_ops_attr.post_op_c_j + ( n_ind * 16 ) )\
+	  ) ), _mm512_set1_epi32 (16) ) );\
+	F32_BETA_FMA(reg,scratch1,scratch2) \
+
+// Default n < 16 mask load beta macro, scratch2=beta. Same addressing as F32_F32_BETA_OP, but only the lanes set in lmask are loaded from c (other lanes are zeroed) before reg += beta * scratch1.
+#define F32_F32_BETA_OP_NLT16F_MASK(lmask,reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = _mm512_maskz_loadu_ps( lmask, c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \
+	F32_BETA_FMA(reg,scratch1,scratch2) \
+
+// Downscale n < 16 mask load beta macro, scratch2=beta. Loads only the lmask-selected bf16 lanes from post_ops_attr.buf_downscale (other lanes zeroed), shifts them into f32 position, then reg += beta * scratch1. Takes no m_ir argument.
+#define BF16_F32_BETA_OP_NLT16F_MASK(lmask,reg,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 =  \
+	  (__m512)( _mm512_sllv_epi32( _mm512_cvtepi16_epi32( (__m256i)_mm256_maskz_loadu_epi16 \
+	  ( \
+	    lmask, \
+	    ( bfloat16* )post_ops_attr.buf_downscale + \
+	    ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	    post_ops_attr.post_op_c_j + ( n_ind * 16 ) \
+	  ) ), _mm512_set1_epi32 (16) ) );\
+	F32_BETA_FMA(reg,scratch1,scratch2) \
+
+#define MULRND_F32(reg,m_ind,n_ind) /* Empty body: expands to nothing, so each use is a no-op. */ \
+
+#define CVT_STORE_F32_BF16_MASK(reg,m_ind,n_ind) /* Convert reg (f32) to bf16 and store it to buf_downscale under mask_all1 (taken from the expansion site); no trailing ';' is emitted. */ \
+	_mm256_mask_storeu_epi16 \
+	( \
+	  ( bfloat16* )post_ops_attr.buf_downscale + \
+	  ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	  post_ops_attr.post_op_c_j + ( n_ind * 16 ), \
+	  mask_all1, (__m256i) _mm512_cvtneps_pbh( reg ) \
+	) \
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ). Forwards all arguments unchanged to GELU_TANH_F32_AVX512_DEF (presumably from ../gelu_avx512.h - confirm). */
+#define GELU_TANH_F32_AVX512(reg, r, r2, x, z, dn, x_tanh, q) \
+\
+	GELU_TANH_F32_AVX512_DEF(reg, r, r2, x, z, dn, x_tanh, q); \
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 )). Forwards all arguments unchanged to GELU_ERF_F32_AVX512_DEF (presumably from ../gelu_avx512.h - confirm). */
+#define GELU_ERF_F32_AVX512(reg, r, x, x_erf) \
+\
+	GELU_ERF_F32_AVX512_DEF(reg, r, x, x_erf); \
+
+#define CLIP_F32_AVX512(reg, min, max) /* Clamp reg per lane: first raise to at least min, then cap at max. */ \
+\
+	reg = _mm512_min_ps( _mm512_max_ps( reg, min ), max ); \
+
+#endif // LPGEMM_F32_KERN_MACROS_H
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c
similarity index 53%
rename from addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c
rename to kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c
index e4418b2a0e..e3e3bc2869 100644
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c
+++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_m_fringe_bf16_amd512vnni.c
@@ -1,2592 +1,3559 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <immintrin.h>
-#include <string.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_f32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 5x64 bf16 kernel
-LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x64_DISABLE,
-						  &&POST_OPS_BIAS_5x64,
-						  &&POST_OPS_RELU_5x64,
-						  &&POST_OPS_RELU_SCALE_5x64,
-						  &&POST_OPS_DOWNSCALE_5x64
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-	__m512bh b3;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-	__m512bh a_bf16_1;
-	
-	// Registers to use for accumulating C.
-    __m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-	__m512 c_float_0p3 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-	__m512 c_float_1p3 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	__m512 c_float_2p2 = _mm512_setzero_ps();
-	__m512 c_float_2p3 = _mm512_setzero_ps();
-		
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-	__m512 c_float_3p1 = _mm512_setzero_ps();
-	__m512 c_float_3p2 = _mm512_setzero_ps();
-	__m512 c_float_3p3 = _mm512_setzero_ps();
-
-	__m512 c_float_4p0 = _mm512_setzero_ps();
-	__m512 c_float_4p1 = _mm512_setzero_ps();
-	__m512 c_float_4p2 = _mm512_setzero_ps();
-	__m512 c_float_4p3 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		c_float_2p1 =  _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 =  _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		c_float_2p3 =  _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_3p0 =  _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
-		
-		// Broadcast a[4,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
-		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-		c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
-	}	
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+4].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
-		
-		// Broadcast a[4,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
-		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-		c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-	c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-	c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-	c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
-	c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 );
-	
-	c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-	c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
-	c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
-	c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-		// c[2,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 );
-
-		// c[3,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p3 = _mm512_add_ps( selector1, c_float_3p3 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-		// c[4,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 );
-
-		// c[4,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 );
-
-		// c[4,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p3 = _mm512_add_ps( selector1, c_float_4p3 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x64:
-	{
-		__m512 selector3;
-		__m512 selector4;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-			selector4 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 3 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
-
-			// c[3,48-63]
-			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[4, 16-31]
-			c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
-
-			// c[4,32-47]
-			c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
-
-			// c[4,48-63]
-			c_float_4p3 = _mm512_add_ps( selector4, c_float_4p3 );
-		}
-		else
-		{
-			// If original output was columns major, then by the time
-			// kernel sees it, the matrix would be accessed as if it were
-			// transposed. Due to this the bias array will be accessed by
-			// the ic index, and each bias element corresponds to an
-			// entire row of the transposed output array, instead of an
-			// entire column.
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 1 ) );
-			selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 2 ) );
-			selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 3 ) );
-			__m512 selector5 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 4 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
-
-			// c[3,48-63]
-			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-			// c[4, 16-31]
-			c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
-
-			// c[4,32-47]
-			c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
-
-			// c[4,48-63]
-			c_float_4p3 = _mm512_add_ps( selector5, c_float_4p3 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x64:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-		// c[2,48-63]
-		c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
-
-		// c[3,48-63]
-		c_float_3p3 = _mm512_max_ps( selector1, c_float_3p3 );
-
-		// c[4,0-15]
-		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-		// c[4,16-31]
-		c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
-
-		// c[4,32-47]
-		c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
-
-		// c[4,48-63]
-		c_float_4p3 = _mm512_max_ps( selector1, c_float_4p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x64:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-		// c[2, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p3)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
-
-		// c[3, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p3)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-		// c[4, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p1)
-
-		// c[4, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p2)
-
-		// c[4, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x64:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[0, 48-63]
-		CVT_F32_BF16(c_float_0p3,0,3);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[1, 48-63]
-		CVT_F32_BF16(c_float_1p3,1,3);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		// c[2, 48-63]
-		CVT_F32_BF16(c_float_2p3,2,3);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[3, 32-47]
-		CVT_F32_BF16(c_float_3p2,3,2);
-
-		// c[3, 48-63]
-		CVT_F32_BF16(c_float_3p3,3,3);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		// c[4, 16-31]
-		CVT_F32_BF16(c_float_4p1,4,1);
-
-		// c[4, 32-47]
-		CVT_F32_BF16(c_float_4p2,4,2);
-
-		// c[4, 48-63]
-		CVT_F32_BF16(c_float_4p3,4,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_5x64_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
-
-	// c[2,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
-
-	// c[3,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 );
-
-	// c[4,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
-
-	// c[4,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 );
-
-	// c[4,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 );
-
-	// c[4,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 3*16 ), c_float_4p3 );
-}
-
-// 4x64 bf16 kernel
-LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x64_DISABLE,
-						  &&POST_OPS_BIAS_4x64,
-						  &&POST_OPS_RELU_4x64,
-						  &&POST_OPS_RELU_SCALE_4x64,
-						  &&POST_OPS_DOWNSCALE_4x64
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-	__m512bh b3;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-	__m512bh a_bf16_1;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-	__m512 c_float_0p3 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-	__m512 c_float_1p3 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	__m512 c_float_2p2 = _mm512_setzero_ps();
-	__m512 c_float_2p3 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-	__m512 c_float_3p1 = _mm512_setzero_ps();
-	__m512 c_float_3p2 = _mm512_setzero_ps();
-	__m512 c_float_3p3 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-		
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
-		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
-	}
-	
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-		
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
-		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
-	}
-     
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-	c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-	c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-	c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
-	c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-		// c[2,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 );
-
-		// c[3,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p3 = _mm512_add_ps( selector1, c_float_3p3 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x64:
-	{
-		__m512 selector3;
-		__m512 selector4;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-			selector4 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 3 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
-
-			// c[3,48-63]
-			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
-		}
-		else
-		{
-			// If original output was columns major, then by the time
-			// kernel sees it, the matrix would be accessed as if it were
-			// transposed. Due to this the bias array will be accessed by
-			// the ic index, and each bias element corresponds to an
-			// entire row of the transposed output array, instead of an
-			// entire column.
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 1 ) );
-			selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 2 ) );
-			selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 3 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
-
-			// c[3,48-63]
-			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x64:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-		// c[2,48-63]
-		c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
-
-		// c[3,48-63]
-		c_float_3p3 = _mm512_max_ps( selector1, c_float_3p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x64:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-		// c[2, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p3)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
-
-		// c[3, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x64:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[0, 48-63]
-		CVT_F32_BF16(c_float_0p3,0,3);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[1, 48-63]
-		CVT_F32_BF16(c_float_1p3,1,3);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		// c[2, 48-63]
-		CVT_F32_BF16(c_float_2p3,2,3);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[3, 32-47]
-		CVT_F32_BF16(c_float_3p2,3,2);
-
-		// c[3, 48-63]
-		CVT_F32_BF16(c_float_3p3,3,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-
-POST_OPS_4x64_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
-
-	// c[2,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
-
-	// c[3,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 );
-}
-
-// 3x64 bf16 kernel
-LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x64_DISABLE,
-						  &&POST_OPS_BIAS_3x64,
-						  &&POST_OPS_RELU_3x64,
-						  &&POST_OPS_RELU_SCALE_3x64,
-						  &&POST_OPS_DOWNSCALE_3x64
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-	__m512bh b3;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-	__m512bh a_bf16_1;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-	__m512 c_float_0p3 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-	__m512 c_float_1p3 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	__m512 c_float_2p2 = _mm512_setzero_ps();
-	__m512 c_float_2p3 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a *  0 ) + ( cs_a * kr ) ) );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		
-		// Broadcast a[2,kr:kr+4].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-	}
-	
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
-	}
-    
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-    // Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-	c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-	c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-		// c[2,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p3 = _mm512_add_ps( selector1, c_float_2p3 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x64:
-	{
-		__m512 selector3;
-		__m512 selector4;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-			selector4 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 3 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
-		}
-		else
-		{
-			// If original output was columns major, then by the time
-			// kernel sees it, the matrix would be accessed as if it were
-			// transposed. Due to this the bias array will be accessed by
-			// the ic index, and each bias element corresponds to an
-			// entire row of the transposed output array, instead of an
-			// entire column.
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 1 ) );
-			selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 2 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[2,48-63]
-			c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
-		}
-		
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x64:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-		// c[2,48-63]
-		c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x64:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-		// c[2, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x64:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[0, 48-63]
-		CVT_F32_BF16(c_float_0p3,0,3);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[1, 48-63]
-		CVT_F32_BF16(c_float_1p3,1,3);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		// c[2, 48-63]
-		CVT_F32_BF16(c_float_2p3,2,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_3x64_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
-
-	// c[2,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 );
-}
-
-// 2x64 bf16 kernel
-LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x64_DISABLE,
-						  &&POST_OPS_BIAS_2x64,
-						  &&POST_OPS_RELU_2x64,
-						  &&POST_OPS_RELU_SCALE_2x64,
-						  &&POST_OPS_DOWNSCALE_2x64
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-	__m512bh b3;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-	__m512bh a_bf16_1;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-	__m512 c_float_0p3 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-	__m512 c_float_1p3 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-		
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-	}
-	
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		a_bf16_1 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
-		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
-	}
-	
-
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-	c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p3 = _mm512_add_ps( selector1, c_float_1p3 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x64:
-	{
-		__m512 selector3;
-		__m512 selector4;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-			selector4 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 3 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
-		}
-		else
-		{
-			// If original output was columns major, then by the time
-			// kernel sees it, the matrix would be accessed as if it were
-			// transposed. Due to this the bias array will be accessed by
-			// the ic index, and each bias element corresponds to an
-			// entire row of the transposed output array, instead of an
-			// entire column.
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 1 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[1,48-63]
-			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x64:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[1,48-63]
-		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x64:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[1, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-	POST_OPS_DOWNSCALE_2x64:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[0, 48-63]
-		CVT_F32_BF16(c_float_0p3,0,3);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[1, 48-63]
-		CVT_F32_BF16(c_float_1p3,1,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_2x64_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[1,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
-}
-
-// 1x64 bf16 kernel: accumulates one row x 64 columns of C in four 512-bit float registers (16 floats each), applies alpha/beta scaling and the post-op chain, then stores the row to c.
-LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64)
-{
-	static void* post_ops_labels[] = // Label addresses (GCC "labels as values"); presumably indexed by the POST_OP_LABEL_* jump macros - confirm against their definition.
-						{
-						  &&POST_OPS_1x64_DISABLE,
-						  &&POST_OPS_BIAS_1x64,
-						  &&POST_OPS_RELU_1x64,
-						  &&POST_OPS_RELU_SCALE_1x64,
-						  &&POST_OPS_DOWNSCALE_1x64
-						};
-	dim_t k_full_pieces = k0 / 2; // k is consumed two elements per loop iteration
-	dim_t k_partial_pieces = k0 % 2; // leftover k elements handled after the main loop
-
-	int32_t a_kfringe_buf = 0; // zero-initialised staging word for the k remainder; bytes not overwritten by memcpy stay 0
-
-	// Registers to use for accumulating C: c_float_0pN holds c[0, 16*N .. 16*N+15].
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-	__m512 c_float_0p3 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		__m512bh b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast the 32-bit word at a + cs_a*kr (presumably two adjacent bf16 values) to every 32-bit lane.
-		__m512bh a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		__m512bh b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		__m512bh b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-		__m512bh b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-        // c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-	}
-	
-	// Handle k remainder (taken only when k0 % 2 is non-zero).
-	if ( k_partial_pieces > 0 )
-	{
-		__m512bh b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Copy only k_partial_pieces * sizeof( bfloat16 ) bytes of row 0 into the zeroed staging word, then broadcast that word.
-		memcpy
-		(
-		  &a_kfringe_buf,
-		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-		  ( k_partial_pieces * sizeof( bfloat16 ) )
-		);
-		__m512bh a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		__m512bh b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		__m512bh b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-		__m512bh b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
-
-		// Perform column direction mat-mul with k = 2.
-        // c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
-	}
-	
-	// Load alpha and beta (selector1 / selector2 are reused as scratch registers further down).
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-	c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
-	
-	// Scale C by beta: when beta != 0, add beta * (current contents of c) to the accumulators.
-	if ( beta != 0)
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 3*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-	}
-	// Post Ops: each labelled section below applies one post-op and ends with a jump macro (defined elsewhere).
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x64:
-	{ // Bias add: op_args2 is read as a character selecting the layout, op_args1 as the float bias array.
-		__m512 selector3;
-		__m512 selector4;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-			selector4 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 3 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
-		}
-		else
-		{
-			// If original output was columns major, then by the time
-			// kernel sees it, the matrix would be accessed as if it were
-			// transposed. Due to this the bias array will be accessed by
-			// the ic index, and each bias element corresponds to an
-			// entire row of the transposed output array, instead of an
-			// entire column.
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_i + 0 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[0,48-63]
-			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x64:
-	{ // ReLU: element-wise max of each accumulator against a zero vector.
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[0,48-63]
-		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x64:
-	{ // selector1 (zero), selector2 (scale read from op_args2) and relu_cmp_mask are not passed explicitly; presumably RELU_SCALE_OP_F32_AVX512 references them by name - confirm.
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[0, 48-63]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x64:
-	{ // NOTE(review): CVT_F32_BF16 is defined elsewhere; presumably handles the f32 -> bf16 downscale for (register, row, 16-column block) - confirm.
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[0, 48-63]
-		CVT_F32_BF16(c_float_0p3,0,3);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_1x64_DISABLE:
-	;
-    
-	// Store the accumulated results to row 0 of c as four unaligned 16-float vectors.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[0,48-63]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
-}
-#endif
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_f32_kern_macros.h"
+
+#ifndef LPGEMM_BF16_NOT_SUPPORTED
+// 5x64 bf16 kernel
+LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5x64_DISABLE,
+						  &&POST_OPS_BIAS_5x64,
+						  &&POST_OPS_RELU_5x64,
+						  &&POST_OPS_RELU_SCALE_5x64,
+						  &&POST_OPS_GELU_TANH_5x64,
+						  &&POST_OPS_GELU_ERF_5x64,
+						  &&POST_OPS_CLIP_5x64,
+						  &&POST_OPS_DOWNSCALE_5x64
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+	__m512bh b3;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+	__m512bh a_bf16_1;
+
+	// Registers to use for accumulating C.
+    __m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+	__m512 c_float_0p3 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+	__m512 c_float_1p3 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+	__m512 c_float_2p2 = _mm512_setzero_ps();
+	__m512 c_float_2p3 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+	__m512 c_float_3p1 = _mm512_setzero_ps();
+	__m512 c_float_3p2 = _mm512_setzero_ps();
+	__m512 c_float_3p3 = _mm512_setzero_ps();
+
+	__m512 c_float_4p0 = _mm512_setzero_ps();
+	__m512 c_float_4p1 = _mm512_setzero_ps();
+	__m512 c_float_4p2 = _mm512_setzero_ps();
+	__m512 c_float_4p3 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		c_float_2p1 =  _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 =  _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+		c_float_2p3 =  _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_3p0 =  _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
+		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+		c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
+		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-63] = a[4,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+		c_float_4p3 = _mm512_dpbf16_ps( c_float_4p3, a_bf16_0, b3 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+		c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+		c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+		c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+		c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
+		c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 );
+
+		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+		c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
+		c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
+		c_float_4p3 = _mm512_mul_ps( selector1, c_float_4p3 );
+
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		// For the downscaled api (C-bf16), the output C matrix values
+		// needs to be upscaled to float to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+			// c[2,32-47]
+			BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+			// c[2,48-63]
+			BF16_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2)
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+			// c[3,32-47]
+			BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+
+			// c[3,48-63]
+			BF16_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2)
+
+			// c[4,0-15]
+			BF16_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2)
+
+			// c[4,16-31]
+			BF16_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2)
+
+			// c[4,32-47]
+			BF16_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2)
+
+			// c[4,48-63]
+			BF16_F32_BETA_OP(c_float_4p3,0,4,3,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+
+			// c[2,0-15]
+			F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+			// c[2,32-47]
+			F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+			// c[2,48-63]
+			F32_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2)
+
+			// c[3,0-15]
+			F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+			// c[3,32-47]
+			F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+
+			// c[3,48-63]
+			F32_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2)
+
+			// c[4,0-15]
+			F32_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2)
+
+			// c[4,16-31]
+			F32_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2)
+
+			// c[4,32-47]
+			F32_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2)
+
+			// c[4,48-63]
+			F32_F32_BETA_OP(c_float_4p3,0,4,3,selector1,selector2)
+		}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x64:
+	{
+		__m512 selector3;
+		__m512 selector4;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			selector4 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
+
+			// c[3,48-63]
+			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+			// c[4, 16-31]
+			c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
+
+			// c[4,32-47]
+			c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
+
+			// c[4,48-63]
+			c_float_4p3 = _mm512_add_ps( selector4, c_float_4p3 );
+		}
+		else
+		{
+			// If original output was columns major, then by the time
+			// kernel sees it, the matrix would be accessed as if it were
+			// transposed. Due to this the bias array will be accessed by
+			// the ic index, and each bias element corresponds to an
+			// entire row of the transposed output array, instead of an
+			// entire column.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 1 ) );
+			selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 2 ) );
+			selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 3 ) );
+			__m512 selector5 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 4 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
+
+			// c[3,48-63]
+			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+			// c[4, 16-31]
+			c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
+
+			// c[4,32-47]
+			c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
+
+			// c[4,48-63]
+			c_float_4p3 = _mm512_add_ps( selector5, c_float_4p3 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x64:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[0,48-63]
+		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[1,48-63]
+		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[2,32-47]
+		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+		// c[2,48-63]
+		c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[3,16-31]
+		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+		// c[3,32-47]
+		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
+
+		// c[3,48-63]
+		c_float_3p3 = _mm512_max_ps( selector1, c_float_3p3 );
+
+		// c[4,0-15]
+		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+		// c[4,16-31]
+		c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
+
+		// c[4,32-47]
+		c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
+
+		// c[4,48-63]
+		c_float_4p3 = _mm512_max_ps( selector1, c_float_4p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x64:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p3)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
+
+		// c[3, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p3)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p1)
+
+		// c[4, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p2)
+
+		// c[4, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x64:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_F32_AVX512(c_float_0p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_F32_AVX512(c_float_1p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_F32_AVX512(c_float_2p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_F32_AVX512(c_float_3p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 48-63]
+		GELU_TANH_F32_AVX512(c_float_3p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_F32_AVX512(c_float_4p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 32-47]
+		GELU_TANH_F32_AVX512(c_float_4p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 48-63]
+		GELU_TANH_F32_AVX512(c_float_4p3, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x64:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_F32_AVX512(c_float_0p3, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_F32_AVX512(c_float_1p3, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_F32_AVX512(c_float_2p3, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_F32_AVX512(c_float_3p2, r, x, x_erf)
+
+		// c[3, 48-63]
+		GELU_ERF_F32_AVX512(c_float_3p3, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_F32_AVX512(c_float_4p1, r, x, x_erf)
+
+		// c[4, 32-47]
+		GELU_ERF_F32_AVX512(c_float_4p2, r, x, x_erf)
+
+		// c[4, 48-63]
+		GELU_ERF_F32_AVX512(c_float_4p3, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x64:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_F32_AVX512(c_float_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_F32_AVX512(c_float_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_F32_AVX512(c_float_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_F32_AVX512(c_float_2p3, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_F32_AVX512(c_float_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_F32_AVX512(c_float_3p2, min, max)
+
+		// c[3, 48-63]
+		CLIP_F32_AVX512(c_float_3p3, min, max)
+
+		// c[4, 0-15]
+		CLIP_F32_AVX512(c_float_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_F32_AVX512(c_float_4p1, min, max)
+
+		// c[4, 32-47]
+		CLIP_F32_AVX512(c_float_4p2, min, max)
+
+		// c[4, 48-63]
+		CLIP_F32_AVX512(c_float_4p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x64:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		MULRND_F32(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		MULRND_F32(c_float_1p3,1,3);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		// c[2, 48-63]
+		MULRND_F32(c_float_2p3,2,3);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		MULRND_F32(c_float_3p2,3,2);
+
+		// c[3, 48-63]
+		MULRND_F32(c_float_3p3,3,3);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		MULRND_F32(c_float_4p1,4,1);
+
+		// c[4, 32-47]
+		MULRND_F32(c_float_4p2,4,2);
+
+		// c[4, 48-63]
+		MULRND_F32(c_float_4p3,4,3);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x64_DISABLE:
+	;
+	// Case where the output C matrix is bf16 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3);
+
+		// c[2, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+		// c[2, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3);
+
+		// c[3, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2);
+
+		// c[3, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_3p3,3,3);
+
+		// c[4, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1);
+
+		// c[4, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2);
+
+		// c[4, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_4p3,4,3);
+
+	}
+
+	// Case where the output C matrix is float 
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
+
+		// c[3,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 );
+
+		// c[4,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 );
+
+		// c[4,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 );
+
+		// c[4,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 3*16 ), c_float_4p3 );
+
+	}
+}
+
+// 4x64 bf16 kernel: accumulates a 4-row x 64-column float tile of C from bf16 A and B, applies alpha/beta scaling and the post-op sections, then stores the tile.
+LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x64)
+{
+	static void* post_ops_labels[] = // Label-address table (GCC labels-as-values); one entry per post-op section label defined below.
+						{
+						  &&POST_OPS_4x64_DISABLE,
+						  &&POST_OPS_BIAS_4x64,
+						  &&POST_OPS_RELU_4x64,
+						  &&POST_OPS_RELU_SCALE_4x64,
+						  &&POST_OPS_GELU_TANH_4x64,
+						  &&POST_OPS_GELU_ERF_4x64,
+						  &&POST_OPS_CLIP_4x64,
+						  &&POST_OPS_DOWNSCALE_4x64
+						};
+	dim_t k_full_pieces = k0 / 2; // Number of complete bf16 pairs along k (each dpbf16 consumes one pair).
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, i.e. one unpaired element remains.
+
+	int16_t a_kfringe_buf = 0; // Holds the single leftover A element of a row in the k-remainder path.
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+	__m512bh b3;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+	__m512bh a_bf16_1;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+	__m512 c_float_0p3 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+	__m512 c_float_1p3 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+	__m512 c_float_2p2 = _mm512_setzero_ps();
+	__m512 c_float_2p3 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+	__m512 c_float_3p1 = _mm512_setzero_ps();
+	__m512 c_float_3p2 = _mm512_setzero_ps();
+	__m512 c_float_3p3 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-63] = a[3,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
+		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
+	}
+
+	// Handle k remainder (k0 odd): one unpaired A element per row is left after the pair loop.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single leftover element of row 0 (offset cs_a * k_full_pieces) to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// NOTE(review): set1_epi16 puts the A element in both halves of each bf16 pair, so dpbf16 adds a*(b_lo + b_hi);
+		// this presumably relies on the odd half of each packed B pair being zero in the k fringe - confirm against B packing.
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover element of row 1 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Accumulate the k-fringe contribution for row 1.
+		// c[1,0-63] += a[1,fringe]*b[fringe,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+		// Broadcast the single leftover element of row 2 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+		// Accumulate the k-fringe contribution for row 2.
+		// c[2,0-63] += a[2,fringe]*b[fringe,0-63]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover element of row 3 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+
+		// Accumulate the k-fringe contribution for row 3.
+		// c[3,0-63] += a[3,fringe]*b[fringe,0-63]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_1, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_1, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_1, b2 );
+		c_float_3p3 = _mm512_dpbf16_ps( c_float_3p3, a_bf16_1, b3 );
+	}
+
+	// Broadcast alpha and beta to all 16 float lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+		c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+		c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+		c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+		c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
+		c_float_3p3 = _mm512_mul_ps( selector1, c_float_3p3 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		// For the downscaled api (C-bf16), the output C matrix values
+		// needs to be upscaled to float to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+			// c[2,32-47]
+			BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+			// c[2,48-63]
+			BF16_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2)
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+			// c[3,32-47]
+			BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+
+			// c[3,48-63]
+			BF16_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2)
+
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+
+			// c[2,0-15]
+			F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+			// c[2,32-47]
+			F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+			// c[2,48-63]
+			F32_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2)
+
+			// c[3,0-15]
+			F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+			// c[3,16-31]
+			F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+			// c[3,32-47]
+			F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+
+			// c[3,48-63]
+			F32_F32_BETA_OP(c_float_3p3,0,3,3,selector1,selector2)
+		}
+	}
+	// Post Ops: presumably the POST_OP_LABEL_* macros transfer control between the labelled sections below - confirm against the macro definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x64:
+	{
+		__m512 selector3;
+		__m512 selector4;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			selector4 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
+
+			// c[3,48-63]
+			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
+		}
+		else
+		{
+			// If original output was column major, then by the time
+			// kernel sees it, the matrix would be accessed as if it were
+			// transposed. Due to this the bias array will be accessed by
+			// the ic index, and each bias element corresponds to an
+			// entire row of the transposed output array, instead of an
+			// entire column.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 1 ) );
+			selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 2 ) );
+			selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 3 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
+
+			// c[3,48-63]
+			c_float_3p3 = _mm512_add_ps( selector4, c_float_3p3 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x64:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[0,48-63]
+		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[1,48-63]
+		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[2,32-47]
+		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+		// c[2,48-63]
+		c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[3,16-31]
+		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+		// c[3,32-47]
+		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
+
+		// c[3,48-63]
+		c_float_3p3 = _mm512_max_ps( selector1, c_float_3p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x64:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p3)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
+
+		// c[3, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x64:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_F32_AVX512(c_float_0p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_F32_AVX512(c_float_1p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_F32_AVX512(c_float_2p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_F32_AVX512(c_float_3p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 48-63]
+		GELU_TANH_F32_AVX512(c_float_3p3, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x64:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_F32_AVX512(c_float_0p3, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_F32_AVX512(c_float_1p3, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_F32_AVX512(c_float_2p3, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_F32_AVX512(c_float_3p2, r, x, x_erf)
+
+		// c[3, 48-63]
+		GELU_ERF_F32_AVX512(c_float_3p3, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x64:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_F32_AVX512(c_float_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_F32_AVX512(c_float_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_F32_AVX512(c_float_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_F32_AVX512(c_float_2p3, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_F32_AVX512(c_float_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_F32_AVX512(c_float_3p2, min, max)
+
+		// c[3, 48-63]
+		CLIP_F32_AVX512(c_float_3p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x64:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		MULRND_F32(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		MULRND_F32(c_float_1p3,1,3);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		// c[2, 48-63]
+		MULRND_F32(c_float_2p3,2,3);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		MULRND_F32(c_float_3p2,3,2);
+
+		// c[3, 48-63]
+		MULRND_F32(c_float_3p3,3,3);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_4x64_DISABLE:
+	;
+
+	// Case where the output C matrix is bf16 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in each of the 16 lanes).
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3);
+
+		// c[2, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+		// c[2, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3);
+
+		// c[3, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2);
+
+		// c[3, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_3p3,3,3);
+	}
+	
+	// Case where the output C matrix is float
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
+
+		// c[3,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 3*16 ), c_float_3p3 );
+	}
+}
+
+// 3x64 bf16 kernel: m-fringe kernel that accumulates a 3-row x 64-column f32 tile of C from bf16 A and B.
+LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x64_DISABLE,
+						  &&POST_OPS_BIAS_3x64,
+						  &&POST_OPS_RELU_3x64,
+						  &&POST_OPS_RELU_SCALE_3x64,
+						  &&POST_OPS_GELU_TANH_3x64,
+						  &&POST_OPS_GELU_ERF_3x64,
+						  &&POST_OPS_CLIP_3x64,
+						  &&POST_OPS_DOWNSCALE_3x64
+						};
+	dim_t k_full_pieces = k0 / 2; // Number of complete pairs of k elements.
+	dim_t k_partial_pieces = k0 % 2; // Non-zero when one trailing k element is left over.
+
+	int16_t a_kfringe_buf = 0; // Scratch for the single 16-bit A element used in the k remainder.
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+	__m512bh b3;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+	__m512bh a_bf16_1;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+	__m512 c_float_0p3 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+	__m512 c_float_1p3 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+	__m512 c_float_2p2 = _mm512_setzero_ps();
+	__m512 c_float_2p3 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a *  0 ) + ( cs_a * kr ) ) );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+		// Broadcast a[2,kr:kr+2]. a_bf16_0 is reused here; row 0 no longer needs it in this iteration.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-63] = a[2,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+	}
+
+	// Handle k remainder: a single trailing k element per row of A.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the trailing k element of row 0 of A to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul for the k remainder. NOTE(review): the A element is replicated in both halves of each 32-bit lane, so this presumably relies on B holding zero in the unused half - confirm against the B packing code.
+		// c[0,0-63] += a[0,k_last]*b[k_last,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the trailing k element of row 1 of A to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul for the k remainder.
+		// c[1,0-63] += a[1,k_last]*b[k_last,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+
+		// Broadcast the trailing k element of row 2 of A to every 16-bit lane (reuses a_bf16_0).
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+
+		// Perform column direction mat-mul for the k remainder.
+		// c[2,0-63] += a[2,k_last]*b[k_last,0-63]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+		c_float_2p3 = _mm512_dpbf16_ps( c_float_2p3, a_bf16_0, b3 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+		c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+		c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+		c_float_2p3 = _mm512_mul_ps( selector1, c_float_2p3 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		// For the downscaled api (C-bf16), the output C matrix values
+		// need to be upscaled to float before they are used for beta scaling.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+			// c[2,32-47]
+			BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+			// c[2,48-63]
+			BF16_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+
+			// c[2,0-15]
+			F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+			// c[2,16-31]
+			F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+			// c[2,32-47]
+			F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+			// c[2,48-63]
+			F32_F32_BETA_OP(c_float_2p3,0,2,3,selector1,selector2)
+		}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x64:
+	{
+		__m512 selector3;
+		__m512 selector4;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			selector4 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_add_ps( selector4, c_float_2p3 );
+		}
+		else
+		{
+			// If the original output was column major, then by the time the
+			// kernel sees it, the matrix would be accessed as if it were
+			// transposed. Due to this the bias array will be accessed by
+			// the ic index, and each bias element corresponds to an
+			// entire row of the transposed output array, instead of an
+			// entire column.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 1 ) );
+			selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 2 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[2,48-63]
+			c_float_2p3 = _mm512_add_ps( selector3, c_float_2p3 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x64:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[0,48-63]
+		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[1,48-63]
+		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[2,32-47]
+		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+		// c[2,48-63]
+		c_float_2p3 = _mm512_max_ps( selector1, c_float_2p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x64:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x64:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_F32_AVX512(c_float_0p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_F32_AVX512(c_float_1p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_F32_AVX512(c_float_2p3, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x64:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_F32_AVX512(c_float_0p3, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_F32_AVX512(c_float_1p3, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_F32_AVX512(c_float_2p3, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x64:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_F32_AVX512(c_float_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_F32_AVX512(c_float_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_F32_AVX512(c_float_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_F32_AVX512(c_float_2p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x64:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		MULRND_F32(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		MULRND_F32(c_float_1p3,1,3);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		// c[2, 48-63]
+		MULRND_F32(c_float_2p3,2,3);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x64_DISABLE:
+	;
+	// Case where the output C matrix is bf16 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every one of the 16 lanes).
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3);
+
+		// c[2, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+		// c[2, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_2p3,2,3);
+	}
+	
+	// Case where the output C matrix is float
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 3*16 ), c_float_2p3 );
+	}
+}
+
+// 2x64 bf16 kernel
+LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x64_DISABLE,
+						  &&POST_OPS_BIAS_2x64,
+						  &&POST_OPS_RELU_2x64,
+						  &&POST_OPS_RELU_SCALE_2x64,
+						  &&POST_OPS_GELU_TANH_2x64,
+						  &&POST_OPS_GELU_ERF_2x64,
+						  &&POST_OPS_CLIP_2x64,
+						  &&POST_OPS_DOWNSCALE_2x64
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+	__m512bh b3;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+	__m512bh a_bf16_1;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+	__m512 c_float_0p3 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+	__m512 c_float_1p3 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_1 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+	}
+
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_1 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-63] = a[1,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_1, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_1, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_1, b2 );
+		c_float_1p3 = _mm512_dpbf16_ps( c_float_1p3, a_bf16_1, b3 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+		c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+		c_float_1p3 = _mm512_mul_ps( selector1, c_float_1p3 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		// For the downscaled api (C-bf16), the output C matrix values
+		// needs to be upscaled to float to be used for beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			BF16_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+			// c[1,16-31]
+			F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+			// c[1,32-47]
+			F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+			// c[1,48-63]
+			F32_F32_BETA_OP(c_float_1p3,0,1,3,selector1,selector2)
+		}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x64:
+	{
+		__m512 selector3;
+		__m512 selector4;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			selector4 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector4, c_float_1p3 );
+		}
+		else
+		{
+			// If original output was columns major, then by the time
+			// kernel sees it, the matrix would be accessed as if it were
+			// transposed. Due to this the bias array will be accessed by
+			// the ic index, and each bias element corresponds to an
+			// entire row of the transposed output array, instead of an
+			// entire column.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 1 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[1,48-63]
+			c_float_1p3 = _mm512_add_ps( selector2, c_float_1p3 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x64:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[0,48-63]
+		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[1,48-63]
+		c_float_1p3 = _mm512_max_ps( selector1, c_float_1p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x64:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x64:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_F32_AVX512(c_float_0p3, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_F32_AVX512(c_float_1p3, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x64:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_F32_AVX512(c_float_0p3, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_F32_AVX512(c_float_1p3, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x64:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_F32_AVX512(c_float_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_F32_AVX512(c_float_1p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x64:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		MULRND_F32(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		MULRND_F32(c_float_1p3,1,3);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x64_DISABLE:
+	;
+
+	// Case where the output C matrix is bf16 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[1, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_1p3,1,3);
+	}
+	
+	// Case where the output C matrix is float
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 3*16 ), c_float_1p3 );
+	}
+}
+
+// 1x64 bf16 kernel: one row of C, 64 columns held in four 16-float zmm accumulators.
+LPGEMM_M_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x64)
+{
+	static void* post_ops_labels[] = // Addresses of the post-op labels below (GNU labels-as-values); presumably indexed by the jump macros - confirm.
+						{
+						  &&POST_OPS_1x64_DISABLE,
+						  &&POST_OPS_BIAS_1x64,
+						  &&POST_OPS_RELU_1x64,
+						  &&POST_OPS_RELU_SCALE_1x64,
+						  &&POST_OPS_GELU_TANH_1x64,
+						  &&POST_OPS_GELU_ERF_1x64,
+						  &&POST_OPS_CLIP_1x64,
+						  &&POST_OPS_DOWNSCALE_1x64
+						};
+	dim_t k_full_pieces = k0 / 2; // Number of complete bf16 pairs along k.
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, else 0.
+
+	int16_t a_kfringe_buf = 0; // Receives the last A element when k0 is odd.
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+	__m512 c_float_0p3 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512bh b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2]: a single 32-bit load replicated into every 32-bit lane.
+		__m512bh a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		__m512bh b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		__m512bh b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		__m512bh b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-63] = a[0,kr:kr+2]*b[kr:kr+2,0-63]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+	}
+
+	// Handle k remainder (k0 odd).
+	if ( k_partial_pieces > 0 )
+	{
+		__m512bh b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the one remaining element of row 0 of A into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		__m512bh a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		__m512bh b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		__m512bh b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		__m512bh b3 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Same dot-product step as the main loop, for the last k element.
+		// NOTE(review): a is replicated in both halves of each pair, so this is only exact if the second row of the packed B pair is zero - confirm.
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+		c_float_0p3 = _mm512_dpbf16_ps( c_float_0p3, a_bf16_0, b3 );
+	}
+
+	// Broadcast alpha and beta; selector1/selector2 are reassigned as scratch by the post-ops below.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale the accumulated product by alpha (skipped when alpha == 1).
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+		c_float_0p3 = _mm512_mul_ps( selector1, c_float_0p3 );
+	}
+
+	// Scale C by beta; nothing is done when beta == 0.
+	if ( beta != 0)
+	{
+		// For the downscaled api (C-bf16), the output C matrix values
+		// need to be upscaled to float before being used for the beta scale.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0,0-15]
+			BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			BF16_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+			// c[0,32-47]
+			F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+			// c[0,48-63]
+			F32_F32_BETA_OP(c_float_0p3,0,0,3,selector1,selector2)
+		}
+	}
+	// Post Ops: start from the head of the post-op list; the jump macro (defined elsewhere) presumably dispatches to one of the labels below - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x64: // Adds bias values read from op_args1 to the accumulators.
+	{
+		__m512 selector3;
+		__m512 selector4;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			selector4 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector4, c_float_0p3 );
+		}
+		else
+		{
+			// If the original output was column major, then by the time the
+			// kernel sees it, the matrix would be accessed as if it were
+			// transposed. Due to this the bias array will be accessed by
+			// the ic index, and each bias element corresponds to an
+			// entire row of the transposed output array, instead of an
+			// entire column.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_i + 0 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[0,48-63]
+			c_float_0p3 = _mm512_add_ps( selector1, c_float_0p3 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x64: // Element-wise max with zero.
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[0,48-63]
+		c_float_0p3 = _mm512_max_ps( selector1, c_float_0p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x64: // Scale factor is read from op_args2; the per-element operation lives in RELU_SCALE_OP_F32_AVX512.
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // Not referenced directly here; presumably used by the macro together with selector1/selector2 - confirm.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x64: // Applies GELU_TANH_F32_AVX512 to each accumulator.
+	{
+		__m512 dn, z, x, r2, r, x_tanh; // Scratch registers handed to the macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_F32_AVX512(c_float_0p3, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x64: // Applies GELU_ERF_F32_AVX512 to each accumulator.
+	{
+		__m512 x, r, x_erf; // Scratch registers handed to the macro.
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_F32_AVX512(c_float_0p3, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x64: // Applies CLIP_F32_AVX512 with bounds read from op_args2 and op_args3.
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 ); // Broadcast of the float at op_args2.
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 ); // Broadcast of the float at op_args3.
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_F32_AVX512(c_float_0p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x64: // Applies MULRND_F32 to each accumulator (macro defined elsewhere).
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		MULRND_F32(c_float_0p3,0,3);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x64_DISABLE:
+	; // Empty statement after the label; execution continues with the store below.
+	// Case where the output C matrix is bf16 (downscaled) and this is the
+	// final write for a given block within C.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+		
+		// Store the results in downscaled type (bf16 instead of float); mask_all1 is presumably consumed inside the macro - confirm.
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[0, 48-63]
+		CVT_STORE_F32_BF16_MASK(c_float_0p3,0,3);
+	}
+	
+	// Case where the output C matrix is float
+	else
+	{
+		// Store the accumulated results with unaligned 16-float stores.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 3*16 ), c_float_0p3 );
+	}
+}
+#endif
+#endif
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c
similarity index 54%
rename from addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c
rename to kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c
index 6e985f154f..01b59d38cf 100644
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c
+++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_mn_fringe_bf16_amd512vnni.c
@@ -1,5843 +1,8099 @@
-/*
-
-   BLIS
-   An object-based framework for developing high-performance BLAS-like
-   libraries.
-
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <immintrin.h>
-#include <string.h>
-
-#include "blis.h"
-#include "lpgemm_kernels.h"
-#include "lpgemm_f32_kern_macros.h"
-
-#ifdef BLIS_KERNELS_ZEN4
-// 5xlt16 bf16 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5xLT16_DISABLE,
-						  &&POST_OPS_BIAS_5xLT16,
-						  &&POST_OPS_RELU_5xLT16,
-						  &&POST_OPS_RELU_SCALE_5xLT16,
-						  &&POST_OPS_DOWNSCALE_5xLT16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// For corner cases.
-	float buf0[16];
-	float buf1[16];
-	float buf2[16];
-	float buf3[16];
-	float buf4[16];
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-		
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-
-	__m512 c_float_4p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			
-		// Broadcast a[4,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.        
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-			
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-			
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-			
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-			
-		// Broadcast a[4,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-	}
-        
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	
-	c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-		
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf3, ( c + ( rs_c * 3 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf4, ( c + ( rs_c * 4 ) ), ( n0_rem * sizeof( float ) ) );
-		
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( buf0 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( buf1 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( buf2 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( buf3 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_ps( buf4 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-	}
-	// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5xLT16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-				selector1 = _mm512_loadu_ps( buf0 );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				__m512 selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-				__m512 selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 3 ) );
-				__m512 selector5 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 4 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-				// c[4,0-15]
-				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_5xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_5xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			// c[4, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_5xLT16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16_LT16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16_LT16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16_LT16(c_float_2p0,2,0);
-
-		// c[3, 0-15]
-		CVT_F32_BF16_LT16(c_float_3p0,3,0);
-
-		// c[4, 0-15]
-		CVT_F32_BF16_LT16(c_float_4p0,4,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_5xLT16_DISABLE:
-		;
-		
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( buf0, c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( buf1, c_float_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( buf2, c_float_2p0 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( buf3, c_float_3p0 );
-
-	// c[4,0-15]
-	_mm512_storeu_ps( buf4, c_float_4p0 );
-
-	// Memcpy partial parts.
-	// c[0,0-15]
-	memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) );
-
-	// c[1,0-15]
-	memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) );
-
-	// c[2,0-15]
-	memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) );
-
-	// c[3,0-15]
-	memcpy( c + ( rs_c * 3 ) + ( 0*16 ), buf3, ( n0_rem * sizeof( float ) ) );
-
-	// c[4,0-15]
-	memcpy( c + ( rs_c * 4 ) + ( 0*16 ), buf4, ( n0_rem * sizeof( float ) ) );
-	
-}
-
-// 4xlt16 bf16 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4xLT16_DISABLE,
-						  &&POST_OPS_BIAS_4xLT16,
-						  &&POST_OPS_RELU_4xLT16,
-						  &&POST_OPS_RELU_SCALE_4xLT16,
-						  &&POST_OPS_DOWNSCALE_4xLT16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// For corner cases.
-	float buf0[16];
-	float buf1[16];
-	float buf2[16];
-	float buf3[16];
-
-	
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.
-	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-		
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf3, ( c + ( rs_c * 3 ) ), ( n0_rem * sizeof( float ) ) );
-		
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( buf0 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( buf1 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( buf2 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( buf3 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-	}
-	// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4xLT16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-				selector1 = _mm512_loadu_ps( buf0 );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				__m512 selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-				__m512 selector4 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 3 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-				// c[3,0-15]
-				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_4xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_4xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			// c[3, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_4xLT16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16_LT16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16_LT16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16_LT16(c_float_2p0,2,0);
-
-		// c[3, 0-15]
-		CVT_F32_BF16_LT16(c_float_3p0,3,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_4xLT16_DISABLE:
-		;
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( buf0, c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( buf1, c_float_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( buf2, c_float_2p0 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( buf3, c_float_3p0 );
-
-	// Memcpy partial parts.
-	// c[0,0-15]
-	memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) );
-
-	// c[1,0-15]
-	memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) );
-
-	// c[2,0-15]
-	memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) );
-
-	// c[3,0-15]
-	memcpy( c + ( rs_c * 3 ) + ( 0*16 ), buf3, ( n0_rem * sizeof( float ) ) );
-	
-}
-
-// 3xlt16 bf16 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3xLT16_DISABLE,
-						  &&POST_OPS_BIAS_3xLT16,
-						  &&POST_OPS_RELU_3xLT16,
-						  &&POST_OPS_RELU_SCALE_3xLT16,
-						  &&POST_OPS_DOWNSCALE_3xLT16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// For corner cases.
-	float buf0[16];
-	float buf1[16];
-	float buf2[16];
-
-	
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.		
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf2, ( c + ( rs_c * 2 ) ), ( n0_rem * sizeof( float) ) );
-		
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( buf0 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( buf1 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( buf2 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-	}
-	// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3xLT16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-				selector1 = _mm512_loadu_ps( buf0 );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-				__m512 selector3 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 2 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-				// c[2,0-15]
-				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_3xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_3xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			// c[2, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_3xLT16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16_LT16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16_LT16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16_LT16(c_float_2p0,2,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_3xLT16_DISABLE:
-		;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( buf0, c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( buf1, c_float_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( buf2, c_float_2p0 );
-
-	// Memcpy partial parts.
-	// c[0,0-15]
-	memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) );
-
-	// c[1,0-15]
-	memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) );
-
-	// c[2,0-15]
-	memcpy( c + ( rs_c * 2 ) + ( 0*16 ), buf2, ( n0_rem * sizeof( float ) ) );
-	
-}
-
-// 2xlt16 bf16 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2xLT16_DISABLE,
-						  &&POST_OPS_BIAS_2xLT16,
-						  &&POST_OPS_RELU_2xLT16,
-						  &&POST_OPS_RELU_SCALE_2xLT16,
-						  &&POST_OPS_DOWNSCALE_2xLT16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// For corner cases.
-	float buf0[16];
-	float buf1[16];
-
-	
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.		
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) );
-		memcpy( buf1, ( c + ( rs_c * 1 ) ), ( n0_rem * sizeof( float) ) );
-		
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( buf0 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( buf1 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-	}
-	// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2xLT16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-				selector1 = _mm512_loadu_ps( buf0 );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-				selector2 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 1 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-				// c[1,0-15]
-				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_2xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_2xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			// c[1, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_2xLT16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16_LT16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16_LT16(c_float_1p0,1,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_2xLT16_DISABLE:
-		;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( buf0, c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( buf1, c_float_1p0 );
-
-	// Memcpy partial parts.
-	// c[0,0-15]
-	memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) );
-
-	// c[1,0-15]
-	memcpy( c + ( rs_c * 1 ) + ( 0*16 ), buf1, ( n0_rem * sizeof( float ) ) );
-	
-}
-
-// 1xlt16 bf16 fringe kernel
-LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1xlt16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1xLT16_DISABLE,
-						  &&POST_OPS_BIAS_1xLT16,
-						  &&POST_OPS_RELU_1xLT16,
-						  &&POST_OPS_RELU_SCALE_1xLT16,
-						  &&POST_OPS_DOWNSCALE_1xLT16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// For corner cases.
-	float buf0[16];
-
-	
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.		
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		memcpy( buf0, ( c + ( rs_c * 0 ) ), ( n0_rem * sizeof( float ) ) );
-		
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( buf0 );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-	}
-	// Post Ops
-		lpgemm_post_op* post_ops_list_temp = post_ops_list;
-		POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1xLT16:
-		{
-			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-			{
-				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
-						post_op_c_j ), ( n0_rem * sizeof( float ) ) );
-				selector1 = _mm512_loadu_ps( buf0 );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-			}
-			else
-			{
-				selector1 =
-					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-								+ post_op_c_i + 0 ) );
-
-				// c[0,0-15]
-				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-			}
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_1xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_RELU_SCALE_1xLT16:
-		{
-			selector1 = _mm512_setzero_ps();
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-			__mmask16 relu_cmp_mask;
-
-			// c[0, 0-15]
-			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-		}
-POST_OPS_DOWNSCALE_1xLT16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16_LT16(c_float_0p0,0,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}		
-POST_OPS_1xLT16_DISABLE:
-		;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( buf0, c_float_0p0 );
-
-	// Memcpy partial parts.
-	// c[0,0-15]
-	memcpy( c + ( rs_c * 0 ) + ( 0*16 ), buf0, ( n0_rem * sizeof( float ) ) );
-	
-}
-
-// 5x16 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x16_DISABLE,
-						  &&POST_OPS_BIAS_5x16,
-						  &&POST_OPS_RELU_5x16,
-						  &&POST_OPS_RELU_SCALE_5x16,
-						  &&POST_OPS_DOWNSCALE_5x16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-
-	__m512 c_float_4p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		
-		// Broadcast a[4,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		
-		// Broadcast a[4,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	
-	c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x16:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			__m512 selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-			__m512 selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 3 ) );
-			__m512 selector5 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 4 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x16:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[4,0-15]
-		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x16:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_5x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[4,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
-}
-
-// 4x16 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x16_DISABLE,
-						  &&POST_OPS_BIAS_4x16,
-						  &&POST_OPS_RELU_4x16,
-						  &&POST_OPS_RELU_SCALE_4x16,
-						  &&POST_OPS_DOWNSCALE_4x16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x16:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			__m512 selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-			__m512 selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 3 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x16:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x16:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_4x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-}
-
-// 3x16 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x16_DISABLE,
-						  &&POST_OPS_BIAS_3x16,
-						  &&POST_OPS_RELU_3x16,
-						  &&POST_OPS_RELU_SCALE_3x16,
-						  &&POST_OPS_DOWNSCALE_3x16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x16:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			__m512 selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x16:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x16:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_3x16_DISABLE:
-	;
-
-
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-}
-
-// 2x16 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x16_DISABLE,
-						  &&POST_OPS_BIAS_2x16,
-						  &&POST_OPS_RELU_2x16,
-						  &&POST_OPS_RELU_SCALE_2x16,
-						  &&POST_OPS_DOWNSCALE_2x16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x16:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x16:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x16:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_2x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-}
-
-// 1x16 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x16)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x16_DISABLE,
-						  &&POST_OPS_BIAS_1x16,
-						  &&POST_OPS_RELU_1x16,
-						  &&POST_OPS_RELU_SCALE_1x16,
-						  &&POST_OPS_DOWNSCALE_1x16
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-	}
-    
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x16:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x16:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x16:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x16:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_1x16_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-}
-
-// 5x32 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x32_DISABLE,
-						  &&POST_OPS_BIAS_5x32,
-						  &&POST_OPS_RELU_5x32,
-						  &&POST_OPS_RELU_SCALE_5x32,
-						  &&POST_OPS_DOWNSCALE_5x32
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-	__m512 c_float_3p1 = _mm512_setzero_ps();
-
-	__m512 c_float_4p0 = _mm512_setzero_ps();
-	__m512 c_float_4p1 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-		
-		// Broadcast a[4,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-	}	
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-		
-		// Broadcast a[4,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-	
-	c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-	c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-		// c[4,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x32:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[4, 16-31]
-			c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			__m512 selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-			__m512 selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 3 ) );
-			__m512 selector5 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 4 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-			// c[4, 16-31]
-			c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x32:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-		// c[4,0-15]
-		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-		// c[4,16-31]
-		c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x32:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-		// c[4, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x32:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		// c[4, 16-31]
-		CVT_F32_BF16(c_float_4p1,4,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_5x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
-
-	// c[4,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
-
-	// c[4,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 );
-}
-
-// 4x32 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x32_DISABLE,
-						  &&POST_OPS_BIAS_4x32,
-						  &&POST_OPS_RELU_4x32,
-						  &&POST_OPS_RELU_SCALE_4x32,
-						  &&POST_OPS_DOWNSCALE_4x32
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-	__m512 c_float_3p1 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x32:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			__m512 selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-			__m512 selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 3 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x32:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x32:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x32:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_4x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
-}
-
-// 3x32 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_3x32_DISABLE,
-						  &&POST_OPS_BIAS_3x32,
-						  &&POST_OPS_RELU_3x32,
-						  &&POST_OPS_RELU_SCALE_3x32,
-						  &&POST_OPS_DOWNSCALE_3x32
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x32:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			__m512 selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_3x32:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x32:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x32:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_3x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-}
-
-// 2x32 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_2x32_DISABLE,
-						  &&POST_OPS_BIAS_2x32,
-						  &&POST_OPS_RELU_2x32,
-						  &&POST_OPS_RELU_SCALE_2x32,
-						  &&POST_OPS_DOWNSCALE_2x32
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x32:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_2x32:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x32:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x32:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_2x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-}
-
-// 1x32 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x32)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x32_DISABLE,
-						  &&POST_OPS_BIAS_1x32,
-						  &&POST_OPS_RELU_1x32,
-						  &&POST_OPS_RELU_SCALE_1x32,
-						  &&POST_OPS_DOWNSCALE_1x32
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x32:
-	{
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x32:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x32:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x32:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_1x32_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-}
-
-// 5x48 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_5x48_DISABLE,
-						  &&POST_OPS_BIAS_5x48,
-						  &&POST_OPS_RELU_5x48,
-						  &&POST_OPS_RELU_SCALE_5x48,
-						  &&POST_OPS_DOWNSCALE_5x48
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-    // B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	__m512 c_float_2p2 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-	__m512 c_float_3p1 = _mm512_setzero_ps();
-	__m512 c_float_3p2 = _mm512_setzero_ps();
-
-	__m512 c_float_4p0 = _mm512_setzero_ps();
-	__m512 c_float_4p1 = _mm512_setzero_ps();
-	__m512 c_float_4p2 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
-		
-		// Broadcast a[4,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
-		
-		// Broadcast a[4,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
-		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
-		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
-	}
-    
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-	c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
-	
-	c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
-	c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
-	c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 );
-
-		// c[4,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-		// c[4,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p1 = _mm512_add_ps( selector1, c_float_4p1 );
-
-		// c[4,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 4 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_4p2 = _mm512_add_ps( selector1, c_float_4p2 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_5x48:
-	{
-		__m512 selector3;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
-
-			// c[4, 16-31]
-			c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
-
-			// c[4,32-47]
-			c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-			__m512 selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 3 ) );
-			__m512 selector5 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 4 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
-
-			// c[4,0-15]
-			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
-
-			// c[4, 16-31]
-			c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
-
-			// c[4,32-47]
-			c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_5x48:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
-
-		// c[4,0-15]
-		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
-
-		// c[4,16-31]
-		c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
-
-		// c[4,32-47]
-		c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_5x48:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
-
-		// c[4, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
-
-		// c[4, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p1)
-
-		// c[4, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_4p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_5x48:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[3, 32-47]
-		CVT_F32_BF16(c_float_3p2,3,2);
-
-		// c[4, 0-15]
-		CVT_F32_BF16(c_float_4p0,4,0);
-
-		// c[4, 16-31]
-		CVT_F32_BF16(c_float_4p1,4,1);
-
-		// c[4, 32-47]
-		CVT_F32_BF16(c_float_4p2,4,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_5x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
-
-	// c[4,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
-
-	// c[4,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 );
-
-	// c[4,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 );
-}
-
-// 4x48 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_4x48_DISABLE,
-						  &&POST_OPS_BIAS_4x48,
-						  &&POST_OPS_RELU_4x48,
-						  &&POST_OPS_RELU_SCALE_4x48,
-						  &&POST_OPS_DOWNSCALE_4x48
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	__m512 c_float_2p2 = _mm512_setzero_ps();
-	
-	__m512 c_float_3p0 = _mm512_setzero_ps();
-	__m512 c_float_3p1 = _mm512_setzero_ps();
-	__m512 c_float_3p2 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Broadcast a[1,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-		
-		// Broadcast a[2,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		
-		// Broadcast a[3,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
-	}
-	// Handle k remainder.	
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Broadcast a[1,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-		
-		// Broadcast a[2,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-		
-		// Broadcast a[3,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
-		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
-		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
-	}
-	
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-	
-	c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
-	c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
-	c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-
-		// c[3,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p1 = _mm512_add_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 3 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_3p2 = _mm512_add_ps( selector1, c_float_3p2 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_4x48:
-	{
-		__m512 selector3;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-			__m512 selector4 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 3 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-
-			// c[3,0-15]
-			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
-
-			// c[3, 16-31]
-			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
-
-			// c[3,32-47]
-			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_4x48:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-		// c[3,0-15]
-		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
-
-		// c[3,16-31]
-		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
-
-		// c[3,32-47]
-		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_4x48:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-		// c[3, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
-
-		// c[3, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
-
-		// c[3, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_4x48:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		// c[3, 0-15]
-		CVT_F32_BF16(c_float_3p0,3,0);
-
-		// c[3, 16-31]
-		CVT_F32_BF16(c_float_3p1,3,1);
-
-		// c[3, 32-47]
-		CVT_F32_BF16(c_float_3p2,3,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_4x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
-
-	// c[3,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
-
-	// c[3,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
-
-	// c[3,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
-}
-
-// 3x48 bf16 fringe kernel: accumulates a 3-row x 48-column f32 tile of C from bf16 A and B, applies alpha/beta and the post-ops, then stores the tile.
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48)
-{
-	static void* post_ops_labels[] = // addresses of the post-op labels below; not indexed directly in this body, presumably consumed by the POST_OP_LABEL_* macros (defined elsewhere)
-						{
-						  &&POST_OPS_3x48_DISABLE,
-						  &&POST_OPS_BIAS_3x48,
-						  &&POST_OPS_RELU_3x48,
-						  &&POST_OPS_RELU_SCALE_3x48,
-						  &&POST_OPS_DOWNSCALE_3x48
-						};
-	dim_t k_full_pieces = k0 / 2; // complete bf16 pairs along k (k0 is not declared here; presumably supplied by LPGEMM_MN_FRINGE_KERN)
-	dim_t k_partial_pieces = k0 % 2; // leftover k elements that do not form a full pair
-
-	int32_t a_kfringe_buf = 0; // zero-initialized 32-bit staging buffer used for the k remainder of A
-
-	// B operand registers: b0/b1/b2 feed the accumulators for column panels 0-15, 16-31 and 32-47.
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-
-	// A operand register: holds one broadcast 32-bit group of A, reloaded for each row in turn.
-	__m512bh a_bf16_0;
-
-	// f32 accumulators for C, named c_float_<row>p<panel>; each covers 16 columns and starts at zero.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-
-	__m512 c_float_2p0 = _mm512_setzero_ps();
-	__m512 c_float_2p1 = _mm512_setzero_ps();
-	__m512 c_float_2p2 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) // one iteration per complete bf16 pair along k
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2] (read as a single 32-bit value).
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] += a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Broadcast a[1,kr:kr+2] (read as a single 32-bit value).
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-47] += a[1,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-		
-		// Broadcast a[2,kr:kr+2] (read as a single 32-bit value).
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[2,0-47] += a[2,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-	}
-	// Handle the k remainder (taken only when k0 % 2 is positive).
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Row 0: copy only the k_partial_pieces leftover bf16 value(s) of A into the staging buffer, then broadcast it.
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Same dot-product step as the main loop; the bytes of a_kfringe_buf not written by memcpy keep their zero initial value.
-		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Row 1: the memcpy overwrites the same leading bytes of the staging buffer with row 1's leftover value(s).
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Same dot-product step as the main loop, applied to row 1.
-		// c[1,0-47] += a[1,k remainder]*b[k remainder,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-		
-		// Row 2: the memcpy overwrites the same leading bytes of the staging buffer with row 2's leftover value(s).
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Same dot-product step as the main loop, applied to row 2.
-		// c[2,0-47] += a[2,k remainder]*b[k remainder,0-47]
-		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
-		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
-		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
-	}
-
-	// Broadcast alpha into selector1 and beta into selector2.
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale every accumulator by alpha.
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-	
-	c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
-	c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
-	c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
-
-	// Add beta * C to the accumulators; C is read from memory only when beta is non-zero.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-
-		// c[2,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p1 = _mm512_add_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 2 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_2p2 = _mm512_add_ps( selector1, c_float_2p2 );
-	}
-	// Post Ops: start from the head of post_ops_list; the jump macro is defined elsewhere and presumably dispatches to one of the labels below.
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_3x48:
-	{
-		__m512 selector3; // third bias vector, used next to selector1/selector2
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) || // 'r'/'R' in op_args2 selects the per-column bias path
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-
-			// c[0,0-15] (the same three bias vectors are added to every row)
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-		}
-		else // any other value: one bias scalar per row (offset post_op_c_i + row), broadcast across the row
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-			selector3 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 2 ) );
-
-			// c[0,0-15] (row 0 uses selector1, row 1 selector2, row 2 selector3)
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-
-			// c[2,0-15]
-			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
-
-			// c[2, 16-31]
-			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
-
-			// c[2,32-47]
-			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR // macro defined elsewhere; presumably advances post_ops_list_temp and jumps to the next post-op label
-	}
-POST_OPS_RELU_3x48:
-	{
-		selector1 = _mm512_setzero_ps(); // zero vector: max( 0, x ) below replaces negative lanes with 0
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		// c[2,0-15]
-		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
-
-		// c[2,16-31]
-		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
-
-		// c[2,32-47]
-		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_3x48:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) ); // broadcast of the float that op_args2 points to
-
-		__mmask16 relu_cmp_mask; // not referenced directly below; presumably used by name inside RELU_SCALE_OP_F32_AVX512 (defined elsewhere) - confirm
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		// c[2, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
-
-		// c[2, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
-
-		// c[2, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_3x48:
-	{
-		// c[0, 0-15] (CVT_F32_BF16 is defined elsewhere; its arguments here are accumulator, row index, 16-column panel index)
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		// c[2, 0-15]
-		CVT_F32_BF16(c_float_2p0,2,0);
-
-		// c[2, 16-31]
-		CVT_F32_BF16(c_float_2p1,2,1);
-
-		// c[2, 32-47]
-		CVT_F32_BF16(c_float_2p2,2,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_3x48_DISABLE:
-	;
-	
-	// Store the f32 accumulators to C with unaligned stores: rows are rs_c apart, 16 floats per vector.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-
-	// c[2,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
-
-	// c[2,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
-
-	// c[2,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
-}
-
-// 2x48 bf16 fringe kernel: accumulates a 2-row x 48-column f32 tile of C from bf16 A and B, applies alpha/beta and the post-ops, then stores the tile.
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48)
-{
-	static void* post_ops_labels[] = // addresses of the post-op labels below; not indexed directly in this body, presumably consumed by the POST_OP_LABEL_* macros (defined elsewhere)
-						{
-						  &&POST_OPS_2x48_DISABLE,
-						  &&POST_OPS_BIAS_2x48,
-						  &&POST_OPS_RELU_2x48,
-						  &&POST_OPS_RELU_SCALE_2x48,
-						  &&POST_OPS_DOWNSCALE_2x48
-						};
-	dim_t k_full_pieces = k0 / 2; // complete bf16 pairs along k (k0 is not declared here; presumably supplied by LPGEMM_MN_FRINGE_KERN)
-	dim_t k_partial_pieces = k0 % 2; // leftover k elements that do not form a full pair
-
-	int32_t a_kfringe_buf = 0; // zero-initialized 32-bit staging buffer used for the k remainder of A
-
-	// B operand registers: b0/b1/b2 feed the accumulators for column panels 0-15, 16-31 and 32-47.
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-
-	// A operand register: holds one broadcast 32-bit group of A, reloaded for each row in turn.
-	__m512bh a_bf16_0;
-
-	// f32 accumulators for C, named c_float_<row>p<panel>; each covers 16 columns and starts at zero.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-
-	__m512 c_float_1p0 = _mm512_setzero_ps();
-	__m512 c_float_1p1 = _mm512_setzero_ps();
-	__m512 c_float_1p2 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) // one iteration per complete bf16 pair along k
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2] (read as a single 32-bit value).
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] += a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Broadcast a[1,kr:kr+2] (read as a single 32-bit value).
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[1,0-47] += a[1,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-	}
-	// Handle the k remainder (taken only when k0 % 2 is positive).
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Row 0: copy only the k_partial_pieces leftover bf16 value(s) of A into the staging buffer, then broadcast it.
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Same dot-product step as the main loop; the bytes of a_kfringe_buf not written by memcpy keep their zero initial value.
-		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-		
-		// Row 1: the memcpy overwrites the same leading bytes of the staging buffer with row 1's leftover value(s).
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Same dot-product step as the main loop, applied to row 1.
-		// c[1,0-47] += a[1,k remainder]*b[k remainder,0-47]
-		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
-		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
-		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
-	}
-	
-	// Broadcast alpha into selector1 and beta into selector2.
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale every accumulator by alpha.
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-
-	c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
-	c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
-	c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
-
-	// Add beta * C to the accumulators; C is read from memory only when beta is non-zero.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p1 = _mm512_add_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 1 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_1p2 = _mm512_add_ps( selector1, c_float_1p2 );
-	}
-	// Post Ops: start from the head of post_ops_list; the jump macro is defined elsewhere and presumably dispatches to one of the labels below.
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_2x48:
-	{
-		__m512 selector3; // third bias vector; only the per-column path below assigns it
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) || // 'r'/'R' in op_args2 selects the per-column bias path
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-
-			// c[0,0-15] (the same three bias vectors are added to every row)
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
-		}
-		else // any other value: one bias scalar per row (offset post_op_c_i + row), broadcast across the row
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-			selector2 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 1 ) );
-
-			// c[0,0-15] (row 0 uses selector1, row 1 uses selector2)
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-
-			// c[1,0-15]
-			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
-
-			// c[1, 16-31]
-			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
-
-			// c[1,32-47]
-			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR // macro defined elsewhere; presumably advances post_ops_list_temp and jumps to the next post-op label
-	}
-POST_OPS_RELU_2x48:
-	{
-		selector1 = _mm512_setzero_ps(); // zero vector: max( 0, x ) below replaces negative lanes with 0
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		// c[1,0-15]
-		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
-
-		// c[1,16-31]
-		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
-
-		// c[1,32-47]
-		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_2x48:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) ); // broadcast of the float that op_args2 points to
-
-		__mmask16 relu_cmp_mask; // not referenced directly below; presumably used by name inside RELU_SCALE_OP_F32_AVX512 (defined elsewhere) - confirm
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		// c[1, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
-
-		// c[1, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
-
-		// c[1, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_2x48:
-	{
-		// c[0, 0-15] (CVT_F32_BF16 is defined elsewhere; its arguments here are accumulator, row index, 16-column panel index)
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		// c[1, 0-15]
-		CVT_F32_BF16(c_float_1p0,1,0);
-
-		// c[1, 16-31]
-		CVT_F32_BF16(c_float_1p1,1,1);
-
-		// c[1, 32-47]
-		CVT_F32_BF16(c_float_1p2,1,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_2x48_DISABLE:
-	;
-	
-	// Store the f32 accumulators to C with unaligned stores: rows are rs_c apart, 16 floats per vector.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-
-	// c[1,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
-
-	// c[1,16-31]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
-
-	// c[1,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
-}
-
-// 1x48 bf16 kernel
-LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48)
-{
-	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_1x48_DISABLE,
-						  &&POST_OPS_BIAS_1x48,
-						  &&POST_OPS_RELU_1x48,
-						  &&POST_OPS_RELU_SCALE_1x48,
-						  &&POST_OPS_DOWNSCALE_1x48
-						};
-	dim_t k_full_pieces = k0 / 2;
-	dim_t k_partial_pieces = k0 % 2;
-
-	int32_t a_kfringe_buf = 0;
-
-	// B matrix storage bfloat type
-	__m512bh b0;
-	__m512bh b1;
-	__m512bh b2;
-
-	// A matrix storage bfloat type
-	__m512bh a_bf16_0;
-
-	// Registers to use for accumulating C.
-	__m512 c_float_0p0 = _mm512_setzero_ps();
-	__m512 c_float_0p1 = _mm512_setzero_ps();
-	__m512 c_float_0p2 = _mm512_setzero_ps();
-
-	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-	}
-	// Handle k remainder.
-	if ( k_partial_pieces > 0 )
-	{
-		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-
-		// Broadcast a[0,kr:kr+2].
-		memcpy( &a_kfringe_buf, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ), ( k_partial_pieces * sizeof( bfloat16 ) ) );
-		a_bf16_0 = (__m512bh)_mm512_set1_epi32( a_kfringe_buf );
-
-		// Perform column direction mat-mul with k = 2.
-		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
-		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
-		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
-		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
-	}
-    
-	// Load alpha and beta
-	__m512 selector1 = _mm512_set1_ps( alpha );
-	__m512 selector2 = _mm512_set1_ps( beta );
-
-	// Scale by alpha
-	c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
-	c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
-	c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
-
-	// Scale C by beta.
-	if ( beta != 0 )
-	{
-		// c[0,0-15]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 0*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 1*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		selector1 = _mm512_loadu_ps( c + ( rs_c * 0 ) + ( 2*16 ) );
-		selector1 = _mm512_mul_ps( selector2, selector1 );
-		c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-	}
-	// Post Ops
-	lpgemm_post_op* post_ops_list_temp = post_ops_list;
-	POST_OP_LABEL_LASTK_SAFE_JUMP
-POST_OPS_BIAS_1x48:
-	{
-		__m512 selector3;
-
-		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
-			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
-		{
-			selector1 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 0 * 16 ) );
-			selector2 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 1 * 16 ) );
-			selector3 =
-				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
-							post_op_c_j + ( 2 * 16 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
-		}
-		else
-		{
-			selector1 =
-				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
-							+ post_op_c_i + 0 ) );
-
-			// c[0,0-15]
-			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
-
-			// c[0, 16-31]
-			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
-
-			// c[0,32-47]
-			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
-		}
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_1x48:
-	{
-		selector1 = _mm512_setzero_ps();
-
-		// c[0,0-15]
-		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
-
-		// c[0, 16-31]
-		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
-
-		// c[0,32-47]
-		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_RELU_SCALE_1x48:
-	{
-		selector1 = _mm512_setzero_ps();
-		selector2 =
-			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
-
-		__mmask16 relu_cmp_mask;
-
-		// c[0, 0-15]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
-
-		// c[0, 16-31]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
-
-		// c[0, 32-47]
-		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}
-POST_OPS_DOWNSCALE_1x48:
-	{
-		// c[0, 0-15]
-		CVT_F32_BF16(c_float_0p0,0,0);
-
-		// c[0, 16-31]
-		CVT_F32_BF16(c_float_0p1,0,1);
-
-		// c[0, 32-47]
-		CVT_F32_BF16(c_float_0p2,0,2);
-
-		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
-	}	
-POST_OPS_1x48_DISABLE:
-	;
-	
-	// Store the results.
-	// c[0,0-15]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
-
-	// c[0, 16-31]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
-
-	// c[0,32-47]
-	_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
-}
-#endif
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_f32_kern_macros.h"
+
+#ifndef LPGEMM_BF16_NOT_SUPPORTED
+// 5xlt16 bf16 fringe kernel: computes 5 rows of C by n0_rem (< 16 per the kernel name) columns from bf16 A/B with f32 accumulation.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5xLT16_DISABLE,
+						  &&POST_OPS_BIAS_5xLT16,
+						  &&POST_OPS_RELU_5xLT16,
+						  &&POST_OPS_RELU_SCALE_5xLT16,
+						  &&POST_OPS_GELU_TANH_5xLT16,
+						  &&POST_OPS_GELU_ERF_5xLT16,
+						  &&POST_OPS_CLIP_5xLT16,
+						  &&POST_OPS_DOWNSCALE_5xLT16
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Scratch for the partial bias row: n0_rem floats are copied in, the remaining lanes are left uninitialized.
+	float buf0[16];
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+
+	__m512 c_float_4p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder (k0 is odd: one unpaired k element is left).
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the unpaired last-k element of row 0 into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Dot-product step for the remainder. NOTE(review): a is duplicated into both halves of each pair, so this presumably relies on packed B holding zero in the unused k slot - confirm.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 1 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 2 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 3 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 4 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+
+		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \
+							selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \
+							selector1, selector2 );
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_3p0, 3, 0, \
+							selector1, selector2 );
+
+			// c[4,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_4p0, 4, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// c[0,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+
+			// c[2,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, 0, 2, 0, \
+							selector1, selector2);
+
+			// c[3,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_3p0, 0, 3, 0, \
+							selector1, selector2);
+
+			// c[4,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_4p0, 0, 4, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5xLT16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				selector1 = _mm512_loadu_ps( buf0 );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				__m512 selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+				__m512 selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 3 ) );
+				__m512 selector5 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 4 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_5xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_5xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_5xLT16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_5xLT16:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_5xLT16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_F32_AVX512(c_float_4p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_5xLT16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5xLT16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (bf16 instead of f32).
+			// c[0,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+		}
+
+	else
+	{
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+		// Store the results as f32 into c; the mask limits each row to its first n0_rem columns.
+		// c[0,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 );
+
+		// c[2,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 2 ), load_mask, c_float_2p0 );
+
+		// c[3,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 3 ), load_mask, c_float_3p0 );
+
+		// c[4,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 4 ), load_mask, c_float_4p0 );
+	}
+}
+
+// 4xlt16 bf16 fringe kernel: computes 4 rows of C by n0_rem (< 16 per the kernel name) columns from bf16 A/B with f32 accumulation.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4xLT16_DISABLE,
+						  &&POST_OPS_BIAS_4xLT16,
+						  &&POST_OPS_RELU_4xLT16,
+						  &&POST_OPS_RELU_SCALE_4xLT16,
+						  &&POST_OPS_GELU_TANH_4xLT16,
+						  &&POST_OPS_GELU_ERF_4xLT16,
+						  &&POST_OPS_CLIP_4xLT16,
+						  &&POST_OPS_DOWNSCALE_4xLT16
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Scratch for the partial bias row: n0_rem floats are copied in, the remaining lanes are left uninitialized.
+	float buf0[16];
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder (k0 is odd: one unpaired k element is left).
+
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the unpaired last-k element of row 0 into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Dot-product step for the remainder. NOTE(review): a is duplicated into both halves of each pair, so this presumably relies on packed B holding zero in the unused k slot - confirm.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 1 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 2 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast the unpaired last-k element of row 3 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+	}
+	
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \
+							selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \
+							selector1, selector2 );
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_3p0, 3, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// c[0,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+
+			// c[2,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, 0, 2, 0, \
+							selector1, selector2);
+
+			// c[3,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_3p0, 0, 3, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4xLT16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				selector1 = _mm512_loadu_ps( buf0 );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				__m512 selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+				__m512 selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 3 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_4xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_4xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_4xLT16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_4xLT16:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_4xLT16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_4xLT16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4xLT16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+		// Store the results in downscaled type (bf16 instead of f32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+	}
+	else
+	{
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+		// Store the results as f32 into c; the mask limits each row to its first n0_rem columns.
+		// c[0,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 );
+
+		// c[2,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 2 ), load_mask, c_float_2p0 );
+
+		// c[3,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 3 ), load_mask, c_float_3p0 );
+	}
+
+}
+
+// 3xlt16 bf16 fringe kernel
+LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3xLT16_DISABLE,
+						  &&POST_OPS_BIAS_3xLT16,
+						  &&POST_OPS_RELU_3xLT16,
+						  &&POST_OPS_RELU_SCALE_3xLT16,
+						  &&POST_OPS_GELU_TANH_3xLT16,
+						  &&POST_OPS_GELU_ERF_3xLT16,
+						  &&POST_OPS_CLIP_3xLT16,
+						  &&POST_OPS_DOWNSCALE_3xLT16
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// For corner cases.
+	float buf0[16];
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \
+							selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// c[0,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+
+			// c[2,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, 0, 2, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3xLT16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				selector1 = _mm512_loadu_ps( buf0 );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				__m512 selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_3xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_3xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_3xLT16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_3xLT16:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_3xLT16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_3xLT16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3xLT16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+	}
+
+	else
+	{
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+		// Store the results.
+		// c[0,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 );
+
+		// c[2,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 2 ), load_mask, c_float_2p0 );
+	}
+
+}
+
+// 2xlt16 bf16 fringe kernel: 2 rows of C by n0_rem columns (one masked 16-lane vector per row), bf16 inputs accumulated in f32.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2xlt16)
+{
+	static void* post_ops_labels[] = // addresses of the post-op label sections below (GNU &&label)
+						{
+						  &&POST_OPS_2xLT16_DISABLE,
+						  &&POST_OPS_BIAS_2xLT16,
+						  &&POST_OPS_RELU_2xLT16,
+						  &&POST_OPS_RELU_SCALE_2xLT16,
+						  &&POST_OPS_GELU_TANH_2xLT16,
+						  &&POST_OPS_GELU_ERF_2xLT16,
+						  &&POST_OPS_CLIP_2xLT16,
+						  &&POST_OPS_DOWNSCALE_2xLT16
+						};
+	dim_t k_full_pieces = k0 / 2; // number of complete pairs along k
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, else 0
+
+	int16_t a_kfringe_buf = 0; // holds the single leftover A element in the k remainder
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Staging buffer for the row-bias copy; only the first n0_rem floats are written.
+	float buf0[16];
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2] (one 32-bit word read from A) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2] (one 32-bit word read from A) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single leftover k element of row 0 to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// NOTE(review): A is duplicated in both halves of each pair; presumably B is zero-padded here - confirm.
+		// c[0,0-15] += k-remainder contribution for row 0
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover k element of row 1 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k remainder for row 1 (same B vector as row 0).
+		// c[1,0-15] += k-remainder contribution for row 1
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+	}
+
+	// Broadcast alpha and beta to all lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+			// c[0,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2xLT16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				selector1 = _mm512_loadu_ps( buf0 ); // NOTE(review): lanes >= n0_rem come from uninitialised buf0 entries
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+			}
+			else // op_args2 is not 'r'/'R': one bias value per row, broadcast across the row
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_2xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15] = max(0, c[0,0-15])
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15] = max(0, c[1,0-15])
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_2xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask; // not used directly here; presumably consumed by the macro below - confirm
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_2xLT16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh; // scratch registers handed to the GELU macro
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_2xLT16:
+		{
+			__m512 x, r, x_erf; // scratch registers handed to the GELU macro
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_2xLT16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_2xLT16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2xLT16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // despite the name: only the low n0_rem bits are set
+
+		// Store the results in downscaled type (bf16 instead of f32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+	}
+
+	else
+	{
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // used as the store mask here
+
+		// Store the results (only the first n0_rem floats of each row are written).
+		// c[0,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 1 ), load_mask, c_float_1p0 );
+	}
+
+}
+
+// 1xlt16 bf16 fringe kernel: 1 row of C by n0_rem columns (one masked 16-lane vector), bf16 inputs accumulated in f32.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1xlt16)
+{
+	static void* post_ops_labels[] = // addresses of the post-op label sections below (GNU &&label)
+						{
+						  &&POST_OPS_1xLT16_DISABLE,
+						  &&POST_OPS_BIAS_1xLT16,
+						  &&POST_OPS_RELU_1xLT16,
+						  &&POST_OPS_RELU_SCALE_1xLT16,
+						  &&POST_OPS_GELU_TANH_1xLT16,
+						  &&POST_OPS_GELU_ERF_1xLT16,
+						  &&POST_OPS_CLIP_1xLT16,
+						  &&POST_OPS_DOWNSCALE_1xLT16
+						};
+	dim_t k_full_pieces = k0 / 2; // number of complete pairs along k
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, else 0
+
+	int16_t a_kfringe_buf = 0; // holds the single leftover A element in the k remainder
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Staging buffer for the row-bias copy; only the first n0_rem floats are written.
+	float buf0[16];
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2] (one 32-bit word read from A) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single leftover k element of row 0 to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// NOTE(review): A is duplicated in both halves of each pair; presumably B is zero-padded here - confirm.
+		// c[0,0-15] += k-remainder contribution for row 0
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+	}
+
+	// Broadcast alpha and beta to all lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+			// c[0,0-15]
+			F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1xLT16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				selector1 = _mm512_loadu_ps( buf0 ); // NOTE(review): lanes >= n0_rem come from uninitialised buf0 entries
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+			}
+			else // op_args2 is not 'r'/'R': one bias value for the row, broadcast across it
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_1xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15] = max(0, c[0,0-15])
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_1xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask; // not used directly here; presumably consumed by the macro below - confirm
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_1xLT16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh; // scratch registers handed to the GELU macro
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_1xLT16:
+		{
+			__m512 x, r, x_erf; // scratch registers handed to the GELU macro
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_1xLT16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_1xLT16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1xLT16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // despite the name: only the low n0_rem bits are set
+
+			// Store the results in downscaled type (bf16 instead of f32).
+			// c[0,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+		}
+
+	else
+	{
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // used as the store mask here
+
+		// Store the results (only the first n0_rem floats of the row are written).
+		// c[0,0-15]
+		_mm512_mask_storeu_ps( c + ( rs_c * 0 ), load_mask, c_float_0p0 );
+	}
+
+}
+
+// 5x16 bf16 kernel: 5 rows of C by a full 16-lane vector of columns, bf16 inputs accumulated in f32.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x16)
+{
+	static void* post_ops_labels[] = // addresses of the post-op label sections below (GNU &&label)
+						{
+						  &&POST_OPS_5x16_DISABLE,
+						  &&POST_OPS_BIAS_5x16,
+						  &&POST_OPS_RELU_5x16,
+						  &&POST_OPS_RELU_SCALE_5x16,
+						  &&POST_OPS_GELU_TANH_5x16,
+						  &&POST_OPS_GELU_ERF_5x16,
+						  &&POST_OPS_CLIP_5x16,
+						  &&POST_OPS_DOWNSCALE_5x16
+						};
+	dim_t k_full_pieces = k0 / 2; // number of complete pairs along k
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, else 0
+
+	int16_t a_kfringe_buf = 0; // holds the single leftover A element in the k remainder
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+
+	__m512 c_float_4p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2] (one 32-bit word read from A) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single leftover k element of row 0 to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// NOTE(review): A is duplicated in both halves of each pair; presumably B is zero-padded here - confirm.
+		// c[0,0-15] += k-remainder contribution for row 0
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover k element of row 1 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k remainder for row 1 (same B vector as row 0).
+		// c[1,0-15] += k-remainder contribution for row 1
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover k element of row 2 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k remainder for row 2 (same B vector as row 0).
+		// c[2,0-15] += k-remainder contribution for row 2
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover k element of row 3 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k remainder for row 3 (same B vector as row 0).
+		// c[3,0-15] += k-remainder contribution for row 3
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+		// Broadcast the single leftover k element of row 4 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k remainder for row 4 (same B vector as row 0).
+		// c[4,0-15] += k-remainder contribution for row 4
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+	}
+
+	// Broadcast alpha and beta to all lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+
+		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \
+							selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, \
+							selector1, selector2 );
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, \
+							selector1, selector2 );
+
+			// c[4,0-15]
+			BF16_F32_BETA_OP( c_float_4p0, 0, 4, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+
+			// c[2,0-15]
+			F32_F32_BETA_OP(c_float_2p0, 0, 2, 0, \
+							selector1, selector2);
+
+			// c[3,0-15]
+			F32_F32_BETA_OP(c_float_3p0, 0, 3, 0, \
+							selector1, selector2);
+
+			// c[4,0-15]
+			F32_F32_BETA_OP(c_float_4p0, 0, 4, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x16:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+		}
+		else // op_args2 is not 'r'/'R': one bias value per row, broadcast across the row
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			__m512 selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+			__m512 selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 3 ) );
+			__m512 selector5 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 4 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x16:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15] = max(0, c[0,0-15])
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[1,0-15] = max(0, c[1,0-15])
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[2,0-15] = max(0, c[2,0-15])
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[3,0-15] = max(0, c[3,0-15])
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[4,0-15] = max(0, c[4,0-15])
+		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x16:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // not used directly here; presumably consumed by the macro below - confirm
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x16:
+	{
+		__m512 dn, z, x, r2, r, x_tanh; // scratch registers handed to the GELU macro
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x16:
+	{
+		__m512 x, r, x_erf; // scratch registers handed to the GELU macro
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x16:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[4, 0-15]
+		CLIP_F32_AVX512(c_float_4p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of f32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[4,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+	}
+
+	else
+	{
+		// Store the results as full, unmasked 16-float rows.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[4,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
+	}
+}
+
+// 4x16 bf16 kernel
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x16_DISABLE,
+						  &&POST_OPS_BIAS_4x16,
+						  &&POST_OPS_RELU_4x16,
+						  &&POST_OPS_RELU_SCALE_4x16,
+						  &&POST_OPS_GELU_TANH_4x16,
+						  &&POST_OPS_GELU_ERF_4x16,
+						  &&POST_OPS_CLIP_4x16,
+						  &&POST_OPS_DOWNSCALE_4x16
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \
+							selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, \
+							selector1, selector2 );
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+
+			// c[2,0-15]
+			F32_F32_BETA_OP(c_float_2p0, 0, 2, 0, \
+							selector1, selector2);
+
+			// c[3,0-15]
+			F32_F32_BETA_OP(c_float_3p0, 0, 3, 0, \
+							selector1, selector2);
+		}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x16:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			__m512 selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+			__m512 selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 3 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x16:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x16:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x16:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x16:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x16:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x16_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+	}	
+
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+	}
+}
+
+// 3x16 bf16 fringe kernel.
+// Accumulates a 3-row by 16-column tile of A*B in float from bf16 inputs,
+// scales it by alpha, optionally folds in the existing C values via the
+// *_BETA_OP macros, runs the requested post-ops, and stores the tile either
+// as float into c or, on the downscale path, converted to bf16.
+// Names used below that are not declared in this body (a, b, c, k0, rs_a,
+// cs_a, rs_b, cs_b, rs_c, alpha, beta, post_ops_list, post_ops_attr) come
+// from the signature generated by LPGEMM_MN_FRINGE_KERN (defined elsewhere).
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x16)
+{
+	// Table of label addresses (GCC "labels as values"), one entry per
+	// post-op handler in this function. It is not referenced directly in
+	// this body; presumably the POST_OP_LABEL_LASTK_SAFE_JUMP* macros index
+	// it by name, so the entry order matters - confirm against the macros.
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x16_DISABLE,
+						  &&POST_OPS_BIAS_3x16,
+						  &&POST_OPS_RELU_3x16,
+						  &&POST_OPS_RELU_SCALE_3x16,
+						  &&POST_OPS_GELU_TANH_3x16,
+						  &&POST_OPS_GELU_ERF_3x16,
+						  &&POST_OPS_CLIP_3x16,
+						  &&POST_OPS_DOWNSCALE_3x16
+						};
+	// K is consumed two bf16 elements at a time: k_full_pieces counts the
+	// complete pairs and k_partial_pieces is 1 when k0 is odd, 0 otherwise.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// Scratch for the single unpaired A element used in the k remainder.
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	// Naming: c_float_<row>p<16-column block>; one register per row here.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		// One 512-bit load of B for this k pair; reused by all three rows.
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		// The 32-bit read picks up two adjacent 16-bit A values at once and
+		// replicates that pair into every 32-bit lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	// Only one A element per row is left, so it is read as a single 16-bit
+	// value and replicated into every 16-bit lane: both halves of each A
+	// pair hold the same value. NOTE(review): the dpbf16 step is then only a
+	// k = 1 update if the second element of each B pair is zero - presumably
+	// guaranteed by how B is packed; confirm against the packing routine.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single remaining k element of row 0.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the remainder contribution into c[0,0-15].
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the single remaining k element of row 1.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the remainder contribution into c[1,0-15].
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+		// Broadcast the single remaining k element of row 2.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the remainder contribution into c[2,0-15].
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+	}
+
+	// Load alpha and beta
+	// selector1/selector2 are general scratch registers: they carry alpha
+	// and beta here and are overwritten by several post-op handlers below.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	// The multiply is skipped entirely when alpha is exactly 1.
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+	}
+
+	// Scale C by beta.
+	// Skipped when beta is exactly 0. Both *_BETA_OP macros are defined
+	// elsewhere; the BF16 variant is chosen only when a downscale buffer is
+	// present and this is the first k iteration. NOTE(review): presumably it
+	// reads the previous C values as bf16 from buf_downscale while the F32
+	// variant reads floats from c - confirm against the macro definitions.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \
+							selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+
+			// c[2,0-15]
+			F32_F32_BETA_OP(c_float_2p0, 0, 2, 0, \
+							selector1, selector2);
+		}
+		
+	}
+	// Post Ops
+	// post_ops_list_temp is the cursor over the post-op list. The jump
+	// macros are defined elsewhere; presumably they dispatch through
+	// post_ops_labels to the handler for the current node, and the
+	// _WITH_NEXT_PTR form advances the cursor first - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x16:
+	{
+		// op_args2 is read as a char: 'r'/'R' adds one 16-float vector
+		// (from op_args1 at offset post_op_c_j) to every row.
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+		}
+		else
+		{
+			// Otherwise each row gets its own scalar, read from op_args1 at
+			// offset post_op_c_i + row and broadcast across the 16 lanes.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			__m512 selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x16:
+	{
+		// ReLU: lane-wise max of each accumulator with zero.
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x16:
+	{
+		// selector1 (zero), selector2 (scale broadcast from op_args2) and
+		// relu_cmp_mask are not passed to the macro; presumably
+		// RELU_SCALE_OP_F32_AVX512 references them by name - confirm.
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x16:
+	{
+		// Scratch registers handed to the GELU (tanh form) macro, which is
+		// defined elsewhere; they are reused for every row.
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x16:
+	{
+		// Scratch registers handed to the GELU (erf form) macro, which is
+		// defined elsewhere; they are reused for every row.
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x16:
+	{
+		// Clip bounds are single floats: lower from op_args2, upper from
+		// op_args3, each broadcast to all 16 lanes.
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x16:
+	{
+		// MULRND_F32 is defined elsewhere. NOTE(review): the two trailing
+		// arguments look like the row index and the 16-column block index
+		// of the register - confirm against the macro.
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x16_DISABLE:
+	;
+	// Final store. The bf16 path is taken only when a downscale buffer is
+	// present and this is the last k iteration; otherwise floats go to c.
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		// 0 < 10 holds in every 32-bit lane, so all 16 mask bits are set.
+		// mask_all1 is not referenced directly below; presumably
+		// CVT_STORE_F32_BF16_MASK picks it up by name - confirm.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+	}	
+
+	else
+	{
+		// Store the results.
+		// Unaligned 16-float stores, one per row, rs_c elements apart.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+	}
+}
+
+// 2x16 bf16 fringe kernel.
+// Accumulates a 2-row by 16-column tile of A*B in float from bf16 inputs,
+// scales it by alpha, optionally folds in the existing C values via the
+// *_BETA_OP macros, runs the requested post-ops, and stores the tile either
+// as float into c or, on the downscale path, converted to bf16.
+// Names used below that are not declared in this body (a, b, c, k0, rs_a,
+// cs_a, rs_b, cs_b, rs_c, alpha, beta, post_ops_list, post_ops_attr) come
+// from the signature generated by LPGEMM_MN_FRINGE_KERN (defined elsewhere).
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x16)
+{
+	// Table of label addresses (GCC "labels as values"), one entry per
+	// post-op handler in this function. It is not referenced directly in
+	// this body; presumably the POST_OP_LABEL_LASTK_SAFE_JUMP* macros index
+	// it by name, so the entry order matters - confirm against the macros.
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x16_DISABLE,
+						  &&POST_OPS_BIAS_2x16,
+						  &&POST_OPS_RELU_2x16,
+						  &&POST_OPS_RELU_SCALE_2x16,
+						  &&POST_OPS_GELU_TANH_2x16,
+						  &&POST_OPS_GELU_ERF_2x16,
+						  &&POST_OPS_CLIP_2x16,
+						  &&POST_OPS_DOWNSCALE_2x16
+						};
+	// K is consumed two bf16 elements at a time: k_full_pieces counts the
+	// complete pairs and k_partial_pieces is 1 when k0 is odd, 0 otherwise.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// Scratch for the single unpaired A element used in the k remainder.
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	// Naming: c_float_<row>p<16-column block>; one register per row here.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		// One 512-bit load of B for this k pair; reused by both rows.
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		// The 32-bit read picks up two adjacent 16-bit A values at once and
+		// replicates that pair into every 32-bit lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	// Only one A element per row is left, so it is read as a single 16-bit
+	// value and replicated into every 16-bit lane: both halves of each A
+	// pair hold the same value. NOTE(review): the dpbf16 step is then only a
+	// k = 1 update if the second element of each B pair is zero - presumably
+	// guaranteed by how B is packed; confirm against the packing routine.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single remaining k element of row 0.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the remainder contribution into c[0,0-15].
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+		// Broadcast the single remaining k element of row 1.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the remainder contribution into c[1,0-15].
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+	}
+
+	// Load alpha and beta
+	// selector1/selector2 are general scratch registers: they carry alpha
+	// and beta here and are overwritten by several post-op handlers below.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	// The multiply is skipped entirely when alpha is exactly 1.
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+	}
+
+	// Scale C by beta.
+	// Skipped when beta is exactly 0. Both *_BETA_OP macros are defined
+	// elsewhere; the BF16 variant is chosen only when a downscale buffer is
+	// present and this is the first k iteration. NOTE(review): presumably it
+	// reads the previous C values as bf16 from buf_downscale while the F32
+	// variant reads floats from c - confirm against the macro definitions.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \
+							selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+
+			// c[1,0-15]
+			F32_F32_BETA_OP(c_float_1p0, 0, 1, 0, \
+							selector1, selector2);
+		}
+		
+	}
+	// Post Ops
+	// post_ops_list_temp is the cursor over the post-op list. The jump
+	// macros are defined elsewhere; presumably they dispatch through
+	// post_ops_labels to the handler for the current node, and the
+	// _WITH_NEXT_PTR form advances the cursor first - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x16:
+	{
+		// op_args2 is read as a char: 'r'/'R' adds one 16-float vector
+		// (from op_args1 at offset post_op_c_j) to every row.
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+		}
+		else
+		{
+			// Otherwise each row gets its own scalar, read from op_args1 at
+			// offset post_op_c_i + row and broadcast across the 16 lanes.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x16:
+	{
+		// ReLU: lane-wise max of each accumulator with zero.
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x16:
+	{
+		// selector1 (zero), selector2 (scale broadcast from op_args2) and
+		// relu_cmp_mask are not passed to the macro; presumably
+		// RELU_SCALE_OP_F32_AVX512 references them by name - confirm.
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x16:
+	{
+		// Scratch registers handed to the GELU (tanh form) macro, which is
+		// defined elsewhere; they are reused for every row.
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x16:
+	{
+		// Scratch registers handed to the GELU (erf form) macro, which is
+		// defined elsewhere; they are reused for every row.
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x16:
+	{
+		// Clip bounds are single floats: lower from op_args2, upper from
+		// op_args3, each broadcast to all 16 lanes.
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x16:
+	{
+		// MULRND_F32 is defined elsewhere. NOTE(review): the two trailing
+		// arguments look like the row index and the 16-column block index
+		// of the register - confirm against the macro.
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x16_DISABLE:
+	;
+	// Final store. The bf16 path is taken only when a downscale buffer is
+	// present and this is the last k iteration; otherwise floats go to c.
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		// 0 < 10 holds in every 32-bit lane, so all 16 mask bits are set.
+		// mask_all1 is not referenced directly below; presumably
+		// CVT_STORE_F32_BF16_MASK picks it up by name - confirm.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+	}		
+
+	else
+	{
+		// Store the results.
+		// Unaligned 16-float stores, one per row, rs_c elements apart.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+	}
+}
+
+// 1x16 bf16 fringe kernel.
+// Accumulates a 1-row by 16-column tile of A*B in float from bf16 inputs,
+// scales it by alpha, optionally folds in the existing C values via the
+// *_BETA_OP macros, runs the requested post-ops, and stores the tile either
+// as float into c or, on the downscale path, converted to bf16.
+// Names used below that are not declared in this body (a, b, c, k0, rs_a,
+// cs_a, rs_b, cs_b, rs_c, alpha, beta, post_ops_list, post_ops_attr) come
+// from the signature generated by LPGEMM_MN_FRINGE_KERN (defined elsewhere).
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x16)
+{
+	// Table of label addresses (GCC "labels as values"), one entry per
+	// post-op handler in this function. It is not referenced directly in
+	// this body; presumably the POST_OP_LABEL_LASTK_SAFE_JUMP* macros index
+	// it by name, so the entry order matters - confirm against the macros.
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x16_DISABLE,
+						  &&POST_OPS_BIAS_1x16,
+						  &&POST_OPS_RELU_1x16,
+						  &&POST_OPS_RELU_SCALE_1x16,
+						  &&POST_OPS_GELU_TANH_1x16,
+						  &&POST_OPS_GELU_ERF_1x16,
+						  &&POST_OPS_CLIP_1x16,
+						  &&POST_OPS_DOWNSCALE_1x16
+						};
+	// K is consumed two bf16 elements at a time: k_full_pieces counts the
+	// complete pairs and k_partial_pieces is 1 when k0 is odd, 0 otherwise.
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	// Scratch for the single unpaired A element used in the k remainder.
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	// Naming: c_float_<row>p<16-column block>; a single register here.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		// One 512-bit load of B for this k pair.
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		// The 32-bit read picks up two adjacent 16-bit A values at once and
+		// replicates that pair into every 32-bit lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+	}
+	// Handle k remainder.
+	// Only one A element is left, so it is read as a single 16-bit value
+	// and replicated into every 16-bit lane: both halves of each A pair
+	// hold the same value. NOTE(review): the dpbf16 step is then only a
+	// k = 1 update if the second element of each B pair is zero - presumably
+	// guaranteed by how B is packed; confirm against the packing routine.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the single remaining k element of row 0.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the remainder contribution into c[0,0-15].
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+	}
+
+	// Load alpha and beta
+	// selector1/selector2 are general scratch registers: they carry alpha
+	// and beta here and are overwritten by several post-op handlers below.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	// The multiply is skipped entirely when alpha is exactly 1.
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+	}
+
+	// Scale C by beta.
+	// Skipped when beta is exactly 0. Both *_BETA_OP macros are defined
+	// elsewhere; the BF16 variant is chosen only when a downscale buffer is
+	// present and this is the first k iteration. NOTE(review): presumably it
+	// reads the previous C values as bf16 from buf_downscale while the F32
+	// variant reads floats from c - confirm against the macro definitions.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, \
+							selector1, selector2 );
+		}
+		else
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP(c_float_0p0, 0, 0, 0, \
+							selector1, selector2);
+		}
+		
+	}
+	// Post Ops
+	// post_ops_list_temp is the cursor over the post-op list. The jump
+	// macros are defined elsewhere; presumably they dispatch through
+	// post_ops_labels to the handler for the current node, and the
+	// _WITH_NEXT_PTR form advances the cursor first - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x16:
+	{
+		// op_args2 is read as a char: 'r'/'R' adds a 16-float vector loaded
+		// from op_args1 at offset post_op_c_j.
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+		}
+		else
+		{
+			// Otherwise a single scalar is read from op_args1 at offset
+			// post_op_c_i and broadcast across the 16 lanes.
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x16:
+	{
+		// ReLU: lane-wise max of the accumulator with zero.
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x16:
+	{
+		// selector1 (zero), selector2 (scale broadcast from op_args2) and
+		// relu_cmp_mask are not passed to the macro; presumably
+		// RELU_SCALE_OP_F32_AVX512 references them by name - confirm.
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x16:
+	{
+		// Scratch registers handed to the GELU (tanh form) macro, which is
+		// defined elsewhere.
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x16:
+	{
+		// Scratch registers handed to the GELU (erf form) macro, which is
+		// defined elsewhere.
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x16:
+	{
+		// Clip bounds are single floats: lower from op_args2, upper from
+		// op_args3, each broadcast to all 16 lanes.
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x16:
+	{
+		// MULRND_F32 is defined elsewhere. NOTE(review): the two trailing
+		// arguments look like the row index and the 16-column block index
+		// of the register - confirm against the macro.
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x16_DISABLE:
+	;
+	// Final store. The bf16 path is taken only when a downscale buffer is
+	// present and this is the last k iteration; otherwise floats go to c.
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		// 0 < 10 holds in every 32-bit lane, so all 16 mask bits are set.
+		// mask_all1 is not referenced directly below; presumably
+		// CVT_STORE_F32_BF16_MASK picks it up by name - confirm.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+	}		
+	else
+	{
+		// Store the results.
+		// Single unaligned 16-float store into c.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+	}
+}
+
+// 5x32 bf16 kernel
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5x32_DISABLE,
+						  &&POST_OPS_BIAS_5x32,
+						  &&POST_OPS_RELU_5x32,
+						  &&POST_OPS_RELU_SCALE_5x32,
+						  &&POST_OPS_GELU_TANH_5x32,
+						  &&POST_OPS_GELU_ERF_5x32,
+						  &&POST_OPS_CLIP_5x32,
+						  &&POST_OPS_DOWNSCALE_5x32
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+	__m512 c_float_3p1 = _mm512_setzero_ps();
+
+	__m512 c_float_4p0 = _mm512_setzero_ps();
+	__m512 c_float_4p1 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );\
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+
+		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+		c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 );
+
+			// c[2, 16-31]
+			BF16_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 );
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 );
+
+			// c[3, 16-31]
+			BF16_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 );
+
+			// c[4,0-15]
+			BF16_F32_BETA_OP( c_float_4p0, 0, 4, 0, selector1, selector2 );
+
+			// c[4, 16-31]
+			BF16_F32_BETA_OP( c_float_4p1, 0, 4, 1, selector1, selector2 );
+		}
+		else 
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+
+			// c[2,0-15]
+			F32_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 );
+
+			// c[2, 16-31]
+			F32_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 );
+
+			// c[3,0-15]
+			F32_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 );
+
+			// c[3, 16-31]
+			F32_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 );
+
+			// c[4,0-15]
+			F32_F32_BETA_OP( c_float_4p0, 0, 4, 0, selector1, selector2 );
+
+			// c[4, 16-31]
+			F32_F32_BETA_OP( c_float_4p1, 0, 4, 1, selector1, selector2 );
+		}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x32:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+			// c[4, 16-31]
+			c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			__m512 selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+			__m512 selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 3 ) );
+			__m512 selector5 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 4 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+			// c[4, 16-31]
+			c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x32:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[3,16-31]
+		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+		// c[4,0-15]
+		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+		// c[4,16-31]
+		c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x32:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x32:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_F32_AVX512(c_float_4p1, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x32:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_F32_AVX512(c_float_4p1, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x32:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_F32_AVX512(c_float_3p1, min, max)
+
+		// c[4, 0-15]
+		CLIP_F32_AVX512(c_float_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_F32_AVX512(c_float_4p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x32:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		MULRND_F32(c_float_4p1,4,1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x32_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[3,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+		// c[4,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1);
+	}
+
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
+
+		// c[4,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 );
+	}
+}
+
+// 4x32 bf16 fringe kernel: accumulates a 4-row x 32-column float tile of C from bf16 A and B (two k elements per dpbf16 step), applies alpha/beta and the post-op chain, then stores the tile as float or bf16.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x32_DISABLE,
+						  &&POST_OPS_BIAS_4x32,
+						  &&POST_OPS_RELU_4x32,
+						  &&POST_OPS_RELU_SCALE_4x32,
+						  &&POST_OPS_GELU_TANH_4x32,
+						  &&POST_OPS_GELU_ERF_4x32,
+						  &&POST_OPS_CLIP_4x32,
+						  &&POST_OPS_DOWNSCALE_4x32
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type (b0 feeds columns 0-15, b1 feeds columns 16-31)
+	__m512bh b0;
+	__m512bh b1;
+
+	// A matrix storage bfloat type (one broadcast register, reused for every row)
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C: c_float_<row>p<half>, 16 floats each.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+	__m512 c_float_3p1 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast the 32-bit pair a[0,kr:kr+2] (two adjacent bf16) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] += a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast the 32-bit pair a[1,kr:kr+2] (two adjacent bf16) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] += a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+		// Broadcast the 32-bit pair a[2,kr:kr+2] (two adjacent bf16) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-31] += a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+
+		// Broadcast the 32-bit pair a[3,kr:kr+2] (two adjacent bf16) to all lanes.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-31] += a[3,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+	}
+	// Handle k remainder (k0 is odd, so exactly one trailing k element is left).
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast the single trailing element of row 0 of a to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// NOTE(review): both halves of each bf16 pair now hold the same a value, so the second half of each B pair presumably must be zero padding - confirm against the B packing routine.
+		// c[0,0-31] += a[0,k_last]*b[k_last,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast the single trailing element of row 1 of a to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k-remainder contribution for row 1.
+		// c[1,0-31] += a[1,k_last]*b[k_last,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+		// Broadcast the single trailing element of row 2 of a to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k-remainder contribution for row 2.
+		// c[2,0-31] += a[2,k_last]*b[k_last,0-31]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+
+		// Broadcast the single trailing element of row 3 of a to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the k-remainder contribution for row 3.
+		// c[3,0-31] += a[3,k_last]*b[k_last,0-31]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+	}
+
+	// Load alpha and beta (selector1/selector2 are reused as scratch registers by the post-ops below)
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (multiplication skipped when alpha == 1)
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+	}
+
+	// Scale C by beta (whole step skipped when beta == 0); the bf16 variant is used when a downscale buffer is set and this is the first k block.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 );
+
+			// c[2, 16-31]
+			BF16_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 );
+
+			// c[3,0-15]
+			BF16_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 );
+
+			// c[3, 16-31]
+			BF16_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 );
+		}
+		else 
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+
+			// c[2,0-15]
+			F32_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 );
+
+			// c[2, 16-31]
+			F32_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 );
+
+			// c[3,0-15]
+			F32_F32_BETA_OP( c_float_3p0, 0, 3, 0, selector1, selector2 );
+
+			// c[3, 16-31]
+			F32_F32_BETA_OP( c_float_3p1, 0, 3, 1, selector1, selector2 );
+		}
+	}
+	// Post Ops: presumably dispatched through post_ops_labels by the jump macros (defined elsewhere) - TODO confirm label order matches the post-op enum.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x32:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			__m512 selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+			__m512 selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 3 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x32:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[3,16-31]
+		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x32:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x32:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x32:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x32:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_F32_AVX512(c_float_3p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x32:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x32_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every 32-bit lane); presumably consumed by name inside CVT_STORE_F32_BF16_MASK.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[3,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+	}	
+
+	else
+	{
+		// Store the results as float directly into c (row stride rs_c, 16 floats per store).
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
+	}
+}
+
+// 3x32 bf16 kernel
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x32_DISABLE,
+						  &&POST_OPS_BIAS_3x32,
+						  &&POST_OPS_RELU_3x32,
+						  &&POST_OPS_RELU_SCALE_3x32,
+						  &&POST_OPS_GELU_TANH_3x32,
+						  &&POST_OPS_GELU_ERF_3x32,
+						  &&POST_OPS_CLIP_3x32,
+						  &&POST_OPS_DOWNSCALE_3x32
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+
+			// c[2,0-15]
+			BF16_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 );
+
+			// c[2, 16-31]
+			BF16_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 );
+		}
+		else 
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+
+			// c[2,0-15]
+			F32_F32_BETA_OP( c_float_2p0, 0, 2, 0, selector1, selector2 );
+
+			// c[2, 16-31]
+			F32_F32_BETA_OP( c_float_2p1, 0, 2, 1, selector1, selector2 );
+		}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x32:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			__m512 selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x32:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x32:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x32:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x32:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x32:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x32:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x32_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of f32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+	}
+
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+	}
+}
+
+// 2x32 bf16 fringe kernel: builds a 2-row x 32-column f32 tile of C from bf16 A and B, applies alpha/beta and post-ops, then stores it.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x32_DISABLE,
+						  &&POST_OPS_BIAS_2x32,
+						  &&POST_OPS_RELU_2x32,
+						  &&POST_OPS_RELU_SCALE_2x32,
+						  &&POST_OPS_GELU_TANH_2x32,
+						  &&POST_OPS_GELU_ERF_2x32,
+						  &&POST_OPS_CLIP_2x32,
+						  &&POST_OPS_DOWNSCALE_2x32
+						}; // label addresses (GCC labels-as-values); presumably indexed by the POST_OP_LABEL_* macros - TODO confirm
+	dim_t k_full_pieces = k0 / 2; // number of complete k pairs
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, otherwise 0
+
+	int16_t a_kfringe_buf = 0; // holds the single leftover A element when k0 is odd
+
+	// B matrix storage bfloat type (two 512-bit vectors cover the 32 columns)
+	__m512bh b0;
+	__m512bh b1;
+
+	// A matrix storage bfloat type (reused for each row's broadcast)
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C: c_float_<row>p<16-column group>.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+2]: one 32-bit load (a bf16 pair) replicated to every dword lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Broadcast a[1,kr:kr+2]: one 32-bit load (a bf16 pair) replicated to every dword lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+	}
+	// Handle k remainder (k0 odd): one k element is left after the full pairs.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Load the single leftover element of row 0 and replicate it into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// dpbf16 still multiplies lane pairs, so both halves of each pair see the same a value;
+		// presumably packed B zero-pads the missing k row - TODO confirm. Accumulates into c[0,0-31].
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+		// Load the single leftover element of row 1 and replicate it into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Same pairwise dot-product as for row 0 (same assumption about B padding).
+		// Accumulates into c[1,0-31].
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+	}
+
+	// Broadcast alpha and beta across all f32 lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). The *_BETA_OP macros are defined elsewhere; presumably they load C, scale by selector2 and add - TODO confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// Taken when a downscale buffer is set and this is the first k block.
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			BF16_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			BF16_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+		}
+		else 
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+
+			// c[1,0-15]
+			F32_F32_BETA_OP( c_float_1p0, 0, 1, 0, selector1, selector2 );
+
+			// c[1, 16-31]
+			F32_F32_BETA_OP( c_float_1p1, 0, 1, 1, selector1, selector2 );
+		}
+	}
+	// Post Ops: the JUMP macros are defined elsewhere; presumably they dispatch through post_ops_labels to the labels below.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x32:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) ) // 'r'/'R': bias vector indexed by column (post_op_c_j)
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+		}
+		else // otherwise: one bias scalar per row, indexed by post_op_c_i and broadcast
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x32: // c = max(c, 0)
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x32:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) ); // scale factor read from op_args2
+
+		__mmask16 relu_cmp_mask; // not referenced directly here; presumably scratch for RELU_SCALE_OP_F32_AVX512
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x32:
+	{
+		__m512 dn, z, x, r2, r, x_tanh; // scratch registers handed to the macro
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x32:
+	{
+		__m512 x, r, x_erf; // scratch registers handed to the macro
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x32:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 ); // lower bound from op_args2
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 ); // upper bound from op_args3
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x32:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x32_DISABLE: // presumably reached once no post-ops remain; falls through to the store below
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed by CVT_STORE_F32_BF16_MASK.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of f32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+	}
+	else
+	{
+		// Store the f32 results directly into C.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+	}
+}
+
+// 1x32 bf16 fringe kernel: builds a 1-row x 32-column f32 tile of C from bf16 A and B, applies alpha/beta and post-ops, then stores it.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x32_DISABLE,
+						  &&POST_OPS_BIAS_1x32,
+						  &&POST_OPS_RELU_1x32,
+						  &&POST_OPS_RELU_SCALE_1x32,
+						  &&POST_OPS_GELU_TANH_1x32,
+						  &&POST_OPS_GELU_ERF_1x32,
+						  &&POST_OPS_CLIP_1x32,
+						  &&POST_OPS_DOWNSCALE_1x32
+						}; // label addresses (GCC labels-as-values); presumably indexed by the POST_OP_LABEL_* macros - TODO confirm
+	dim_t k_full_pieces = k0 / 2; // number of complete k pairs
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, otherwise 0
+
+	int16_t a_kfringe_buf = 0; // holds the single leftover A element when k0 is odd
+
+	// B matrix storage bfloat type (two 512-bit vectors cover the 32 columns)
+	__m512bh b0;
+	__m512bh b1;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C: c_float_<row>p<16-column group>.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+2]: one 32-bit load (a bf16 pair) replicated to every dword lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+	}
+	// Handle k remainder (k0 odd): one k element is left after the full pairs.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Load the single leftover element of row 0 and replicate it into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// dpbf16 still multiplies lane pairs, so both halves of each pair see the same a value;
+		// presumably packed B zero-pads the missing k row - TODO confirm. Accumulates into c[0,0-31].
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+	}
+
+	// Broadcast alpha and beta across all f32 lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). The *_BETA_OP macros are defined elsewhere; presumably they load C, scale by selector2 and add - TODO confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// Taken when a downscale buffer is set and this is the first k block.
+			// c[0,0-15]
+			BF16_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			BF16_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+		}
+		else 
+		{
+			// c[0,0-15]
+			F32_F32_BETA_OP( c_float_0p0, 0, 0, 0, selector1, selector2 );
+
+			// c[0, 16-31]
+			F32_F32_BETA_OP( c_float_0p1, 0, 0, 1, selector1, selector2 );
+		}
+	}
+	// Post Ops: the JUMP macros are defined elsewhere; presumably they dispatch through post_ops_labels to the labels below.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x32:
+	{
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) ) // 'r'/'R': bias vector indexed by column (post_op_c_j)
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+		}
+		else // otherwise: a single bias scalar for the row, indexed by post_op_c_i and broadcast
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x32: // c = max(c, 0)
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x32:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) ); // scale factor read from op_args2
+
+		__mmask16 relu_cmp_mask; // not referenced directly here; presumably scratch for RELU_SCALE_OP_F32_AVX512
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x32:
+	{
+		__m512 dn, z, x, r2, r, x_tanh; // scratch registers handed to the macro
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x32:
+	{
+		__m512 x, r, x_erf; // scratch registers handed to the macro
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x32:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 ); // lower bound from op_args2
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 ); // upper bound from op_args3
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x32:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x32_DISABLE: // presumably reached once no post-ops remain; falls through to the store below
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed by CVT_STORE_F32_BF16_MASK.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of f32).
+		// c[0,0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+	}
+
+	else
+	{
+		// Store the f32 results directly into C.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+	}
+}
+
+// 5x48 bf16 kernel
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_5x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5x48_DISABLE,
+						  &&POST_OPS_BIAS_5x48,
+						  &&POST_OPS_RELU_5x48,
+						  &&POST_OPS_RELU_SCALE_5x48,
+						  &&POST_OPS_GELU_TANH_5x48,
+						  &&POST_OPS_GELU_ERF_5x48,
+						  &&POST_OPS_CLIP_5x48,
+						  &&POST_OPS_DOWNSCALE_5x48
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+	__m512 c_float_2p2 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+	__m512 c_float_3p1 = _mm512_setzero_ps();
+	__m512 c_float_3p2 = _mm512_setzero_ps();
+
+	__m512 c_float_4p0 = _mm512_setzero_ps();
+	__m512 c_float_4p1 = _mm512_setzero_ps();
+	__m512 c_float_4p2 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
+
+		// Broadcast a[4,kr:kr+2].
+		a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+		c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+		c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+		c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
+
+		c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+		c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
+		c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+
+				// c[4,0-15]
+				BF16_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2)
+
+				// c[4,16-31]
+				BF16_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2)
+
+				// c[4,32-47]
+				BF16_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+				// c[3,0-15]
+				F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+
+				// c[4,0-15]
+				F32_F32_BETA_OP(c_float_4p0,0,4,0,selector1,selector2)
+
+				// c[4,16-31]
+				F32_F32_BETA_OP(c_float_4p1,0,4,1,selector1,selector2)
+
+				// c[4,32-47]
+				F32_F32_BETA_OP(c_float_4p2,0,4,2,selector1,selector2)
+			}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x48:
+	{
+		__m512 selector3;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+			// c[4, 16-31]
+			c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
+
+			// c[4,32-47]
+			c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+			__m512 selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 3 ) );
+			__m512 selector5 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 4 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+			// c[4, 16-31]
+			c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
+
+			// c[4,32-47]
+			c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x48:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[2,32-47]
+		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[3,16-31]
+		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+		// c[3,32-47]
+		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
+
+		// c[4,0-15]
+		c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+		// c[4,16-31]
+		c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
+
+		// c[4,32-47]
+		c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x48:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p1)
+
+		// c[4, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_4p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x48:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_F32_AVX512(c_float_3p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_F32_AVX512(c_float_4p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 32-47]
+		GELU_TANH_F32_AVX512(c_float_4p2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x48:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_F32_AVX512(c_float_3p2, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_F32_AVX512(c_float_4p1, r, x, x_erf)
+
+		// c[4, 32-47]
+		GELU_ERF_F32_AVX512(c_float_4p2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x48:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_F32_AVX512(c_float_2p2, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_F32_AVX512(c_float_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_F32_AVX512(c_float_3p2, min, max)
+
+		// c[4, 0-15]
+		CLIP_F32_AVX512(c_float_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_F32_AVX512(c_float_4p1, min, max)
+
+		// c[4, 32-47]
+		CLIP_F32_AVX512(c_float_4p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x48:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		MULRND_F32(c_float_3p2,3,2);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		MULRND_F32(c_float_4p1,4,1);
+
+		// c[4, 32-47]
+		MULRND_F32(c_float_4p2,4,2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x48_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+		
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+		// c[3, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2);
+
+		// c[4, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1);
+
+		// c[4, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2);
+	}
+
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
+
+		// c[4,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 0*16 ), c_float_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 1*16 ), c_float_4p1 );
+
+		// c[4,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 4 ) + ( 2*16 ), c_float_4p2 );
+	}
+}
+
+// 4x48 bf16 kernel: accumulates a 4-row x 48-column f32 tile (3 zmm vectors per row) from bf16 a and b, then applies alpha, beta, post-ops and the final store.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_4x48)
+{
+	static void* post_ops_labels[] = // Addresses of the post-op labels defined below; not referenced directly here, presumably indexed by the POST_OP_LABEL_* macros - confirm.
+						{
+						  &&POST_OPS_4x48_DISABLE,
+						  &&POST_OPS_BIAS_4x48,
+						  &&POST_OPS_RELU_4x48,
+						  &&POST_OPS_RELU_SCALE_4x48,
+						  &&POST_OPS_GELU_TANH_4x48,
+						  &&POST_OPS_GELU_ERF_4x48,
+						  &&POST_OPS_CLIP_4x48,
+						  &&POST_OPS_DOWNSCALE_4x48
+						};
+	dim_t k_full_pieces = k0 / 2; // Number of complete k pairs; the main loop consumes one pair per iteration.
+	dim_t k_partial_pieces = k0 % 2; // Non-zero when a single k element is left over.
+
+	int16_t a_kfringe_buf = 0; // Staging slot for the lone a element used by the k remainder.
+
+	// B matrix storage bfloat type (b0/b1/b2 feed the *p0/*p1/*p2 accumulators)
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+
+	// A matrix storage bfloat type (one broadcast register, reused for each row in turn)
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C: c_float_<row>p<vec>, 4 rows x 3 vectors, all starting at zero.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+	__m512 c_float_2p2 = _mm512_setzero_ps();
+
+	__m512 c_float_3p0 = _mm512_setzero_ps();
+	__m512 c_float_3p1 = _mm512_setzero_ps();
+	__m512 c_float_3p2 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2]: the bf16 pair is read as one 32-bit word and replicated to every 32-bit lane.
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+
+		// Broadcast a[3,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
+	}
+	// Handle k remainder (k0 odd): one more step using the b data at row index k_full_pieces.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast the single trailing a element of row 0 (called k_last below) to every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Same dpbf16 step as the main loop. NOTE(review): a is duplicated into both bf16 halves of each lane, so this presumably relies on the unused second k entry of b being zero - confirm against the B packing.
+		// c[0,0-47] += a[0,k_last]*b[k_last,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast the single trailing a element of row 1 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Same dpbf16 step as for row 0.
+		// c[1,0-47] += a[1,k_last]*b[k_last,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+		// Broadcast the single trailing a element of row 2 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Same dpbf16 step as for row 0.
+		// c[2,0-47] += a[2,k_last]*b[k_last,0-47]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+
+		// Broadcast the single trailing a element of row 3 to every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Same dpbf16 step as for row 0.
+		// c[3,0-47] += a[3,k_last]*b[k_last,0-47]
+		c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+		c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+		c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
+	}
+
+	// Load alpha and beta (each broadcast to all 16 float lanes)
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha is exactly 1)
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+
+		c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+		c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+		c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
+	}
+
+	// Scale C by beta. The beta-op macros are skipped entirely when beta is 0.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// Downscale buffer set and first k block: BF16_F32_BETA_OP variant (presumably reads the existing C values as bf16 - confirm against the macro). c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				BF16_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				BF16_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+			}
+			else
+			{
+				// Otherwise: F32_F32_BETA_OP variant (presumably reads the existing C values as f32 - confirm against the macro). c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+
+				// c[3,0-15]
+				F32_F32_BETA_OP(c_float_3p0,0,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				F32_F32_BETA_OP(c_float_3p1,0,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				F32_F32_BETA_OP(c_float_3p2,0,3,2,selector1,selector2)
+			}
+	}
+	// Post Ops: post_ops_list_temp is the cursor the blocks below read their arguments from; the POST_OP_LABEL_* macros presumably advance it and jump via post_ops_labels - confirm against their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x48:
+	{
+		__m512 selector3;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// op_args2 is 'r'/'R': bias is per column (selector1/2/3 = columns 0-15/16-31/32-47 starting at post_op_c_j) and is added to every row. c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+			__m512 selector4 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 3 ) );
+
+			// Otherwise bias is per row (selector1..4 = op_args1[post_op_c_i + 0..3], each broadcast across the row). c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+			// c[3, 16-31]
+			c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x48:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// Clamp negatives to zero: max( 0, x ) per lane. c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[2,32-47]
+		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+		// c[3,0-15]
+		c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+		// c[3,16-31]
+		c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+		// c[3,32-47]
+		c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x48:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// selector1 = 0 and selector2 = scale from op_args2; the macro takes only the accumulator, so it presumably picks up selector1/selector2/relu_cmp_mask by name - confirm. c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_3p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x48:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// r, r2, x, z, dn, x_tanh and q are scratch registers handed to the macro. c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_F32_AVX512(c_float_3p2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x48:
+	{
+		__m512 x, r, x_erf;
+
+		// r, x and x_erf are scratch registers handed to the macro. c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_F32_AVX512(c_float_3p2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x48:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+		// min/max are broadcast from op_args2/op_args3 and passed to the clip macro as its bounds. c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_F32_AVX512(c_float_2p2, min, max)
+
+		// c[3, 0-15]
+		CLIP_F32_AVX512(c_float_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_F32_AVX512(c_float_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_F32_AVX512(c_float_3p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x48:
+	{
+		// MULRND_F32( accumulator, row, 16-column group ). NOTE(review): the scale source and rounding are defined by the macro - confirm there. c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		MULRND_F32(c_float_3p2,3,2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x48_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float). mask_all1 is not named in the calls below, so the macro presumably uses it directly - confirm.
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+		// c[3, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2);
+	}
+
+	else
+	{
+		// Store the results as f32 to c: row r starts at c + rs_c * r, three unaligned 16-float stores per row.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
+
+		// c[3,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 0*16 ), c_float_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 1*16 ), c_float_3p1 );
+
+	// c[3,32-47]
+	_mm512_storeu_ps( c + ( rs_c * 3 ) + ( 2*16 ), c_float_3p2 );
+	}
+}
+
+// 3x48 bf16 kernel: accumulates a 3-row by 48-column tile of C in f32 (three 16-lane __m512 registers per row) from bf16 a/b operands, applies alpha/beta and the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_3x48)
+{
+	static void* post_ops_labels[] = // Label addresses (GNU labels-as-values); presumably indexed inside the POST_OP_LABEL_* jump macros, which are not visible here - confirm.
+						{
+						  &&POST_OPS_3x48_DISABLE,
+						  &&POST_OPS_BIAS_3x48,
+						  &&POST_OPS_RELU_3x48,
+						  &&POST_OPS_RELU_SCALE_3x48,
+						  &&POST_OPS_GELU_TANH_3x48,
+						  &&POST_OPS_GELU_ERF_3x48,
+						  &&POST_OPS_CLIP_3x48,
+						  &&POST_OPS_DOWNSCALE_3x48
+						};
+	dim_t k_full_pieces = k0 / 2; // Number of complete bf16 pairs along k; each dpbf16 step consumes two k values.
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, i.e. one unpaired k element remains.
+
+	int16_t a_kfringe_buf = 0; // Holds the single 16-bit a element read in the odd-k remainder.
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+
+	__m512 c_float_2p0 = _mm512_setzero_ps();
+	__m512 c_float_2p1 = _mm512_setzero_ps();
+	__m512 c_float_2p2 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+		// Broadcast a[2,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+	}
+	// Handle k remainder (k0 odd): one k element is left over after the paired loop above.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast the single remaining 16-bit element of a row 0 into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the odd-k contribution. NOTE(review): both halves of each a pair hold the same value, so this presumably relies on the second element of each b pair being zero padding - confirm against the B packing routine.
+		// c[0,0-47] += a[0,last k]*b[last k,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast the single remaining 16-bit element of a row 1 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the odd-k contribution (same zero-padding assumption on b as for row 0).
+		// c[1,0-47] += a[1,last k]*b[last k,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+		// Broadcast the single remaining 16-bit element of a row 2 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the odd-k contribution (same zero-padding assumption on b as for row 0).
+		// c[2,0-47] += a[2,last k]*b[last k,0-47]
+		c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+		c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+		c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+	}
+
+	// Broadcast alpha and beta to all 16 f32 lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+
+		c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+		c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+		c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+	}
+
+	// Scale C by beta; skipped when beta == 0. The *_BETA_OP macros are defined elsewhere and presumably fold beta * existing C into each accumulator - confirm. The BF16 variant is chosen when a downscale buffer is set and this is the first k block.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				BF16_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				BF16_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				F32_F32_BETA_OP(c_float_2p0,0,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				F32_F32_BETA_OP(c_float_2p1,0,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				F32_F32_BETA_OP(c_float_2p2,0,2,2,selector1,selector2)
+			}
+	}
+	// Post Ops: start from the head of the post-op list. Each POST_OPS_* section below ends with a jump macro that presumably advances post_ops_list_temp and dispatches through post_ops_labels (macros not visible here - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x48:
+	{
+		__m512 selector3;
+		// op_args2 holds a char: 'r'/'R' adds a per-column bias vector to every row, anything else adds one scalar bias per row.
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+			selector3 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 2 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+			// c[2, 16-31]
+			c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x48:
+	{
+		selector1 = _mm512_setzero_ps(); // ReLU: every register below becomes max(0, c).
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		// c[2,0-15]
+		c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+		// c[2,16-31]
+		c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+		// c[2,32-47]
+		c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x48:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+		// selector2 broadcasts the float stored at op_args2. RELU_SCALE_OP_F32_AVX512 presumably picks up selector1/selector2/relu_cmp_mask by name (macro not visible here - confirm).
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x48:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+		// The locals above are scratch registers passed to every GELU_TANH_F32_AVX512 invocation below.
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x48:
+	{
+		__m512 x, r, x_erf;
+		// The locals above are scratch registers passed to every GELU_ERF_F32_AVX512 invocation below.
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x48:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+		// Clip bounds: min is broadcast from op_args2 and max from op_args3.
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_F32_AVX512(c_float_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_F32_AVX512(c_float_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_F32_AVX512(c_float_2p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x48:
+	{ // MULRND_F32(reg, row, column block) is defined elsewhere; presumably it applies the downscale multiply/round to one register - confirm.
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x48_DISABLE:
+	; // Empty statement after the label (the first entry of post_ops_labels); the store logic follows.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 holds in every lane, so the compare yields 0xFFFF. Presumably consumed by name inside CVT_STORE_F32_BF16_MASK - confirm.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+	}
+
+	else
+	{
+		// Store the results as f32 directly into c (row stride rs_c, 16 floats per store).
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 0*16 ), c_float_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 1*16 ), c_float_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 2 ) + ( 2*16 ), c_float_2p2 );
+	}
+}
+
+// 2x48 bf16 kernel: accumulates a 2-row by 48-column tile of C in f32 (three 16-lane __m512 registers per row) from bf16 a/b operands, applies alpha/beta and the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_2x48)
+{
+	static void* post_ops_labels[] = // Label addresses (GNU labels-as-values); presumably indexed inside the POST_OP_LABEL_* jump macros, which are not visible here - confirm.
+						{
+						  &&POST_OPS_2x48_DISABLE,
+						  &&POST_OPS_BIAS_2x48,
+						  &&POST_OPS_RELU_2x48,
+						  &&POST_OPS_RELU_SCALE_2x48,
+						  &&POST_OPS_GELU_TANH_2x48,
+						  &&POST_OPS_GELU_ERF_2x48,
+						  &&POST_OPS_CLIP_2x48,
+						  &&POST_OPS_DOWNSCALE_2x48
+						};
+	dim_t k_full_pieces = k0 / 2; // Number of complete bf16 pairs along k; each dpbf16 step consumes two k values.
+	dim_t k_partial_pieces = k0 % 2; // 1 when k0 is odd, i.e. one unpaired k element remains.
+
+	int16_t a_kfringe_buf = 0; // Holds the single 16-bit a element read in the odd-k remainder.
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+
+	__m512 c_float_1p0 = _mm512_setzero_ps();
+	__m512 c_float_1p1 = _mm512_setzero_ps();
+	__m512 c_float_1p2 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast a[1,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+	}
+	// Handle k remainder (k0 odd): one k element is left over after the paired loop above.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast the single remaining 16-bit element of a row 0 into every 16-bit lane.
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the odd-k contribution. NOTE(review): both halves of each a pair hold the same value, so this presumably relies on the second element of each b pair being zero padding - confirm against the B packing routine.
+		// c[0,0-47] += a[0,last k]*b[last k,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+		// Broadcast the single remaining 16-bit element of a row 1 into every 16-bit lane.
+		a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Accumulate the odd-k contribution (same zero-padding assumption on b as for row 0).
+		// c[1,0-47] += a[1,last k]*b[last k,0-47]
+		c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+		c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+		c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+	}
+
+	// Broadcast alpha and beta to all 16 f32 lanes.
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+
+		c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+		c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+		c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+	}
+
+	// Scale C by beta; skipped when beta == 0. The *_BETA_OP macros are defined elsewhere and presumably fold beta * existing C into each accumulator - confirm. The BF16 variant is chosen when a downscale buffer is set and this is the first k block.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				BF16_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				BF16_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0,0,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				F32_F32_BETA_OP(c_float_1p1,0,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				F32_F32_BETA_OP(c_float_1p2,0,1,2,selector1,selector2)
+			}
+	}
+	// Post Ops: start from the head of the post-op list. Each POST_OPS_* section below ends with a jump macro that presumably advances post_ops_list_temp and dispatches through post_ops_labels (macros not visible here - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x48:
+	{
+		__m512 selector3;
+		// op_args2 holds a char: 'r'/'R' adds a per-column bias vector to every row, anything else adds one scalar bias per row.
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 1 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+			// c[1, 16-31]
+			c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x48:
+	{
+		selector1 = _mm512_setzero_ps(); // ReLU: every register below becomes max(0, c).
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		// c[1,0-15]
+		c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+		// c[1,16-31]
+		c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+		// c[1,32-47]
+		c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x48:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+		// selector2 broadcasts the float stored at op_args2. RELU_SCALE_OP_F32_AVX512 presumably picks up selector1/selector2/relu_cmp_mask by name (macro not visible here - confirm).
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x48:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+		// The locals above are scratch registers passed to every GELU_TANH_F32_AVX512 invocation below.
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x48:
+	{
+		__m512 x, r, x_erf;
+		// The locals above are scratch registers passed to every GELU_ERF_F32_AVX512 invocation below.
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x48:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+		// Clip bounds: min is broadcast from op_args2 and max from op_args3.
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_F32_AVX512(c_float_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_F32_AVX512(c_float_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_F32_AVX512(c_float_1p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x48:
+	{ // MULRND_F32(reg, row, column block) is defined elsewhere; presumably it applies the downscale multiply/round to one register - confirm.
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x48_DISABLE:
+	; // Empty statement after the label (the first entry of post_ops_labels); the store logic follows.
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 holds in every lane, so the compare yields 0xFFFF. Presumably consumed by name inside CVT_STORE_F32_BF16_MASK - confirm.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+	}
+
+	else
+	{
+		// Store the results as f32 directly into c (row stride rs_c, 16 floats per store).
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 0*16 ), c_float_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 1*16 ), c_float_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 1 ) + ( 2*16 ), c_float_1p2 );
+	}
+}
+
+// 1x48 bf16 kernel
+LPGEMM_MN_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_1x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x48_DISABLE,
+						  &&POST_OPS_BIAS_1x48,
+						  &&POST_OPS_RELU_1x48,
+						  &&POST_OPS_RELU_SCALE_1x48,
+						  &&POST_OPS_GELU_TANH_1x48,
+						  &&POST_OPS_GELU_ERF_1x48,
+						  &&POST_OPS_CLIP_1x48,
+						  &&POST_OPS_DOWNSCALE_1x48
+						};
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Registers to use for accumulating C.
+	__m512 c_float_0p0 = _mm512_setzero_ps();
+	__m512 c_float_0p1 = _mm512_setzero_ps();
+	__m512 c_float_0p2 = _mm512_setzero_ps();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_bf16_0 = (__m512bh)_mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+2].
+		a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+		a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 2.
+		// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+		c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+		c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+		c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+	}
+
+	// Load alpha and beta
+	__m512 selector1 = _mm512_set1_ps( alpha );
+	__m512 selector2 = _mm512_set1_ps( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+		c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+		c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,0,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,0,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,0,0,2,selector1,selector2)
+			}
+	}
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x48:
+	{
+		__m512 selector3;
+
+		if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+			 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+		{
+			selector1 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			selector3 =
+				_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+		}
+		else
+		{
+			selector1 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+							+ post_ops_attr.post_op_c_i + 0 ) );
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+		}
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x48:
+	{
+		selector1 = _mm512_setzero_ps();
+
+		// c[0,0-15]
+		c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+		// c[0, 16-31]
+		c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+		// c[0,32-47]
+		c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x48:
+	{
+		selector1 = _mm512_setzero_ps();
+		selector2 =
+			_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x48:
+	{
+		__m512 dn, z, x, r2, r, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x48:
+	{
+		__m512 x, r, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x48:
+	{
+		__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+		__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+		
+		// c[0, 0-15]
+		CLIP_F32_AVX512(c_float_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_F32_AVX512(c_float_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_F32_AVX512(c_float_0p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x48:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x48_DISABLE:
+	;
+	if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		__m512i selector_a = _mm512_setzero_epi32();
+		__m512i selector_b = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+		// Store the results in downscaled type (bf16 instead of float).
+
+		// c[0, 0-15]
+		CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+	}
+
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 0*16 ), c_float_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 1*16 ), c_float_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_ps( c + ( rs_c * 0 ) + ( 2*16 ), c_float_0p2 );
+	}
+}
+#endif
+#endif
diff --git a/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c
new file mode 100644
index 0000000000..c95c0090ae
--- /dev/null
+++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_n_fringe_bf16_amd512vnni.c
@@ -0,0 +1,3461 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_f32_kern_macros.h"
+
+#ifndef LPGEMM_BF16_NOT_SUPPORTED
+// 6xlt16 bf16 fringe kernel: walks C in 6-row (MR) tiles with n0_rem valid columns per row (masked C access).
+LPGEMM_N_LT_NR0_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6xLT16_DISABLE,
+						  &&POST_OPS_BIAS_6xLT16,
+						  &&POST_OPS_RELU_6xLT16,
+						  &&POST_OPS_RELU_SCALE_6xLT16,
+						  &&POST_OPS_GELU_TANH_6xLT16,
+						  &&POST_OPS_GELU_ERF_6xLT16,
+						  &&POST_OPS_CLIP_6xLT16,
+						  &&POST_OPS_DOWNSCALE_6xLT16
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 2; // k is consumed two bf16 elements at a time.
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+    // B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	// Scratch buffer for the row-wise ('r'/'R') bias post-op; only n0_rem floats get copied in.
+	float buf0[16];
+
+	dim_t value; // Number of final k iterations executed after the C prefetches (40 or 0).
+
+	if(k_full_pieces > 40)
+	{
+		value = 40;
+	}
+	else
+	{
+		value = 0;
+	}
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Accumulators for C: one ZMM (16 floats) per row of the 6-row tile.
+		__m512 c_float_0p0 = _mm512_setzero_ps();
+
+		__m512 c_float_1p0 = _mm512_setzero_ps();
+
+		__m512 c_float_2p0 = _mm512_setzero_ps();
+
+		__m512 c_float_3p0 = _mm512_setzero_ps();
+
+		__m512 c_float_4p0 = _mm512_setzero_ps();
+
+		__m512 c_float_5p0 = _mm512_setzero_ps();
+
+		for ( dim_t kr = 0; kr < k_full_pieces - value; kr += 1 )
+		{
+			// Load 2 rows with 16 extended elements each from B to 1 ZMM
+			// registers. It is to be noted that the B matrix is packed for use
+			// in bf16 instructions and each load to ZMM register will have 2
+			// elements along k direction and 16 elements across n directions,
+			// so 2x16 elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+		}
+		// Prefetch the 6 C rows of this tile, then run the remaining `value` k iterations.
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (0 * 16), _MM_HINT_T1);
+
+		for (dim_t kr = k_full_pieces - value; kr < k_full_pieces; kr += 1)
+		{
+			// Load 2 rows with 16 extended elements each from B to 1 ZMM
+			// registers. It is to be noted that the B matrix is packed for use
+			// in bf16 instructions and each load to ZMM register will have 2
+			// elements along k direction and 16 elements across n directions,
+			// so 2x16 elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 0));
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 0) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_0p0 = _mm512_dpbf16_ps(c_float_0p0, a_bf16_0, b0);
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 1) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_1p0 = _mm512_dpbf16_ps(c_float_1p0, a_bf16_0, b0);
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 2) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_2p0 = _mm512_dpbf16_ps(c_float_2p0, a_bf16_0, b0);
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 3) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_3p0 = _mm512_dpbf16_ps(c_float_3p0, a_bf16_0, b0);
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 4) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_4p0 = _mm512_dpbf16_ps(c_float_4p0, a_bf16_0, b0);
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 5) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_5p0 = _mm512_dpbf16_ps(c_float_5p0, a_bf16_0, b0);
+		}
+
+		// Handle k remainder (odd k0): one extra step using a single 16-bit element per row of A.
+		if ( k_partial_pieces > 0 )
+		{
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast the lone 16-bit element of row 0 (offset cs_a * k_full_pieces) to all 16-bit lanes.
+			a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Remainder step for row 0. NOTE(review): the element fills both halves of
+			// every pair, so the odd half of the packed B pair presumably is zero - confirm.
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+			// Broadcast the lone 16-bit element of row 1.
+			a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Remainder step for row 1 (same caveat as row 0).
+			// c[1,0-15] = dpbf16( c[1,0-15], a_bf16_0, b0 )
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+			// Broadcast the lone 16-bit element of row 2.
+			a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Remainder step for row 2 (same caveat as row 0).
+			// c[2,0-15] = dpbf16( c[2,0-15], a_bf16_0, b0 )
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+			// Broadcast the lone 16-bit element of row 3.
+			a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Remainder step for row 3 (same caveat as row 0).
+			// c[3,0-15] = dpbf16( c[3,0-15], a_bf16_0, b0 )
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+			// Broadcast the lone 16-bit element of row 4.
+			a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Remainder step for row 4 (same caveat as row 0).
+			// c[4,0-15] = dpbf16( c[4,0-15], a_bf16_0, b0 )
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+
+			// Broadcast the lone 16-bit element of row 5.
+			a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Remainder step for row 5 (same caveat as row 0).
+			// c[5,0-15] = dpbf16( c[5,0-15], a_bf16_0, b0 )
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+		}
+
+		// Broadcast alpha and beta across all 16 float lanes.
+		__m512 selector1 = _mm512_set1_ps( alpha );
+		__m512 selector2 = _mm512_set1_ps( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+			c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+			c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+
+			c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+
+			c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+
+			c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
+		}
+
+		// Scale C by beta. The whole step is skipped when beta == 0.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+				// NOTE(review): rows are passed as 0-5 without ir here, unlike the F32 path below - presumably the macro adds post_ops_attr.post_op_c_i; confirm.
+				// c[0,0-15]
+				BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_3p0, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_4p0, 4, 0, \
+								selector1, selector2 );
+
+				// c[5,0-15]
+				BF16_F32_BETA_OP_NLT16F_MASK( load_mask, c_float_5p0, 5, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_0p0, ir, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_1p0, ir, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_2p0, ir, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_3p0, ir, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_4p0, ir, 4, 0, \
+								selector1, selector2);
+
+				// c[5,0-15]
+				F32_F32_BETA_OP_NLT16F_MASK(load_mask, c_float_5p0, ir, 5, 0, \
+								selector1, selector2);
+			}
+		}
+		// Post Ops: the jump macros presumably dispatch through post_ops_labels (label addresses) - see their definition.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6xLT16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				memcpy( buf0, ( ( float* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j ), ( n0_rem * sizeof( float ) ) );
+				selector1 = _mm512_loadu_ps( buf0 ); // Lanes >= n0_rem carry stale buf0 contents; the masked f32 store below ignores them.
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				__m512 selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+				__m512 selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 3 ) );
+				__m512 selector5 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 4 ) );
+				__m512 selector6 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 5 ) );
+				// One bias value per row, read from op_args1[post_op_c_i + row] and broadcast.
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+			// c[5,0-15]
+			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6xLT16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6xLT16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+		    __m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_F32_AVX512(c_float_5p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6xLT16:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_F32_AVX512(c_float_5p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6xLT16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+			
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_F32_AVX512(c_float_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_F32_AVX512(c_float_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6xLT16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[5, 0-15]
+		MULRND_F32(c_float_5p0,5,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6xLT16_DISABLE:
+		;
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (bf16 instead of float).
+			// c[0,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0);
+		}
+
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			
+			// Store the results; only the low n0_rem lanes of each row are written.
+			// c[0,0-15]
+			_mm512_mask_storeu_ps( c + ( rs_c * ( ir + 0 ) ), load_mask, c_float_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_ps( c + ( rs_c * ( ir + 1 ) ), load_mask, c_float_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_ps( c + ( rs_c * ( ir + 2 ) ), load_mask, c_float_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_ps( c + ( rs_c * ( ir + 3 ) ), load_mask, c_float_3p0 );
+
+			// c[4,0-15]
+			_mm512_mask_storeu_ps( c + ( rs_c * ( ir + 4 ) ), load_mask, c_float_4p0 );
+
+			// c[5,0-15]
+			_mm512_mask_storeu_ps( c + ( rs_c * ( ir + 5 ) ), load_mask, c_float_5p0 );
+
+		}
+		// Next tile: advance a by MR * ps_a and the post-op row index by MR.
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+	// Leftover m0 % MR rows are forwarded to the matching 5/4/3/2/1-row lt16 kernel.
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 ); // cs_a of 2 is passed through, otherwise rescaled from 6 rows to 5.
+			lpgemm_rowvar_bf16bf16f32of32_5xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_bf16bf16f32of32_4xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_bf16bf16f32of32_3xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_bf16bf16f32of32_2xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_bf16bf16f32of32_1xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x16 bf16 fringe kernel
+LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x16_DISABLE,
+						  &&POST_OPS_BIAS_6x16,
+						  &&POST_OPS_RELU_6x16,
+						  &&POST_OPS_RELU_SCALE_6x16,
+						  &&POST_OPS_GELU_TANH_6x16,
+						  &&POST_OPS_GELU_ERF_6x16,
+						  &&POST_OPS_CLIP_6x16,
+						  &&POST_OPS_DOWNSCALE_6x16
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	dim_t value;
+
+	if(k_full_pieces > 40)
+	{
+		value = 40;
+	}
+	else
+	{
+		value = 0;
+	}
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512 c_float_0p0 = _mm512_setzero_ps();
+
+		__m512 c_float_1p0 = _mm512_setzero_ps();
+
+		__m512 c_float_2p0 = _mm512_setzero_ps();
+
+		__m512 c_float_3p0 = _mm512_setzero_ps();
+
+		__m512 c_float_4p0 = _mm512_setzero_ps();
+
+		__m512 c_float_5p0 = _mm512_setzero_ps();
+
+		for ( dim_t kr = 0; kr < k_full_pieces - value; kr += 1 )
+		{
+			// Load 2 rows with 16 elements each from B to 1 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in bf16
+			// instructions and each load to ZMM register will have 2 elements
+			// along k direction and 16 elements across n directions, so 2x16
+			// elements to a ZMM register.
+		    b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+		}
+
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (0 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (0 * 16), _MM_HINT_T1);
+
+		for (dim_t kr = k_full_pieces - value; kr < k_full_pieces; kr += 1)
+		{
+			// Load 2 rows with 16 elements each from B to 1 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in bf16
+			// instructions and each load to ZMM register will have 2 elements
+			// along k direction and 16 elements across n directions, so 2x16
+			// elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 0));
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 0) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_0p0 = _mm512_dpbf16_ps(c_float_0p0, a_bf16_0, b0);
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 1) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_1p0 = _mm512_dpbf16_ps(c_float_1p0, a_bf16_0, b0);
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 2) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_2p0 = _mm512_dpbf16_ps(c_float_2p0, a_bf16_0, b0);
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 3) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_3p0 = _mm512_dpbf16_ps(c_float_3p0, a_bf16_0, b0);
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 4) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_4p0 = _mm512_dpbf16_ps(c_float_4p0, a_bf16_0, b0);
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 5) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_5p0 = _mm512_dpbf16_ps(c_float_5p0, a_bf16_0, b0);
+		}
+
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-15] = a[0,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-15] = a[1,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-15] = a[2,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-15] = a[3,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-15] = a[4,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-15] = a[5,kr:kr+2]*b[kr:kr+2,0-15]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+		}
+
+		// Load alpha and beta
+		__m512 selector1 = _mm512_set1_ps( alpha );
+		__m512 selector2 = _mm512_set1_ps( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+
+			c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+
+			c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+
+			c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+
+			c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+
+			c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+
+				// c[0,0-15]
+				BF16_F32_BETA_OP( c_float_0p0, ir, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP( c_float_1p0, ir, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP( c_float_2p0, ir, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP( c_float_3p0, ir, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				BF16_F32_BETA_OP( c_float_4p0, ir, 4, 0, \
+								selector1, selector2 );
+
+				// c[5,0-15]
+				BF16_F32_BETA_OP( c_float_5p0, ir, 5, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0, ir, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0, ir, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				F32_F32_BETA_OP(c_float_2p0, ir, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				F32_F32_BETA_OP(c_float_3p0, ir, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				F32_F32_BETA_OP(c_float_4p0, ir, 4, 0, \
+								selector1, selector2);
+
+				// c[5,0-15]
+				F32_F32_BETA_OP(c_float_5p0, ir, 5, 0, \
+								selector1, selector2);
+			}
+		}
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x16:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				selector1 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				__m512 selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+				__m512 selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 3 ) );
+				__m512 selector5 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 4 ) );
+				__m512 selector6 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 5 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x16:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+			// c[5,0-15]
+			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x16:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x16:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_F32_AVX512(c_float_5p0, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x16:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_F32_AVX512(c_float_5p0, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x16:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_F32_AVX512(c_float_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_F32_AVX512(c_float_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x16:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[5, 0-15]
+		MULRND_F32(c_float_5p0,5,0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x16_DISABLE:
+		;
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			__m512i selector_a = _mm512_setzero_epi32();
+			__m512i selector_b = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+			// Store the results in downscaled type (bf16 instead of float32).
+			// c[0,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0);
+		}
+
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
+
+			// c[1,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
+
+			// c[2,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
+
+			// c[3,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
+
+			// c[4,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
+
+			// c[5,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_bf16bf16f32of32_5x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_bf16bf16f32of32_4x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			int cs_a_use = ( cs_a == 2) ? 2 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_bf16bf16f32of32_3x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_bf16bf16f32of32_2x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_bf16bf16f32of32_1x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x32 bf16 fringe kernel
+LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x32_DISABLE,
+						  &&POST_OPS_BIAS_6x32,
+						  &&POST_OPS_RELU_6x32,
+						  &&POST_OPS_RELU_SCALE_6x32,
+						  &&POST_OPS_GELU_TANH_6x32,
+						  &&POST_OPS_GELU_ERF_6x32,
+						  &&POST_OPS_CLIP_6x32,
+						  &&POST_OPS_DOWNSCALE_6x32
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	dim_t value;
+
+	if(k_full_pieces > 40)
+	{
+		value = 40;
+	}
+	else
+	{
+		value = 0;
+	}
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512 c_float_0p0 = _mm512_setzero_ps();
+		__m512 c_float_0p1 = _mm512_setzero_ps();
+
+		__m512 c_float_1p0 = _mm512_setzero_ps();
+		__m512 c_float_1p1 = _mm512_setzero_ps();
+
+		__m512 c_float_2p0 = _mm512_setzero_ps();
+		__m512 c_float_2p1 = _mm512_setzero_ps();
+
+		__m512 c_float_3p0 = _mm512_setzero_ps();
+		__m512 c_float_3p1 = _mm512_setzero_ps();
+
+		__m512 c_float_4p0 = _mm512_setzero_ps();
+		__m512 c_float_4p1 = _mm512_setzero_ps();
+
+		__m512 c_float_5p0 = _mm512_setzero_ps();
+		__m512 c_float_5p1 = _mm512_setzero_ps();
+
+		for ( dim_t kr = 0; kr < k_full_pieces - value; kr += 1 )
+		{
+			// Load 2 rows with 32 elements each from B to 2 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in bf16
+			// instructions and each load to ZMM register will have 2 elements
+			// along k direction and 32 elements across n directions, so 2x16
+			// elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-31] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
+		}
+
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (1 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (1 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (1 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (1 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (1 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (1 * 16), _MM_HINT_T1);
+
+		for (dim_t kr = k_full_pieces - value; kr < k_full_pieces; kr += 1)
+		{
+			// Load 2 rows with 32 elements each from B to 2 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in bf16
+			// instructions and each load to ZMM register will have 2 elements
+			// along k direction and 32 elements across n directions, so 2x16
+			// elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 0));
+			b1 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 1));
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 0) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_0p0 = _mm512_dpbf16_ps(c_float_0p0, a_bf16_0, b0);
+			c_float_0p1 = _mm512_dpbf16_ps(c_float_0p1, a_bf16_0, b1);
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 1) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_1p0 = _mm512_dpbf16_ps(c_float_1p0, a_bf16_0, b0);
+			c_float_1p1 = _mm512_dpbf16_ps(c_float_1p1, a_bf16_0, b1);
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 2) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_2p0 = _mm512_dpbf16_ps(c_float_2p0, a_bf16_0, b0);
+			c_float_2p1 = _mm512_dpbf16_ps(c_float_2p1, a_bf16_0, b1);
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 3) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_3p0 = _mm512_dpbf16_ps(c_float_3p0, a_bf16_0, b0);
+			c_float_3p1 = _mm512_dpbf16_ps(c_float_3p1, a_bf16_0, b1);
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 4) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_4p0 = _mm512_dpbf16_ps(c_float_4p0, a_bf16_0, b0);
+			c_float_4p1 = _mm512_dpbf16_ps(c_float_4p1, a_bf16_0, b1);
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 5) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-31] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_5p0 = _mm512_dpbf16_ps(c_float_5p0, a_bf16_0, b0);
+			c_float_5p1 = _mm512_dpbf16_ps(c_float_5p1, a_bf16_0, b1);
+		}
+
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-31] = a[0,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-31] = a[1,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-31] = a[2,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-31] = a[3,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-31] = a[4,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-31] = a[5,kr:kr+2]*b[kr:kr+2,0-31]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
+		}
+		// Load alpha and beta
+		__m512 selector1 = _mm512_set1_ps( alpha );
+		__m512 selector2 = _mm512_set1_ps( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+			c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+
+			c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+			c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+
+			c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+			c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+
+			c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+			c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+
+			c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+			c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
+
+			c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
+			c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+
+				// c[0,0-15]
+				BF16_F32_BETA_OP( c_float_0p0, ir, 0, 0, selector1, selector2 );
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP( c_float_0p1, ir, 0, 1, selector1, selector2 );
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP( c_float_1p0, ir, 1, 0, selector1, selector2 );
+
+				// c[1, 16-31]
+				BF16_F32_BETA_OP( c_float_1p1, ir, 1, 1, selector1, selector2 );
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP( c_float_2p0, ir, 2, 0, selector1, selector2 );
+
+				// c[2, 16-31]
+				BF16_F32_BETA_OP( c_float_2p1, ir, 2, 1, selector1, selector2 );
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP( c_float_3p0, ir, 3, 0, selector1, selector2 );
+
+				// c[3, 16-31]
+				BF16_F32_BETA_OP( c_float_3p1, ir, 3, 1, selector1, selector2 );
+
+				// c[4,0-15]
+				BF16_F32_BETA_OP( c_float_4p0, ir, 4, 0, selector1, selector2 );
+
+				// c[4, 16-31]
+				BF16_F32_BETA_OP( c_float_4p1, ir, 4, 1, selector1, selector2 );
+
+				// c[5,0-15]
+				BF16_F32_BETA_OP( c_float_5p0, ir, 5, 0, selector1, selector2 );
+
+				// c[5, 16-31]
+				BF16_F32_BETA_OP( c_float_5p1, ir, 5, 1, selector1, selector2 );
+			}
+			else 
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP( c_float_0p0, ir, 0, 0, selector1, selector2 );
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP( c_float_0p1, ir, 0, 1, selector1, selector2 );
+
+				// c[1,0-15]
+				F32_F32_BETA_OP( c_float_1p0, ir, 1, 0, selector1, selector2 );
+
+				// c[1, 16-31]
+				F32_F32_BETA_OP( c_float_1p1, ir, 1, 1, selector1, selector2 );
+
+				// c[2,0-15]
+				F32_F32_BETA_OP( c_float_2p0, ir, 2, 0, selector1, selector2 );
+
+				// c[2, 16-31]
+				F32_F32_BETA_OP( c_float_2p1, ir, 2, 1, selector1, selector2 );
+
+				// c[3,0-15]
+				F32_F32_BETA_OP( c_float_3p0, ir, 3, 0, selector1, selector2 );
+
+				// c[3, 16-31]
+				F32_F32_BETA_OP( c_float_3p1, ir, 3, 1, selector1, selector2 );
+
+				// c[4,0-15]
+				F32_F32_BETA_OP( c_float_4p0, ir, 4, 0, selector1, selector2 );
+
+				// c[4, 16-31]
+				F32_F32_BETA_OP( c_float_4p1, ir, 4, 1, selector1, selector2 );
+
+				// c[5,0-15]
+				F32_F32_BETA_OP( c_float_5p0, ir, 5, 0, selector1, selector2 );
+
+				// c[5, 16-31]
+				F32_F32_BETA_OP( c_float_5p1, ir, 5, 1, selector1, selector2 );
+			}
+			
+		}
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x32:
+		{
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				selector1 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+				selector2 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[0, 16-31]
+				c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[1, 16-31]
+				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[2, 16-31]
+				c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+				// c[3, 16-31]
+				c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+				// c[4, 16-31]
+				c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
+
+				// c[5, 16-31]
+				c_float_5p1 = _mm512_add_ps( selector2, c_float_5p1 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				__m512 selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+				__m512 selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 3 ) );
+				__m512 selector5 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 4 ) );
+				__m512 selector6 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 5 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[0, 16-31]
+				c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[1, 16-31]
+				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[2, 16-31]
+				c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+				// c[3, 16-31]
+				c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+				// c[4, 16-31]
+				c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
+
+				// c[5, 16-31]
+				c_float_5p1 = _mm512_add_ps( selector6, c_float_5p1 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x32:
+		{
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[1,16-31]
+			c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[2,16-31]
+			c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			// c[3,16-31]
+			c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+			// c[4,16-31]
+			c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
+
+			// c[5,0-15]
+			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
+
+			// c[5,16-31]
+			c_float_5p1 = _mm512_max_ps( selector1, c_float_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x32:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p1)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p1)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x32:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_F32_AVX512(c_float_4p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_F32_AVX512(c_float_5p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_F32_AVX512(c_float_5p1, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x32:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_F32_AVX512(c_float_4p1, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_F32_AVX512(c_float_5p0, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_F32_AVX512(c_float_5p1, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x32:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_F32_AVX512(c_float_0p1, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_F32_AVX512(c_float_1p1, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_F32_AVX512(c_float_2p1, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_F32_AVX512(c_float_3p1, min, max)
+
+			// c[4, 0-15]
+			CLIP_F32_AVX512(c_float_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_F32_AVX512(c_float_4p1, min, max)
+
+			// c[5, 0-15]
+			CLIP_F32_AVX512(c_float_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_F32_AVX512(c_float_5p1, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x32:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		MULRND_F32(c_float_4p1,4,1);
+
+		// c[5, 0-15]
+		MULRND_F32(c_float_5p0,5,0);
+
+		// c[5, 16-31]
+		MULRND_F32(c_float_5p1,5,1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x32_DISABLE:
+		;
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			__m512i selector_a = _mm512_setzero_epi32();
+			__m512i selector_b = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+			// Store the results in downscaled type (bf16 instead of float32).
+			// c[0,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+			// c[0, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+			// c[1,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+			// c[1, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+			// c[2,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+			// c[2, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+			// c[3,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+			// c[3, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+			// c[4,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+			// c[4, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1);
+
+			// c[5,0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0);
+
+			// c[5, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_5p1,5,1);
+		}
+
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 );
+
+			// c[1,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 );
+
+			// c[2,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 );
+
+			// c[3,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 );
+
+			// c[4,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 );
+
+			// c[5,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_bf16bf16f32of32_5x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_bf16bf16f32of32_4x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_bf16bf16f32of32_3x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_bf16bf16f32of32_2x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_bf16bf16f32of32_1x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x48 bf16 fringe kernel
+LPGEMM_N_FRINGE_KERN(bfloat16, bfloat16, float, bf16bf16f32of32_6x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x48_DISABLE,
+						  &&POST_OPS_BIAS_6x48,
+						  &&POST_OPS_RELU_6x48,
+						  &&POST_OPS_RELU_SCALE_6x48,
+						  &&POST_OPS_GELU_TANH_6x48,
+						  &&POST_OPS_GELU_ERF_6x48,
+						  &&POST_OPS_CLIP_6x48,
+						  &&POST_OPS_DOWNSCALE_6x48
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 2;
+	dim_t k_partial_pieces = k0 % 2;
+
+	int16_t a_kfringe_buf = 0;
+
+	// B matrix storage bfloat type
+	__m512bh b0;
+	__m512bh b1;
+	__m512bh b2;
+
+	// A matrix storage bfloat type
+	__m512bh a_bf16_0;
+
+	dim_t value;
+
+	if(k_full_pieces > 40)
+	{
+		value = 40;
+	}
+	else
+	{
+		value = 0;
+	}
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512 c_float_0p0 = _mm512_setzero_ps();
+		__m512 c_float_0p1 = _mm512_setzero_ps();
+		__m512 c_float_0p2 = _mm512_setzero_ps();
+
+		__m512 c_float_1p0 = _mm512_setzero_ps();
+		__m512 c_float_1p1 = _mm512_setzero_ps();
+		__m512 c_float_1p2 = _mm512_setzero_ps();
+
+		__m512 c_float_2p0 = _mm512_setzero_ps();
+		__m512 c_float_2p1 = _mm512_setzero_ps();
+		__m512 c_float_2p2 = _mm512_setzero_ps();
+
+		__m512 c_float_3p0 = _mm512_setzero_ps();
+		__m512 c_float_3p1 = _mm512_setzero_ps();
+		__m512 c_float_3p2 = _mm512_setzero_ps();
+
+		__m512 c_float_4p0 = _mm512_setzero_ps();
+		__m512 c_float_4p1 = _mm512_setzero_ps();
+		__m512 c_float_4p2 = _mm512_setzero_ps();
+
+		__m512 c_float_5p0 = _mm512_setzero_ps();
+		__m512 c_float_5p1 = _mm512_setzero_ps();
+		__m512 c_float_5p2 = _mm512_setzero_ps();
+
+		for ( dim_t kr = 0; kr < k_full_pieces - value; kr += 1 )
+		{
+
+			// Load 2 rows with 48 elements each from B to 3 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in bf16
+			// instructions and each load to ZMM register will have 2 elements
+			// along k direction and 16 elements across n directions, so 2x16
+			// elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+					*( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-47] = a[5,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
+			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_0, b2 );
+
+		}
+
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 0)) + (2 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 1)) + (2 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 2)) + (2 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 3)) + (2 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 4)) + (2 * 16), _MM_HINT_T1);
+
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (0 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (1 * 16), _MM_HINT_T1);
+		_mm_prefetch(c + (rs_c * (ir + 5)) + (2 * 16), _MM_HINT_T1);
+
+		for (dim_t kr = k_full_pieces - value; kr < k_full_pieces; kr += 1)
+		{
+
+			// Load 2 rows with 48 elements each from B to 3 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in bf16
+			// instructions and each load to ZMM register will have 2 elements
+			// along k direction and 16 elements across n directions, so 2x16
+			// elements to a ZMM register.
+			b0 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 0));
+			b1 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 1));
+			b2 = (__m512bh)_mm512_loadu_epi16(b + (rs_b * kr) + (cs_b * 2));
+
+			// Broadcast a[0,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 0) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_0p0 = _mm512_dpbf16_ps(c_float_0p0, a_bf16_0, b0);
+			c_float_0p1 = _mm512_dpbf16_ps(c_float_0p1, a_bf16_0, b1);
+			c_float_0p2 = _mm512_dpbf16_ps(c_float_0p2, a_bf16_0, b2);
+
+			// Broadcast a[1,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 1) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_1p0 = _mm512_dpbf16_ps(c_float_1p0, a_bf16_0, b0);
+			c_float_1p1 = _mm512_dpbf16_ps(c_float_1p1, a_bf16_0, b1);
+			c_float_1p2 = _mm512_dpbf16_ps(c_float_1p2, a_bf16_0, b2);
+
+			// Broadcast a[2,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 2) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_2p0 = _mm512_dpbf16_ps(c_float_2p0, a_bf16_0, b0);
+			c_float_2p1 = _mm512_dpbf16_ps(c_float_2p1, a_bf16_0, b1);
+			c_float_2p2 = _mm512_dpbf16_ps(c_float_2p2, a_bf16_0, b2);
+
+			// Broadcast a[3,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 3) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_3p0 = _mm512_dpbf16_ps(c_float_3p0, a_bf16_0, b0);
+			c_float_3p1 = _mm512_dpbf16_ps(c_float_3p1, a_bf16_0, b1);
+			c_float_3p2 = _mm512_dpbf16_ps(c_float_3p2, a_bf16_0, b2);
+
+			// Broadcast a[4,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 4) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_4p0 = _mm512_dpbf16_ps(c_float_4p0, a_bf16_0, b0);
+			c_float_4p1 = _mm512_dpbf16_ps(c_float_4p1, a_bf16_0, b1);
+			c_float_4p2 = _mm512_dpbf16_ps(c_float_4p2, a_bf16_0, b2);
+
+			// Broadcast a[5,kr:kr+2].
+			a_bf16_0 = (__m512bh)_mm512_set1_epi32(
+				*(int32_t *)(a + (rs_a * 5) + (cs_a * kr)));
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-47] = a[5,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_5p0 = _mm512_dpbf16_ps(c_float_5p0, a_bf16_0, b0);
+			c_float_5p1 = _mm512_dpbf16_ps(c_float_5p1, a_bf16_0, b1);
+			c_float_5p2 = _mm512_dpbf16_ps(c_float_5p2, a_bf16_0, b2);
+		}
+
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			b0 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+			b2 = (__m512bh)_mm512_loadu_epi16( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+			// Broadcast a[0,kr:kr+2].
+			a_kfringe_buf = *( a + (rs_a * 0) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[0,0-47] = a[0,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_0p0 = _mm512_dpbf16_ps( c_float_0p0, a_bf16_0, b0 );
+			c_float_0p1 = _mm512_dpbf16_ps( c_float_0p1, a_bf16_0, b1 );
+			c_float_0p2 = _mm512_dpbf16_ps( c_float_0p2, a_bf16_0, b2 );
+
+			// Broadcast a[1,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 1) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[1,0-47] = a[1,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_1p0 = _mm512_dpbf16_ps( c_float_1p0, a_bf16_0, b0 );
+			c_float_1p1 = _mm512_dpbf16_ps( c_float_1p1, a_bf16_0, b1 );
+			c_float_1p2 = _mm512_dpbf16_ps( c_float_1p2, a_bf16_0, b2 );
+
+			// Broadcast a[2,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 2) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[2,0-47] = a[2,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_2p0 = _mm512_dpbf16_ps( c_float_2p0, a_bf16_0, b0 );
+			c_float_2p1 = _mm512_dpbf16_ps( c_float_2p1, a_bf16_0, b1 );
+			c_float_2p2 = _mm512_dpbf16_ps( c_float_2p2, a_bf16_0, b2 );
+
+			// Broadcast a[3,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 3) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[3,0-47] = a[3,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_3p0 = _mm512_dpbf16_ps( c_float_3p0, a_bf16_0, b0 );
+			c_float_3p1 = _mm512_dpbf16_ps( c_float_3p1, a_bf16_0, b1 );
+			c_float_3p2 = _mm512_dpbf16_ps( c_float_3p2, a_bf16_0, b2 );
+
+			// Broadcast a[4,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 4) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[4,0-47] = a[4,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_4p0 = _mm512_dpbf16_ps( c_float_4p0, a_bf16_0, b0 );
+			c_float_4p1 = _mm512_dpbf16_ps( c_float_4p1, a_bf16_0, b1 );
+			c_float_4p2 = _mm512_dpbf16_ps( c_float_4p2, a_bf16_0, b2 );
+
+			// Broadcast a[5,kr:kr+2].
+			a_kfringe_buf = *(a + (rs_a * 5) + (cs_a * ( k_full_pieces )));
+			a_bf16_0 = (__m512bh)_mm512_set1_epi16( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 2.
+			// c[5,0-47] = a[5,kr:kr+2]*b[kr:kr+2,0-47]
+			c_float_5p0 = _mm512_dpbf16_ps( c_float_5p0, a_bf16_0, b0 );
+			c_float_5p1 = _mm512_dpbf16_ps( c_float_5p1, a_bf16_0, b1 );
+			c_float_5p2 = _mm512_dpbf16_ps( c_float_5p2, a_bf16_0, b2 );
+		}
+
+		// Load alpha and beta
+		__m512 selector1 = _mm512_set1_ps( alpha );
+		__m512 selector2 = _mm512_set1_ps( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_float_0p0 = _mm512_mul_ps( selector1, c_float_0p0 );
+			c_float_0p1 = _mm512_mul_ps( selector1, c_float_0p1 );
+			c_float_0p2 = _mm512_mul_ps( selector1, c_float_0p2 );
+
+			c_float_1p0 = _mm512_mul_ps( selector1, c_float_1p0 );
+			c_float_1p1 = _mm512_mul_ps( selector1, c_float_1p1 );
+			c_float_1p2 = _mm512_mul_ps( selector1, c_float_1p2 );
+
+			c_float_2p0 = _mm512_mul_ps( selector1, c_float_2p0 );
+			c_float_2p1 = _mm512_mul_ps( selector1, c_float_2p1 );
+			c_float_2p2 = _mm512_mul_ps( selector1, c_float_2p2 );
+
+			c_float_3p0 = _mm512_mul_ps( selector1, c_float_3p0 );
+			c_float_3p1 = _mm512_mul_ps( selector1, c_float_3p1 );
+			c_float_3p2 = _mm512_mul_ps( selector1, c_float_3p2 );
+
+			c_float_4p0 = _mm512_mul_ps( selector1, c_float_4p0 );
+			c_float_4p1 = _mm512_mul_ps( selector1, c_float_4p1 );
+			c_float_4p2 = _mm512_mul_ps( selector1, c_float_4p2 );
+
+			c_float_5p0 = _mm512_mul_ps( selector1, c_float_5p0 );
+			c_float_5p1 = _mm512_mul_ps( selector1, c_float_5p1 );
+			c_float_5p2 = _mm512_mul_ps( selector1, c_float_5p2 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0,0-15]
+				BF16_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				BF16_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				BF16_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				BF16_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				BF16_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				BF16_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				BF16_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				BF16_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				BF16_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2)
+
+				// c[3,0-15]
+				BF16_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				BF16_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				BF16_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2)
+
+				// c[4,0-15]
+				BF16_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2)
+
+				// c[4,16-31]
+				BF16_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2)
+
+				// c[4,32-47]
+				BF16_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2)
+
+				// c[5,0-15]
+				BF16_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2)
+
+				// c[5,16-31]
+				BF16_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2)
+
+				// c[5,32-47]
+				BF16_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2)
+			}
+			else
+			{
+				// c[0,0-15]
+				F32_F32_BETA_OP(c_float_0p0,ir,0,0,selector1,selector2)
+
+				// c[0, 16-31]
+				F32_F32_BETA_OP(c_float_0p1,ir,0,1,selector1,selector2)
+
+				// c[0,32-47]
+				F32_F32_BETA_OP(c_float_0p2,ir,0,2,selector1,selector2)
+
+				// c[1,0-15]
+				F32_F32_BETA_OP(c_float_1p0,ir,1,0,selector1,selector2)
+
+				// c[1,16-31]
+				F32_F32_BETA_OP(c_float_1p1,ir,1,1,selector1,selector2)
+
+				// c[1,32-47]
+				F32_F32_BETA_OP(c_float_1p2,ir,1,2,selector1,selector2)
+
+				// c[2,0-15]
+				F32_F32_BETA_OP(c_float_2p0,ir,2,0,selector1,selector2)
+
+				// c[2,16-31]
+				F32_F32_BETA_OP(c_float_2p1,ir,2,1,selector1,selector2)
+
+				// c[2,32-47]
+				F32_F32_BETA_OP(c_float_2p2,ir,2,2,selector1,selector2)
+
+				// c[3,0-15]
+				F32_F32_BETA_OP(c_float_3p0,ir,3,0,selector1,selector2)
+
+				// c[3,16-31]
+				F32_F32_BETA_OP(c_float_3p1,ir,3,1,selector1,selector2)
+
+				// c[3,32-47]
+				F32_F32_BETA_OP(c_float_3p2,ir,3,2,selector1,selector2)
+
+				// c[4,0-15]
+				F32_F32_BETA_OP(c_float_4p0,ir,4,0,selector1,selector2)
+
+				// c[4,16-31]
+				F32_F32_BETA_OP(c_float_4p1,ir,4,1,selector1,selector2)
+
+				// c[4,32-47]
+				F32_F32_BETA_OP(c_float_4p2,ir,4,2,selector1,selector2)
+
+				// c[5,0-15]
+				F32_F32_BETA_OP(c_float_5p0,ir,5,0,selector1,selector2)
+
+				// c[5,16-31]
+				F32_F32_BETA_OP(c_float_5p1,ir,5,1,selector1,selector2)
+
+				// c[5,32-47]
+				F32_F32_BETA_OP(c_float_5p2,ir,5,2,selector1,selector2)
+			}
+		}
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x48:
+		{
+			__m512 selector3;
+
+			if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+				 ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+			{
+				selector1 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+				selector2 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+				selector3 =
+					_mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[0, 16-31]
+				c_float_0p1 = _mm512_add_ps( selector2, c_float_0p1 );
+
+				// c[0,32-47]
+				c_float_0p2 = _mm512_add_ps( selector3, c_float_0p2 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector1, c_float_1p0 );
+
+				// c[1, 16-31]
+				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+				// c[1,32-47]
+				c_float_1p2 = _mm512_add_ps( selector3, c_float_1p2 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector1, c_float_2p0 );
+
+				// c[2, 16-31]
+				c_float_2p1 = _mm512_add_ps( selector2, c_float_2p1 );
+
+				// c[2,32-47]
+				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector1, c_float_3p0 );
+
+				// c[3, 16-31]
+				c_float_3p1 = _mm512_add_ps( selector2, c_float_3p1 );
+
+				// c[3,32-47]
+				c_float_3p2 = _mm512_add_ps( selector3, c_float_3p2 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector1, c_float_4p0 );
+
+				// c[4, 16-31]
+				c_float_4p1 = _mm512_add_ps( selector2, c_float_4p1 );
+
+				// c[4,32-47]
+				c_float_4p2 = _mm512_add_ps( selector3, c_float_4p2 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector1, c_float_5p0 );
+
+				// c[5, 16-31]
+				c_float_5p1 = _mm512_add_ps( selector2, c_float_5p1 );
+
+				// c[5,32-47]
+				c_float_5p2 = _mm512_add_ps( selector3, c_float_5p2 );
+			}
+			else
+			{
+				selector1 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 0 ) );
+				selector2 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 1 ) );
+				selector3 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 2 ) );
+				__m512 selector4 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 3 ) );
+				__m512 selector5 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 4 ) );
+				__m512 selector6 =
+					_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1
+								+ post_ops_attr.post_op_c_i + 5 ) );
+
+				// c[0,0-15]
+				c_float_0p0 = _mm512_add_ps( selector1, c_float_0p0 );
+
+				// c[0, 16-31]
+				c_float_0p1 = _mm512_add_ps( selector1, c_float_0p1 );
+
+				// c[0,32-47]
+				c_float_0p2 = _mm512_add_ps( selector1, c_float_0p2 );
+
+				// c[1,0-15]
+				c_float_1p0 = _mm512_add_ps( selector2, c_float_1p0 );
+
+				// c[1, 16-31]
+				c_float_1p1 = _mm512_add_ps( selector2, c_float_1p1 );
+
+				// c[1,32-47]
+				c_float_1p2 = _mm512_add_ps( selector2, c_float_1p2 );
+
+				// c[2,0-15]
+				c_float_2p0 = _mm512_add_ps( selector3, c_float_2p0 );
+
+				// c[2, 16-31]
+				c_float_2p1 = _mm512_add_ps( selector3, c_float_2p1 );
+
+				// c[2,32-47]
+				c_float_2p2 = _mm512_add_ps( selector3, c_float_2p2 );
+
+				// c[3,0-15]
+				c_float_3p0 = _mm512_add_ps( selector4, c_float_3p0 );
+
+				// c[3, 16-31]
+				c_float_3p1 = _mm512_add_ps( selector4, c_float_3p1 );
+
+				// c[3,32-47]
+				c_float_3p2 = _mm512_add_ps( selector4, c_float_3p2 );
+
+				// c[4,0-15]
+				c_float_4p0 = _mm512_add_ps( selector5, c_float_4p0 );
+
+				// c[4, 16-31]
+				c_float_4p1 = _mm512_add_ps( selector5, c_float_4p1 );
+
+				// c[4,32-47]
+				c_float_4p2 = _mm512_add_ps( selector5, c_float_4p2 );
+
+				// c[5,0-15]
+				c_float_5p0 = _mm512_add_ps( selector6, c_float_5p0 );
+
+				// c[5, 16-31]
+				c_float_5p1 = _mm512_add_ps( selector6, c_float_5p1 );
+
+				// c[5,32-47]
+				c_float_5p2 = _mm512_add_ps( selector6, c_float_5p2 );
+			}
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x48:
+		{
+			// ReLU: clamp every accumulator lane to a minimum of zero.
+			selector1 = _mm512_setzero_ps();
+
+			// c[0,0-15]
+			c_float_0p0 = _mm512_max_ps( selector1, c_float_0p0 );
+
+			// c[0, 16-31]
+			c_float_0p1 = _mm512_max_ps( selector1, c_float_0p1 );
+
+			// c[0,32-47]
+			c_float_0p2 = _mm512_max_ps( selector1, c_float_0p2 );
+
+			// c[1,0-15]
+			c_float_1p0 = _mm512_max_ps( selector1, c_float_1p0 );
+
+			// c[1,16-31]
+			c_float_1p1 = _mm512_max_ps( selector1, c_float_1p1 );
+
+			// c[1,32-47]
+			c_float_1p2 = _mm512_max_ps( selector1, c_float_1p2 );
+
+			// c[2,0-15]
+			c_float_2p0 = _mm512_max_ps( selector1, c_float_2p0 );
+
+			// c[2,16-31]
+			c_float_2p1 = _mm512_max_ps( selector1, c_float_2p1 );
+
+			// c[2,32-47]
+			c_float_2p2 = _mm512_max_ps( selector1, c_float_2p2 );
+
+			// c[3,0-15]
+			c_float_3p0 = _mm512_max_ps( selector1, c_float_3p0 );
+
+			// c[3,16-31]
+			c_float_3p1 = _mm512_max_ps( selector1, c_float_3p1 );
+
+			// c[3,32-47]
+			c_float_3p2 = _mm512_max_ps( selector1, c_float_3p2 );
+
+			// c[4,0-15]
+			c_float_4p0 = _mm512_max_ps( selector1, c_float_4p0 );
+
+			// c[4,16-31]
+			c_float_4p1 = _mm512_max_ps( selector1, c_float_4p1 );
+
+			// c[4,32-47]
+			c_float_4p2 = _mm512_max_ps( selector1, c_float_4p2 );
+
+			// c[5,0-15]
+			c_float_5p0 = _mm512_max_ps( selector1, c_float_5p0 );
+
+			// c[5,16-31]
+			c_float_5p1 = _mm512_max_ps( selector1, c_float_5p1 );
+
+			// c[5,32-47]
+			c_float_5p2 = _mm512_max_ps( selector1, c_float_5p2 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x48:
+		{
+			selector1 = _mm512_setzero_ps();
+			selector2 =
+				_mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p1)
+
+			// c[0, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_0p2)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p1)
+
+			// c[1, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_1p2)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p1)
+
+			// c[2, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_2p2)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p1)
+
+			// c[3, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_3p2)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p1)
+
+			// c[4, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_4p2)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p1)
+
+			// c[5, 32-47]
+			RELU_SCALE_OP_F32_AVX512(c_float_5p2)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x48:
+		{
+			__m512 dn, z, x, r2, r, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_F32_AVX512(c_float_0p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_F32_AVX512(c_float_0p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 32-47]
+			GELU_TANH_F32_AVX512(c_float_0p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_F32_AVX512(c_float_1p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_F32_AVX512(c_float_1p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 32-47]
+			GELU_TANH_F32_AVX512(c_float_1p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_F32_AVX512(c_float_2p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_F32_AVX512(c_float_2p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 32-47]
+			GELU_TANH_F32_AVX512(c_float_2p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_F32_AVX512(c_float_3p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_F32_AVX512(c_float_3p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 32-47]
+			GELU_TANH_F32_AVX512(c_float_3p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_F32_AVX512(c_float_4p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_F32_AVX512(c_float_4p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 32-47]
+			GELU_TANH_F32_AVX512(c_float_4p2, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_F32_AVX512(c_float_5p0, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_F32_AVX512(c_float_5p1, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 32-47]
+			GELU_TANH_F32_AVX512(c_float_5p2, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x48:
+		{
+			__m512 x, r, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_F32_AVX512(c_float_0p0, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_F32_AVX512(c_float_0p1, r, x, x_erf)
+
+			// c[0, 32-47]
+			GELU_ERF_F32_AVX512(c_float_0p2, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_F32_AVX512(c_float_1p0, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_F32_AVX512(c_float_1p1, r, x, x_erf)
+
+			// c[1, 32-47]
+			GELU_ERF_F32_AVX512(c_float_1p2, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_F32_AVX512(c_float_2p0, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_F32_AVX512(c_float_2p1, r, x, x_erf)
+
+			// c[2, 32-47]
+			GELU_ERF_F32_AVX512(c_float_2p2, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_F32_AVX512(c_float_3p0, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_F32_AVX512(c_float_3p1, r, x, x_erf)
+
+			// c[3, 32-47]
+			GELU_ERF_F32_AVX512(c_float_3p2, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_F32_AVX512(c_float_4p0, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_F32_AVX512(c_float_4p1, r, x, x_erf)
+
+			// c[4, 32-47]
+			GELU_ERF_F32_AVX512(c_float_4p2, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_F32_AVX512(c_float_5p0, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_F32_AVX512(c_float_5p1, r, x, x_erf)
+
+			// c[5, 32-47]
+			GELU_ERF_F32_AVX512(c_float_5p2, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x48:
+		{
+			__m512 min = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+			__m512 max = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_F32_AVX512(c_float_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_F32_AVX512(c_float_0p1, min, max)
+
+			// c[0, 32-47]
+			CLIP_F32_AVX512(c_float_0p2, min, max)
+
+			// c[1, 0-15]
+			CLIP_F32_AVX512(c_float_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_F32_AVX512(c_float_1p1, min, max)
+
+			// c[1, 32-47]
+			CLIP_F32_AVX512(c_float_1p2, min, max)
+
+			// c[2, 0-15]
+			CLIP_F32_AVX512(c_float_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_F32_AVX512(c_float_2p1, min, max)
+
+			// c[2, 32-47]
+			CLIP_F32_AVX512(c_float_2p2, min, max)
+
+			// c[3, 0-15]
+			CLIP_F32_AVX512(c_float_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_F32_AVX512(c_float_3p1, min, max)
+
+			// c[3, 32-47]
+			CLIP_F32_AVX512(c_float_3p2, min, max)
+
+			// c[4, 0-15]
+			CLIP_F32_AVX512(c_float_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_F32_AVX512(c_float_4p1, min, max)
+
+			// c[4, 32-47]
+			CLIP_F32_AVX512(c_float_4p2, min, max)
+
+			// c[5, 0-15]
+			CLIP_F32_AVX512(c_float_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_F32_AVX512(c_float_5p1, min, max)
+
+			// c[5, 32-47]
+			CLIP_F32_AVX512(c_float_5p2, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x48:
+	{
+		// c[0, 0-15]
+		MULRND_F32(c_float_0p0,0,0);
+
+		// c[0, 16-31]
+		MULRND_F32(c_float_0p1,0,1);
+
+		// c[0, 32-47]
+		MULRND_F32(c_float_0p2,0,2);
+
+		// c[1, 0-15]
+		MULRND_F32(c_float_1p0,1,0);
+
+		// c[1, 16-31]
+		MULRND_F32(c_float_1p1,1,1);
+
+		// c[1, 32-47]
+		MULRND_F32(c_float_1p2,1,2);
+
+		// c[2, 0-15]
+		MULRND_F32(c_float_2p0,2,0);
+
+		// c[2, 16-31]
+		MULRND_F32(c_float_2p1,2,1);
+
+		// c[2, 32-47]
+		MULRND_F32(c_float_2p2,2,2);
+
+		// c[3, 0-15]
+		MULRND_F32(c_float_3p0,3,0);
+
+		// c[3, 16-31]
+		MULRND_F32(c_float_3p1,3,1);
+
+		// c[3, 32-47]
+		MULRND_F32(c_float_3p2,3,2);
+
+		// c[4, 0-15]
+		MULRND_F32(c_float_4p0,4,0);
+
+		// c[4, 16-31]
+		MULRND_F32(c_float_4p1,4,1);
+
+		// c[4, 32-47]
+		MULRND_F32(c_float_4p2,4,2);
+
+		// c[5, 0-15]
+		MULRND_F32(c_float_5p0,5,0);
+
+		// c[5, 16-31]
+		MULRND_F32(c_float_5p1,5,1);
+
+		// c[5, 32-47]
+		MULRND_F32(c_float_5p2,5,2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x48_DISABLE:
+		;
+		// Case where the output C matrix is bf16 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			__m512i selector_a = _mm512_setzero_epi32();
+			__m512i selector_b = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector_a, selector_b );
+
+			// Store the results in downscaled type (bf16 instead of float).
+
+			// c[0, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_0p0,0,0);
+
+			// c[0, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_0p1,0,1);
+
+			// c[0, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_0p2,0,2);
+
+			// c[1, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_1p0,1,0);
+
+			// c[1, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_1p1,1,1);
+
+			// c[1, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_1p2,1,2);
+
+			// c[2, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_2p0,2,0);
+
+			// c[2, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_2p1,2,1);
+
+			// c[2, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_2p2,2,2);
+
+			// c[3, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_3p0,3,0);
+
+			// c[3, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_3p1,3,1);
+
+			// c[3, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_3p2,3,2);
+
+			// c[4, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_4p0,4,0);
+
+			// c[4, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_4p1,4,1);
+
+			// c[4, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_4p2,4,2);
+
+			// c[5, 0-15]
+			CVT_STORE_F32_BF16_MASK(c_float_5p0,5,0);
+
+			// c[5, 16-31]
+			CVT_STORE_F32_BF16_MASK(c_float_5p1,5,1);
+
+			// c[5, 32-47]
+			CVT_STORE_F32_BF16_MASK(c_float_5p2,5,2);
+		}
+
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_float_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_float_0p1 );
+
+			// c[0,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_float_0p2 );
+
+			// c[1,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_float_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_float_1p1 );
+
+			// c[1,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_float_1p2 );
+
+			// c[2,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_float_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_float_2p1 );
+
+			// c[2,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_float_2p2 );
+
+			// c[3,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_float_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_float_3p1 );
+
+			// c[3,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_float_3p2 );
+
+			// c[4,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_float_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_float_4p1 );
+
+			// c[4,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_float_4p2 );
+
+			// c[5,0-15]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_float_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_float_5p1 );
+
+			// c[5,32-47]
+			_mm512_storeu_ps( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_float_5p2 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_bf16bf16f32of32_5x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_bf16bf16f32of32_4x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_bf16bf16f32of32_3x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_bf16bf16f32of32_2x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			int cs_a_use = ( cs_a == 2 ) ? 2 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_bf16bf16f32of32_1x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+#endif
+#endif
diff --git a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c
similarity index 76%
rename from addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c
rename to kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c
index 374ac3280e..fe39c8c038 100644
--- a/addon/aocl_gemm/kernels/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c
+++ b/kernels/zen4/lpgemm/bf16bf16f32/lpgemm_packb_bf16_amd512vnni.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,22 +34,10 @@
 
 #include <immintrin.h>
 #include <string.h>
-
 #include "blis.h"
-#include "lpgemm_config.h"
-#include "aocl_bf16_type.h"
-
-void get_packb_nr64_bf16bf16f32of32_strides
-     (
-       dim_t* rs_b,
-       dim_t* cs_b
-     )
-{
-	*rs_b = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 ) * 2;
-	*cs_b = lpgemm_get_block_size_NR_global_cntx( BF16BF16F32OF32 ) / 2;
-}
 
-#ifdef BLIS_KERNELS_ZEN4
+#ifdef BLIS_ADDON_LPGEMM
+
 void packb_nrlt16_bf16bf16f32of32
     (
       bfloat16*       pack_b_buffer_bf16bf16f32of32,
@@ -127,10 +115,10 @@ void packb_nr64_bf16bf16f32of32
 		for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 )
 		{
 			// Rearrange for dpbf16_ps, read 2 rows from B with 64 elements in each row.
-			a0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) ) + jc  );
-			b0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) ) + jc + 32 );
-			c0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) + jc );
-			d0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) + jc + 32 );
+			a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc  );
+			b0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc + 32 );
+			c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc );
+			d0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc + 32 );
 
 			a01 = _mm512_unpacklo_epi16( a0, c0 );
 			a0 = _mm512_unpackhi_epi16( a0, c0 );
@@ -144,16 +132,16 @@ void packb_nr64_bf16bf16f32of32
 			c0 = _mm512_permutex2var_epi64( c01, selector1_1, c0 );
 
 			//store to pack_b buffer     
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ), b0 );
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) + 32, a0 );
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ), d0 );
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) + 32, c0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ), b0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) + 32, a0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ), d0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) + 32, c0 );
 		}  
 		// Handle k remainder.
 		if( k_partial_pieces > 0)
 		{
-			a0 = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) + jc  );
-			b0 = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) + jc + 32 );
+			a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc  );
+			b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc + 32 );
 			c0 = _mm512_setzero_si512();
 			d0 = _mm512_setzero_si512();
 
@@ -169,10 +157,10 @@ void packb_nr64_bf16bf16f32of32
 			c0 = _mm512_permutex2var_epi64( c01, selector1_1, c0 );
 
 			//store to pack_b buffer     
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ), b0 );
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) + 32, a0 );
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ), d0 );
-			_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) + 32, c0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ), b0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) + 32, a0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ), d0 );
+			_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) + 32, c0 );
 		}  
 	}
 
@@ -268,8 +256,8 @@ void packb_nr48_bf16bf16f32of32
 	for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 )
 	{
 		// Rearrange for dpbf16_ps, read 2 rows from B with 32 elements in each row.
-		a0x = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) )  );          
-		c0x = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) )  );
+		a0x = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) )  );          
+		c0x = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) )  );
 
 		a01x = _mm512_unpacklo_epi16( a0x, c0x );
 		a0x = _mm512_unpackhi_epi16( a0x, c0x );
@@ -278,12 +266,12 @@ void packb_nr48_bf16bf16f32of32
 		a0x = _mm512_permutex2var_epi64( a01x, selector1_1, a0x );
 
 		//First 2x32 elements
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x );  
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x );  	
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x );  
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x );  	
 
 		// Rearrange for dpbf16_ps, read 2 rows from B with next 16 elements in each row.
-		a0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 0 ) ) + NR1 );  
-		c0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 1 ) ) + NR1 );
+		a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + NR1 );  
+		c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + NR1 );
 
 		a01 = _mm256_unpacklo_epi16( a0, c0 );
 		a0 = _mm256_unpackhi_epi16( a0, c0 );
@@ -292,15 +280,23 @@ void packb_nr48_bf16bf16f32of32
 		a0 = _mm256_permute2f128_si256(a01, a0, 0x31);
 
 		//Last 2x16 elements                
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), b0 );        
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2, a0 );  
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ),
+		  0xFF, b0
+		);
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2,
+		  0xFF, a0
+		);
 
 		kr_new += 3;
 	}
 	// Handle k remainder.
 	if ( k_partial_pieces > 0 )
 	{
-		a0x = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) )  );          
+		a0x = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) )  );          
 		c0x = _mm512_setzero_si512();
 
 		a01x = _mm512_unpacklo_epi16( a0x, c0x );
@@ -310,10 +306,10 @@ void packb_nr48_bf16bf16f32of32
 		a0x = _mm512_permutex2var_epi64( a01x, selector1_1, a0x );
 
 		//First 2x32 elements
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); 
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x );  	
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR1 ), b0x ); 
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR1 ), a0x );  	
 
-		a0 = _mm256_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) ) + NR1 );  
+		a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + NR1 );  
 		c0 = _mm256_setzero_si256();
 
 		a01 = _mm256_unpacklo_epi16( a0, c0 );
@@ -323,8 +319,16 @@ void packb_nr48_bf16bf16f32of32
 		a0 = _mm256_permute2f128_si256(a01, a0, 0x31);
 
 		//Last 2x16 elements                
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ), b0 );        
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2, a0 );  
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ),
+		  0xFF, b0
+		);
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 2 ) * NR1 ) + NR2,
+		  0xFF, a0
+		);
 	}
 }
 
@@ -356,8 +360,8 @@ void packb_nr32_bf16bf16f32of32
 	for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 )
 	{
 		// Rearrange for dpbf16_ps, read 2 rows from B with 32 elements in each row.
-		a0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 0 ) )  );
-		c0 = _mm512_loadu_epi16( b + ( ldb * ( kr + 1 ) ) );
+		a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) )  );
+		c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) );
 
 		a01 = _mm512_unpacklo_epi16( a0, c0 );
 		a0 = _mm512_unpackhi_epi16( a0, c0 );
@@ -365,15 +369,15 @@ void packb_nr32_bf16bf16f32of32
 		b0 = _mm512_permutex2var_epi64( a01, selector1, a0 );
 		a0 = _mm512_permutex2var_epi64( a01, selector1_1, a0 );
 
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 );
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 );
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
 
 		kr_new += 2;
 	}   
 	// Handle k remainder.
 	if ( k_partial_pieces > 0 )
 	{
-		a0 = _mm512_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) )  );
+		a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) )  );
 		c0 = _mm512_setzero_si512();
 
 		a01 = _mm512_unpacklo_epi16( a0, c0 );
@@ -382,8 +386,8 @@ void packb_nr32_bf16bf16f32of32
 		b0 = _mm512_permutex2var_epi64( a01, selector1, a0 );
 		a0 = _mm512_permutex2var_epi64( a01, selector1_1, a0 );
 
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 );
-		_mm512_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new ) * NR ), b0 );
+		_mm512_storeu_si512( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
 	}
 }    
 
@@ -411,8 +415,8 @@ void packb_nr16_bf16bf16f32of32
 	for ( dim_t kr = 0; kr < k_full_pieces; kr += 2 )
 	{   
 		// Rearrange for dpbf16_ps, read 2 rows from B with 16 elements in each row.
-		a0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 0 ) )  );
-		c0 = _mm256_loadu_epi16( b + ( ldb * ( kr + 1 ) )  );	
+		a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 0 ) )  );
+		c0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( kr + 1 ) )  );	
 
 		a01 = _mm256_unpacklo_epi16( a0, c0 );               
 		a0 = _mm256_unpackhi_epi16( a0, c0 );
@@ -420,15 +424,23 @@ void packb_nr16_bf16bf16f32of32
 		b0 = _mm256_permute2f128_si256(a01, a0, 0x20);
 		a0 = _mm256_permute2f128_si256(a01, a0, 0x31);
 
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 );
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ),
+		  0xFF, b0
+		);
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ),
+		  0xFF, a0
+		);
 
 		kr_new += 2;
 	}
 	// Handle k remainder.
 	if ( k_partial_pieces > 0 )
 	{
-		a0 = _mm256_loadu_epi16( b + ( ldb * ( k_full_pieces + 0 ) )  );
+		a0 = _mm256_maskz_loadu_epi16( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) )  );
 		c0 = _mm256_setzero_si256();
 
 		a01 = _mm256_unpacklo_epi16( a0, c0 );               
@@ -437,8 +449,16 @@ void packb_nr16_bf16bf16f32of32
 		b0 = _mm256_permute2f128_si256(a01, a0, 0x20);
 		a0 = _mm256_permute2f128_si256(a01, a0, 0x31);
 
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 );
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ),
+		  0xFF, b0
+		);
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ),
+		  0xFF, a0
+		);
 	}    
 }    
 
@@ -472,8 +492,8 @@ void packb_nrlt16_bf16bf16f32of32
 		memcpy( buf0, ( b + ( ldb * ( kr + 0 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) );
 		memcpy( buf1, ( b + ( ldb * ( kr + 1 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) );
 		// Rearrange for dpbf16_ps, read 2 rows from B with next 16 elements in each row.
-		a0 = _mm256_loadu_epi16( buf0 );
-		c0 = _mm256_loadu_epi16( buf1 );
+		a0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf0 );
+		c0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf1 );
 
 		a01 = _mm256_unpacklo_epi16( a0, c0 );               
 		a0 = _mm256_unpackhi_epi16( a0, c0 );
@@ -481,8 +501,16 @@ void packb_nrlt16_bf16bf16f32of32
 		b0 = _mm256_permute2f128_si256(a01, a0, 0x20);
 		a0 = _mm256_permute2f128_si256(a01, a0, 0x31);
 
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 );
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ),
+		  0xFF, b0
+		);
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ),
+		  0xFF, a0
+		);
 
 		kr_new += 2;
 	}
@@ -490,7 +518,7 @@ void packb_nrlt16_bf16bf16f32of32
 	if ( k_partial_pieces > 0 )
 	{
 		memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( bfloat16 ) ) );
-		a0 = _mm256_loadu_epi16( buf0 );
+		a0 = _mm256_maskz_loadu_epi16( 0xFFFF, buf0 );
 		c0 = _mm256_setzero_si256();
 
 		a01 = _mm256_unpacklo_epi16( a0, c0 );               
@@ -499,8 +527,16 @@ void packb_nrlt16_bf16bf16f32of32
 		b0 = _mm256_permute2f128_si256(a01, a0, 0x20);
 		a0 = _mm256_permute2f128_si256(a01, a0, 0x31);
 
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ), b0 );
-		_mm256_storeu_epi64( pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ), a0 );
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 0 ) * NR ),
+		  0xFF, b0
+		);
+		_mm256_mask_storeu_epi64
+		(
+		  pack_b_buffer_bf16bf16f32of32 + ( ( kr_new + 1 ) * NR ),
+		  0xFF, a0
+		);
 	}    
 }
 #endif
diff --git a/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c b/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c
new file mode 100644
index 0000000000..70ac7f9b90
--- /dev/null
+++ b/kernels/zen4/lpgemm/f32f32f32/lpgemm_fringe_f32_avx512.c
@@ -0,0 +1,5642 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+  Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "xmmintrin.h"
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_kernel_macros_f32.h"
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_5x64)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_5x64F_DISABLE,
+              &&POST_OPS_BIAS_5x64F,
+              &&POST_OPS_RELU_5x64F,
+              &&POST_OPS_RELU_SCALE_5x64F,
+              &&POST_OPS_GELU_TANH_5x64F,
+              &&POST_OPS_GELU_ERF_5x64F,
+              &&POST_OPS_CLIP_5x64F
+            };
+    // 5x64 fringe tile: accumulate A[5 x k0] * B[k0 x 64], scale by alpha, add
+    // beta * C when beta != 0, run post-ops, store. k0 is copied to a uint64_t bound.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+    __m512 zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15;
+    __m512 zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23;
+    __m512 zmm24, zmm25, zmm26, zmm27;
+
+    /* zero the accumulators: row r of C is held in zmm(8+4r)..zmm(11+4r) */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm11);
+    ZERO_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15);
+    ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19);
+    ZERO_ACC_ZMM_4_REG(zmm20, zmm21, zmm22, zmm23);
+    ZERO_ACC_ZMM_4_REG(zmm24, zmm25, zmm26, zmm27);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load first 32 elements of the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load next 32 elements of the current row of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+        zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
+
+        /*Broadcast current-column elements of rows 0-3 of A (row 4 is below)*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm7, zmm2, zmm11);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm7, zmm3, zmm15);
+
+        zmm2 = _mm512_set1_ps(*(abuf + 4*rs_a)); //broadcast c0r4 (zmm2 is free again)
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm7, zmm4, zmm19);
+
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+        zmm22 = _mm512_fmadd_ps(zmm6, zmm5, zmm22);
+        zmm23 = _mm512_fmadd_ps(zmm7, zmm5, zmm23);
+
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm2, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm2, zmm25);
+        zmm26 = _mm512_fmadd_ps(zmm6, zmm2, zmm26);
+        zmm27 = _mm512_fmadd_ps(zmm7, zmm2, zmm27);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,  zmm9,  zmm10, zmm11, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm20, zmm21, zmm22, zmm23, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm24, zmm25, zmm26, zmm27, zmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //load c with unaligned loads (same addresses as the storeu_ps below),
+        //multiply with beta and add to the accumulators
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm1, zmm3, zmm11);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm1, zmm3, zmm15);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm1, zmm3, zmm19);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm22 = _mm512_fmadd_ps(zmm0, zmm3, zmm22);
+        zmm23 = _mm512_fmadd_ps(zmm1, zmm3, zmm23);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm3, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm3, zmm25);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm26 = _mm512_fmadd_ps(zmm0, zmm3, zmm26);
+        zmm27 = _mm512_fmadd_ps(zmm1, zmm3, zmm27);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x64F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+        zmm4 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm4, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm4, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_add_ps( zmm4, zmm19 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm3, zmm22 );
+
+        // c[3,48-63]
+        zmm23 = _mm512_add_ps( zmm4, zmm23 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_add_ps( zmm1, zmm24 );
+
+        // c[4, 16-31]
+        zmm25 = _mm512_add_ps( zmm2, zmm25 );
+
+        // c[4,32-47]
+        zmm26 = _mm512_add_ps( zmm3, zmm26 );
+
+        // c[4,48-63]
+        zmm27 = _mm512_add_ps( zmm4, zmm27 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+        zmm4 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 ) );
+        zmm5 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm1, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm2, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_add_ps( zmm3, zmm19 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm4, zmm22 );
+
+        // c[3,48-63]
+        zmm23 = _mm512_add_ps( zmm4, zmm23 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_add_ps( zmm5, zmm24 );
+
+        // c[4, 16-31]
+        zmm25 = _mm512_add_ps( zmm5, zmm25 );
+
+        // c[4,32-47]
+        zmm26 = _mm512_add_ps( zmm5, zmm26 );
+
+        // c[4,48-63]
+        zmm27 = _mm512_add_ps( zmm5, zmm27 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[0,48-63]
+      zmm11 = _mm512_max_ps( zmm1, zmm11 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[1,48-63]
+      zmm15 = _mm512_max_ps( zmm1, zmm15 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[2,32-47]
+      zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+      // c[2,48-63]
+      zmm19 = _mm512_max_ps( zmm1, zmm19 );
+
+      // c[3,0-15]
+      zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+      // c[3,16-31]
+      zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+      // c[3,32-47]
+      zmm22 = _mm512_max_ps( zmm1, zmm22 );
+
+      // c[3,48-63]
+      zmm23 = _mm512_max_ps( zmm1, zmm23 );
+
+      // c[4,0-15]
+      zmm24 = _mm512_max_ps( zmm1, zmm24 );
+
+      // c[4,16-31]
+      zmm25 = _mm512_max_ps( zmm1, zmm25 );
+
+      // c[4,32-47]
+      zmm26 = _mm512_max_ps( zmm1, zmm26 );
+
+      // c[4,48-63]
+      zmm27 = _mm512_max_ps( zmm1, zmm27 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[0, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm11)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[1, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm15)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[2, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+      // c[2, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm19)
+
+      // c[3, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+      // c[3, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+      // c[3, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm22)
+
+      // c[3, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm23)
+
+      // c[4, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm24)
+
+      // c[4, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm25)
+
+      // c[4, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm26)
+
+      // c[4, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm27)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x64F:
+    {
+      __m512i zmm6; // integer scratch passed to the macro; shadows the outer __m512 zmm6
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 48-63]
+      GELU_TANH_F32S_AVX512(zmm11, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 48-63]
+      GELU_TANH_F32S_AVX512(zmm15, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 32-47]
+      GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 48-63]
+      GELU_TANH_F32S_AVX512(zmm19, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 0-15]
+      GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 16-31]
+      GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 32-47]
+      GELU_TANH_F32S_AVX512(zmm22, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 48-63]
+      GELU_TANH_F32S_AVX512(zmm23, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 0-15]
+      GELU_TANH_F32S_AVX512(zmm24, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 16-31]
+      GELU_TANH_F32S_AVX512(zmm25, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 32-47]
+      GELU_TANH_F32S_AVX512(zmm26, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 48-63]
+      GELU_TANH_F32S_AVX512(zmm27, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x64F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[0, 48-63]
+      GELU_ERF_F32S_AVX512(zmm11, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[1, 48-63]
+      GELU_ERF_F32S_AVX512(zmm15, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[2, 32-47]
+      GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+      // c[2, 48-63]
+      GELU_ERF_F32S_AVX512(zmm19, zmm0, zmm1, zmm2)
+
+      // c[3, 0-15]
+      GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+      // c[3, 16-31]
+      GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+      // c[3, 32-47]
+      GELU_ERF_F32S_AVX512(zmm22, zmm0, zmm1, zmm2)
+
+      // c[3, 48-63]
+      GELU_ERF_F32S_AVX512(zmm23, zmm0, zmm1, zmm2)
+
+      // c[4, 0-15]
+      GELU_ERF_F32S_AVX512(zmm24, zmm0, zmm1, zmm2)
+
+      // c[4, 16-31]
+      GELU_ERF_F32S_AVX512(zmm25, zmm0, zmm1, zmm2)
+
+      // c[4, 32-47]
+      GELU_ERF_F32S_AVX512(zmm26, zmm0, zmm1, zmm2)
+
+      // c[4, 48-63]
+      GELU_ERF_F32S_AVX512(zmm27, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x64F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[0, 48-63]
+      CLIP_F32S_AVX512(zmm11, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[1, 48-63]
+      CLIP_F32S_AVX512(zmm15, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[2, 32-47]
+      CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+      // c[2, 48-63]
+      CLIP_F32S_AVX512(zmm19, zmm0, zmm1)
+
+      // c[3, 0-15]
+      CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+      // c[3, 16-31]
+      CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+      // c[3, 32-47]
+      CLIP_F32S_AVX512(zmm22, zmm0, zmm1)
+
+      // c[3, 48-63]
+      CLIP_F32S_AVX512(zmm23, zmm0, zmm1)
+
+      // c[4, 0-15]
+      CLIP_F32S_AVX512(zmm24, zmm0, zmm1)
+
+      // c[4, 16-31]
+      CLIP_F32S_AVX512(zmm25, zmm0, zmm1)
+
+      // c[4, 32-47]
+      CLIP_F32S_AVX512(zmm26, zmm0, zmm1)
+
+      // c[4, 48-63]
+      CLIP_F32S_AVX512(zmm27, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x64F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    _mm512_storeu_ps(cbuf + 48, zmm11);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    _mm512_storeu_ps(cbuf + 48, zmm15);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    _mm512_storeu_ps(cbuf + 32, zmm18);
+    _mm512_storeu_ps(cbuf + 48, zmm19);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm20);
+    _mm512_storeu_ps(cbuf + 16, zmm21);
+    _mm512_storeu_ps(cbuf + 32, zmm22);
+    _mm512_storeu_ps(cbuf + 48, zmm23);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm24);
+    _mm512_storeu_ps(cbuf + 16, zmm25);
+    _mm512_storeu_ps(cbuf + 32, zmm26);
+    _mm512_storeu_ps(cbuf + 48, zmm27);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x64)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x64F_DISABLE,
+              &&POST_OPS_BIAS_4x64F,
+              &&POST_OPS_RELU_4x64F,
+              &&POST_OPS_RELU_SCALE_4x64F,
+              &&POST_OPS_GELU_TANH_4x64F,
+              &&POST_OPS_GELU_ERF_4x64F,
+              &&POST_OPS_CLIP_4x64F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+    __m512 zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15;
+    __m512 zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23;
+    
+    /* zero the accumulator registers */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm11);
+    ZERO_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15);
+    ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19);
+    ZERO_ACC_ZMM_4_REG(zmm20, zmm21, zmm22, zmm23);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from row0 of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row 
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load Next 32 elements from row0 of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row 
+        zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
+        
+        /*Broadcast col0 elements of 4 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1  
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2 
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm7, zmm2, zmm11);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm7, zmm3, zmm15);
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm7, zmm4, zmm19);
+        
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+        zmm22 = _mm512_fmadd_ps(zmm6, zmm5, zmm22);
+        zmm23 = _mm512_fmadd_ps(zmm7, zmm5, zmm23);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,  zmm9,  zmm10, zmm11, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm20, zmm21, zmm22, zmm23, zmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf; 
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm1, zmm3, zmm11);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm1, zmm3, zmm15);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf+16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm1, zmm3, zmm19);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf+16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm22 = _mm512_fmadd_ps(zmm0, zmm3, zmm22);
+        zmm23 = _mm512_fmadd_ps(zmm1, zmm3, zmm23);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x64F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+        zmm4 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm4, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm4, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_add_ps( zmm4, zmm19 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm3, zmm22 );
+
+        // c[3,48-63]
+        zmm23 = _mm512_add_ps( zmm4, zmm23 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+        zmm4 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm1, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm2, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_add_ps( zmm3, zmm19 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm4, zmm22 );
+
+        // c[3,48-63]
+        zmm23 = _mm512_add_ps( zmm4, zmm23 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[0,48-63]
+      zmm11 = _mm512_max_ps( zmm1, zmm11 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[1,48-63]
+      zmm15 = _mm512_max_ps( zmm1, zmm15 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[2,32-47]
+      zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+      // c[2,48-63]
+      zmm19 = _mm512_max_ps( zmm1, zmm19 );
+
+      // c[3,0-15]
+      zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+      // c[3,16-31]
+      zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+      // c[3,32-47]
+      zmm22 = _mm512_max_ps( zmm1, zmm22 );
+
+      // c[3,48-63]
+      zmm23 = _mm512_max_ps( zmm1, zmm23 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[0, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm11)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[1, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm15)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[2, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+      // c[2, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm19)
+
+      // c[3, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+      // c[3, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+      // c[3, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm22)
+
+      // c[3, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm23)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x64F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 48-63]
+      GELU_TANH_F32S_AVX512(zmm11, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 48-63]
+      GELU_TANH_F32S_AVX512(zmm15, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 32-47]
+      GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 48-63]
+      GELU_TANH_F32S_AVX512(zmm19, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 0-15]
+      GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 16-31]
+      GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 32-47]
+      GELU_TANH_F32S_AVX512(zmm22, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 48-63]
+      GELU_TANH_F32S_AVX512(zmm23, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x64F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[0, 48-63]
+      GELU_ERF_F32S_AVX512(zmm11, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[1, 48-63]
+      GELU_ERF_F32S_AVX512(zmm15, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[2, 32-47]
+      GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+      // c[2, 48-63]
+      GELU_ERF_F32S_AVX512(zmm19, zmm0, zmm1, zmm2)
+
+      // c[3, 0-15]
+      GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+      // c[3, 16-31]
+      GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+      // c[3, 32-47]
+      GELU_ERF_F32S_AVX512(zmm22, zmm0, zmm1, zmm2)
+
+      // c[3, 48-63]
+      GELU_ERF_F32S_AVX512(zmm23, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x64F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[0, 48-63]
+      CLIP_F32S_AVX512(zmm11, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[1, 48-63]
+      CLIP_F32S_AVX512(zmm15, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[2, 32-47]
+      CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+      // c[2, 48-63]
+      CLIP_F32S_AVX512(zmm19, zmm0, zmm1)
+
+      // c[3, 0-15]
+      CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+      // c[3, 16-31]
+      CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+      // c[3, 32-47]
+      CLIP_F32S_AVX512(zmm22, zmm0, zmm1)
+
+      // c[3, 48-63]
+      CLIP_F32S_AVX512(zmm23, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x64F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8); 
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    _mm512_storeu_ps(cbuf + 48, zmm11);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    _mm512_storeu_ps(cbuf + 48, zmm15);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    _mm512_storeu_ps(cbuf + 32, zmm18);
+    _mm512_storeu_ps(cbuf + 48, zmm19);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm20);
+    _mm512_storeu_ps(cbuf + 16, zmm21);
+    _mm512_storeu_ps(cbuf + 32, zmm22);
+    _mm512_storeu_ps(cbuf + 48, zmm23);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x64)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_3x64F_DISABLE,
+              &&POST_OPS_BIAS_3x64F,
+              &&POST_OPS_RELU_3x64F,
+              &&POST_OPS_RELU_SCALE_3x64F,
+              &&POST_OPS_GELU_TANH_3x64F,
+              &&POST_OPS_GELU_ERF_3x64F,
+              &&POST_OPS_CLIP_3x64F
+            };
+    // 3x64 f32 fringe tile: accumulate A(3 rows) x B(64 cols) over k0 steps, apply
+    // alpha and beta*C, run the post-op labels, store. k_iter: unsigned copy of k0.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+    __m512 zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15;
+    __m512 zmm16, zmm17, zmm18, zmm19;
+    // zmm0-zmm7 are scratch; zmm8-zmm19 hold the 3x4 grid of 16-float accumulators.
+    /* zero the accumulator registers */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm11);
+    ZERO_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15);
+    ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load the first 32 elements of the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load the next 32 elements of the current row of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+        zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
+
+        /*Broadcast the current-column element of each of the 3 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm7, zmm2, zmm11);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm7, zmm3, zmm15);
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm7, zmm4, zmm19);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,  zmm9,  zmm10, zmm11, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19, zmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //load c with unaligned loads (matching the unaligned stores below; rows
+        //of c need not be 64-byte aligned), scale by beta, add to accumulators
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm1, zmm3, zmm11);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm1, zmm3, zmm15);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf+16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm1, zmm3, zmm19);
+    }
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x64F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+        zmm4 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm4, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm4, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_add_ps( zmm4, zmm19 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm1, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm2, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_add_ps( zmm3, zmm19 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[0,48-63]
+      zmm11 = _mm512_max_ps( zmm1, zmm11 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[1,48-63]
+      zmm15 = _mm512_max_ps( zmm1, zmm15 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[2,32-47]
+      zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+      // c[2,48-63]
+      zmm19 = _mm512_max_ps( zmm1, zmm19 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[0, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm11)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[1, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm15)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[2, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+      // c[2, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm19)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x64F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 48-63]
+      GELU_TANH_F32S_AVX512(zmm11, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 48-63]
+      GELU_TANH_F32S_AVX512(zmm15, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 32-47]
+      GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 48-63]
+      GELU_TANH_F32S_AVX512(zmm19, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x64F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[0, 48-63]
+      GELU_ERF_F32S_AVX512(zmm11, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[1, 48-63]
+      GELU_ERF_F32S_AVX512(zmm15, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[2, 32-47]
+      GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+      // c[2, 48-63]
+      GELU_ERF_F32S_AVX512(zmm19, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x64F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[0, 48-63]
+      CLIP_F32S_AVX512(zmm11, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[1, 48-63]
+      CLIP_F32S_AVX512(zmm15, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[2, 32-47]
+      CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+      // c[2, 48-63]
+      CLIP_F32S_AVX512(zmm19, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x64F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    _mm512_storeu_ps(cbuf + 48, zmm11);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    _mm512_storeu_ps(cbuf + 48, zmm15);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    _mm512_storeu_ps(cbuf + 32, zmm18);
+    _mm512_storeu_ps(cbuf + 48, zmm19);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x64)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_2x64F_DISABLE,
+              &&POST_OPS_BIAS_2x64F,
+              &&POST_OPS_RELU_2x64F,
+              &&POST_OPS_RELU_SCALE_2x64F,
+              &&POST_OPS_GELU_TANH_2x64F,
+              &&POST_OPS_GELU_ERF_2x64F,
+              &&POST_OPS_CLIP_2x64F
+            };
+    // 2x64 f32 fringe tile: accumulate A(2 rows) x B(64 cols) over k0 steps, apply
+    // alpha and beta*C, run the post-op labels, store. k_iter: unsigned copy of k0.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+    __m512 zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15;
+    // zmm0-zmm7 are scratch; zmm8-zmm15 hold the 2x4 grid of 16-float accumulators.
+    /* zero the accumulator registers */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm11);
+    ZERO_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load the first 32 elements of the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load the next 32 elements of the current row of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+        zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
+
+        /*Broadcast the current-column element of each of the 2 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm7, zmm2, zmm11);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm7, zmm3, zmm15);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,  zmm9,  zmm10, zmm11, zmm0);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15, zmm0);
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //load c with unaligned loads (matching the unaligned stores below; rows
+        //of c need not be 64-byte aligned), scale by beta, add to accumulators
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm1, zmm3, zmm11);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm1 = _mm512_loadu_ps(_cbuf + 48);
+        zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm1, zmm3, zmm15);
+    }
+
+    // Post Ops: the jump macros (defined elsewhere) presumably dispatch via post_ops_labels.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x64F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+        zmm4 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm4, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm4, zmm15 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm1, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_add_ps( zmm2, zmm15 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[0,48-63]
+      zmm11 = _mm512_max_ps( zmm1, zmm11 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[1,48-63]
+      zmm15 = _mm512_max_ps( zmm1, zmm15 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[0, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm11)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[1, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm15)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x64F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 48-63]
+      GELU_TANH_F32S_AVX512(zmm11, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 48-63]
+      GELU_TANH_F32S_AVX512(zmm15, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x64F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[0, 48-63]
+      GELU_ERF_F32S_AVX512(zmm11, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[1, 48-63]
+      GELU_ERF_F32S_AVX512(zmm15, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x64F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[0, 48-63]
+      CLIP_F32S_AVX512(zmm11, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[1, 48-63]
+      CLIP_F32S_AVX512(zmm15, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x64F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    _mm512_storeu_ps(cbuf + 48, zmm11);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    _mm512_storeu_ps(cbuf + 48, zmm15);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_1x64)
+{
+    // 1x64 f32 fringe kernel: one row of A times 64 columns of B over k0
+    // steps, then alpha/beta scaling, the post-op chain and a store to C.
+    // Jump table of GNU label addresses, presumably read by POST_OP_LABEL_* macros.
+    static void* post_ops_labels[] = {
+        &&POST_OPS_1x64F_DISABLE,
+        &&POST_OPS_BIAS_1x64F,
+        &&POST_OPS_RELU_1x64F,
+        &&POST_OPS_RELU_SCALE_1x64F,
+        &&POST_OPS_GELU_TANH_1x64F,
+        &&POST_OPS_GELU_ERF_1x64F,
+        &&POST_OPS_CLIP_1x64F
+    };
+    uint64_t k_iter = k0; // fixed-width copy of the k extent
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+    __m512 zmm8, zmm9, zmm10, zmm11;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm11);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load columns 0-31 of the current row of B */
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /* Load columns 32-63 of the current row of B */
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+        zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
+
+        /* Broadcast the current-column element of row 0 of A */
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm7, zmm2, zmm11);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,  zmm9,  zmm10, zmm11, zmm0);
+
+    if ( beta != 0.0 )
+    {
+        // Add beta * C to the accumulators. C is read with unaligned loads
+        // because nothing here guarantees 64-byte alignment of cbuf.
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(cbuf);
+        zmm1 = _mm512_loadu_ps(cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_loadu_ps(cbuf + 32);
+        zmm1 = _mm512_loadu_ps(cbuf + 48);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm1, zmm3, zmm11);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x64F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+        zmm4 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm4, zmm11 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_add_ps( zmm1, zmm11 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[0,48-63]
+      zmm11 = _mm512_max_ps( zmm1, zmm11 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x64F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[0, 48-63]
+      RELU_SCALE_OP_F32S_AVX512(zmm11)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x64F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 48-63]
+      GELU_TANH_F32S_AVX512(zmm11, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x64F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[0, 48-63]
+      GELU_ERF_F32S_AVX512(zmm11, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x64F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[0, 48-63]
+      CLIP_F32S_AVX512(zmm11, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x64F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    _mm512_storeu_ps(cbuf + 48, zmm11);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_5x48)
+{
+    // 5x48 f32 fringe kernel: five rows of A times 48 columns of B over k0
+    // steps, then alpha/beta scaling, the post-op chain and a store to C.
+    // Jump table of GNU label addresses, presumably read by POST_OP_LABEL_* macros.
+    static void* post_ops_labels[] = {
+        &&POST_OPS_5x48F_DISABLE,
+        &&POST_OPS_BIAS_5x48F,
+        &&POST_OPS_RELU_5x48F,
+        &&POST_OPS_RELU_SCALE_5x48F,
+        &&POST_OPS_GELU_TANH_5x48F,
+        &&POST_OPS_GELU_ERF_5x48F,
+        &&POST_OPS_CLIP_5x48F
+    };
+    uint64_t k_iter = k0; // fixed-width copy of the k extent
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6;
+    __m512 zmm8, zmm9, zmm10, zmm12, zmm13, zmm14;
+    __m512 zmm16, zmm17, zmm18, zmm20, zmm21, zmm22;
+    __m512 zmm24, zmm25, zmm26, zmm28;
+
+    /* zero the accumulators (zmm28 only pads the 4-register macros) */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm12);
+    ZERO_ACC_ZMM_4_REG(zmm13, zmm14,zmm16, zmm17);
+    ZERO_ACC_ZMM_4_REG(zmm18, zmm20, zmm21, zmm22);
+    ZERO_ACC_ZMM_4_REG(zmm24, zmm25, zmm26, zmm28);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /* Load columns 0-31 of the current row of B */
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /* Load columns 32-47 of the current row of B */
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+
+        /* Broadcast the current-column element of rows 0-3 of A */
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+
+        zmm2 = _mm512_set1_ps(*(abuf + 4*rs_a)); //broadcast c0r4 (zmm2 reused after row 0)
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+        zmm22 = _mm512_fmadd_ps(zmm6, zmm5, zmm22);
+
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm2, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm2, zmm25);
+        zmm26 = _mm512_fmadd_ps(zmm6, zmm2, zmm26);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm12,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm13,zmm14,zmm16,zmm17,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm18,zmm20,zmm21,zmm22,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm24,zmm25,zmm26,zmm28,zmm0)
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      // Add beta * C to the accumulators, one row per rs_c step. C is read
+      // with unaligned loads: nothing here guarantees 64-byte alignment.
+      zmm3 = _mm512_set1_ps(beta);
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+      zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+      zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+      zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+      zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm22 = _mm512_fmadd_ps(zmm0, zmm3, zmm22);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm24 = _mm512_fmadd_ps(zmm0, zmm3, zmm24);
+      zmm25 = _mm512_fmadd_ps(zmm1, zmm3, zmm25);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm26 = _mm512_fmadd_ps(zmm0, zmm3, zmm26);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x48F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm3, zmm22 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_add_ps( zmm1, zmm24 );
+
+        // c[4, 16-31]
+        zmm25 = _mm512_add_ps( zmm2, zmm25 );
+
+        // c[4,32-47]
+        zmm26 = _mm512_add_ps( zmm3, zmm26 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+        zmm4 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 ) );
+        zmm5 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm4, zmm22 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_add_ps( zmm5, zmm24 );
+
+        // c[4, 16-31]
+        zmm25 = _mm512_add_ps( zmm5, zmm25 );
+
+        // c[4,32-47]
+        zmm26 = _mm512_add_ps( zmm5, zmm26 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[2,32-47]
+      zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+      // c[3,0-15]
+      zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+      // c[3,16-31]
+      zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+      // c[3,32-47]
+      zmm22 = _mm512_max_ps( zmm1, zmm22 );
+
+      // c[4,0-15]
+      zmm24 = _mm512_max_ps( zmm1, zmm24 );
+
+      // c[4,16-31]
+      zmm25 = _mm512_max_ps( zmm1, zmm25 );
+
+      // c[4,32-47]
+      zmm26 = _mm512_max_ps( zmm1, zmm26 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[2, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+      // c[3, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+      // c[3, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+      // c[3, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm22)
+
+      // c[4, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm24)
+
+      // c[4, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm25)
+
+      // c[4, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm26)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x48F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 32-47]
+      GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 0-15]
+      GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 16-31]
+      GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 32-47]
+      GELU_TANH_F32S_AVX512(zmm22, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 0-15]
+      GELU_TANH_F32S_AVX512(zmm24, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 16-31]
+      GELU_TANH_F32S_AVX512(zmm25, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 32-47]
+      GELU_TANH_F32S_AVX512(zmm26, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x48F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[2, 32-47]
+      GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+      // c[3, 0-15]
+      GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+      // c[3, 16-31]
+      GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+      // c[3, 32-47]
+      GELU_ERF_F32S_AVX512(zmm22, zmm0, zmm1, zmm2)
+
+      // c[4, 0-15]
+      GELU_ERF_F32S_AVX512(zmm24, zmm0, zmm1, zmm2)
+
+      // c[4, 16-31]
+      GELU_ERF_F32S_AVX512(zmm25, zmm0, zmm1, zmm2)
+
+      // c[4, 32-47]
+      GELU_ERF_F32S_AVX512(zmm26, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x48F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[2, 32-47]
+      CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+      // c[3, 0-15]
+      CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+      // c[3, 16-31]
+      CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+      // c[3, 32-47]
+      CLIP_F32S_AVX512(zmm22, zmm0, zmm1)
+
+      // c[4, 0-15]
+      CLIP_F32S_AVX512(zmm24, zmm0, zmm1)
+
+      // c[4, 16-31]
+      CLIP_F32S_AVX512(zmm25, zmm0, zmm1)
+
+      // c[4, 32-47]
+      CLIP_F32S_AVX512(zmm26, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x48F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    _mm512_storeu_ps(cbuf + 32, zmm18);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm20);
+    _mm512_storeu_ps(cbuf + 16, zmm21);
+    _mm512_storeu_ps(cbuf + 32, zmm22);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm24);
+    _mm512_storeu_ps(cbuf + 16, zmm25);
+    _mm512_storeu_ps(cbuf + 32, zmm26);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x48)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x48F_DISABLE,
+              &&POST_OPS_BIAS_4x48F,
+              &&POST_OPS_RELU_4x48F,
+              &&POST_OPS_RELU_SCALE_4x48F,
+              &&POST_OPS_GELU_TANH_4x48F,
+              &&POST_OPS_GELU_ERF_4x48F,
+              &&POST_OPS_CLIP_4x48F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6;
+    __m512 zmm8, zmm9, zmm10, zmm12, zmm13, zmm14;
+    __m512 zmm16, zmm17, zmm18, zmm20, zmm21, zmm22;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm12);
+    ZERO_ACC_ZMM_4_REG(zmm13, zmm14,zmm16, zmm17);
+    ZERO_ACC_ZMM_4_REG(zmm18, zmm20, zmm21, zmm22);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from row0 of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row 
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load Next 32 elements from row0 of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row 
+        
+        /*Broadcast col0 elements of 12 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1  
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2 
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+        
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+        zmm22 = _mm512_fmadd_ps(zmm6, zmm5, zmm22);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm12,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm13,zmm14,zmm16,zmm17,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm18,zmm20,zmm21,zmm22,zmm0)
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      //load c and multiply with beta and 
+      //add to accumulator and store back
+      zmm3 = _mm512_set1_ps(beta);
+
+      zmm0 = _mm512_load_ps(_cbuf);
+      zmm1 = _mm512_load_ps(_cbuf + 16);
+      zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+      zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+      zmm0 = _mm512_load_ps(_cbuf + 32);
+      zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_load_ps(_cbuf);
+      zmm1 = _mm512_load_ps(_cbuf + 16);
+      zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+      zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+      zmm0 = _mm512_load_ps(_cbuf + 32);
+      zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_load_ps(_cbuf);
+      zmm1 = _mm512_load_ps(_cbuf+16);
+      zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+      zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+      zmm0 = _mm512_load_ps(_cbuf + 32);
+      zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_load_ps(_cbuf);
+      zmm1 = _mm512_load_ps(_cbuf+16);
+      zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+      zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+
+      zmm0 = _mm512_load_ps(_cbuf + 32);
+      zmm22 = _mm512_fmadd_ps(zmm0, zmm3, zmm22);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x48F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm3, zmm22 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+        zmm4 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_add_ps( zmm4, zmm22 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[2,32-47]
+      zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+      // c[3,0-15]
+      zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+      // c[3,16-31]
+      zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+      // c[3,32-47]
+      zmm22 = _mm512_max_ps( zmm1, zmm22 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[2, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+      // c[3, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+      // c[3, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+      // c[3, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm22)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x48F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 32-47]
+      GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 0-15]
+      GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 16-31]
+      GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 32-47]
+      GELU_TANH_F32S_AVX512(zmm22, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x48F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[2, 32-47]
+      GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+      // c[3, 0-15]
+      GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+      // c[3, 16-31]
+      GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+      // c[3, 32-47]
+      GELU_ERF_F32S_AVX512(zmm22, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x48F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[2, 32-47]
+      CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+      // c[3, 0-15]
+      CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+      // c[3, 16-31]
+      CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+      // c[3, 32-47]
+      CLIP_F32S_AVX512(zmm22, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x48F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8); 
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    _mm512_storeu_ps(cbuf + 32, zmm18);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm20);
+    _mm512_storeu_ps(cbuf + 16, zmm21);
+    _mm512_storeu_ps(cbuf + 32, zmm22);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x48)
+{
+    // 3x48 f32 tile: C = post_ops( alpha * A * B + beta * C ), K-extent k0.
+    static void* post_ops_labels[] = {
+              &&POST_OPS_3x48F_DISABLE,
+              &&POST_OPS_BIAS_3x48F,
+              &&POST_OPS_RELU_3x48F,
+              &&POST_OPS_RELU_SCALE_3x48F,
+              &&POST_OPS_GELU_TANH_3x48F,
+              &&POST_OPS_GELU_ERF_3x48F,
+              &&POST_OPS_CLIP_3x48F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6;
+    __m512 zmm8, zmm9, zmm10, zmm12, zmm13, zmm14;
+    __m512 zmm16, zmm17, zmm18;
+
+    /* zero the accumulators: row0 zmm8-10, row1 zmm12-14, row2 zmm16-18 */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm12);
+    ZERO_ACC_ZMM_4_REG(zmm13, zmm14,zmm16, zmm17);
+    zmm18 = _mm512_setzero_ps();
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load columns 0-31 of the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load columns 32-47 of the current row of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+
+        /*Broadcast the current-column element of each of the 3 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm12,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm13,zmm14,zmm16,zmm17,zmm0)
+    zmm18 = _mm512_mul_ps(zmm18, zmm0);
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      //acc += beta * C. Unaligned loads: C is written below with
+      //unaligned stores, so no 64-byte alignment may be assumed here.
+      zmm3 = _mm512_set1_ps(beta);
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+      zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+      zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf+16);
+      zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+      zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+    }
+
+    // Post Ops: walk the post-op list; each section ends by jumping onward.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x48F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_add_ps( zmm3, zmm18 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[2,32-47]
+      zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[2, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x48F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 32-47]
+      GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x48F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[2, 32-47]
+      GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x48F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[2, 32-47]
+      CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x48F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    _mm512_storeu_ps(cbuf + 32, zmm18);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x48)
+{
+    // 2x48 f32 tile: C = post_ops( alpha * A * B + beta * C ), K-extent k0.
+    static void* post_ops_labels[] = {
+              &&POST_OPS_2x48F_DISABLE,
+              &&POST_OPS_BIAS_2x48F,
+              &&POST_OPS_RELU_2x48F,
+              &&POST_OPS_RELU_SCALE_2x48F,
+              &&POST_OPS_GELU_TANH_2x48F,
+              &&POST_OPS_GELU_ERF_2x48F,
+              &&POST_OPS_CLIP_2x48F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6;
+    __m512 zmm8, zmm9, zmm10, zmm12, zmm13, zmm14, zmm16,zmm17;
+
+    /* zero the accumulators; zmm16/zmm17 only pad the 4-register macros */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm12);
+    ZERO_ACC_ZMM_4_REG(zmm13, zmm14,zmm16, zmm17);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load columns 0-31 of the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load columns 32-47 of the current row of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+
+        /*Broadcast the current-column element of each of the 2 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm12,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm13,zmm14,zmm16,zmm17,zmm0)
+
+    if ( beta != 0.0 )
+    {
+      _cbuf = cbuf;
+      //acc += beta * C. Unaligned loads: C is written below with
+      //unaligned stores, so no 64-byte alignment may be assumed here.
+      zmm3 = _mm512_set1_ps(beta);
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+      zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+      _cbuf += rs_c;
+
+      zmm0 = _mm512_loadu_ps(_cbuf);
+      zmm1 = _mm512_loadu_ps(_cbuf + 16);
+      zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+      zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+      zmm0 = _mm512_loadu_ps(_cbuf + 32);
+      zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+    }
+
+    // Post Ops: walk the post-op list; each section ends by jumping onward.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x48F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm3, zmm14 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_add_ps( zmm2, zmm14 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[1,32-47]
+      zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[1, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x48F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 32-47]
+      GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x48F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[1, 32-47]
+      GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x48F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[1, 32-47]
+      CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x48F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    _mm512_storeu_ps(cbuf + 32, zmm14);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_1x48)
+{
+    // 1x48 f32 tile: C = post_ops( alpha * A * B + beta * C ), K-extent k0.
+    static void* post_ops_labels[] = {
+              &&POST_OPS_1x48F_DISABLE,
+              &&POST_OPS_BIAS_1x48F,
+              &&POST_OPS_RELU_1x48F,
+              &&POST_OPS_RELU_SCALE_1x48F,
+              &&POST_OPS_GELU_TANH_1x48F,
+              &&POST_OPS_GELU_ERF_1x48F,
+              &&POST_OPS_CLIP_1x48F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6;
+    __m512 zmm8, zmm9, zmm10, zmm12;
+
+    /* zero the accumulators; zmm12 only pads the 4-register macros */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm12);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load columns 0-31 of the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load columns 32-47 of the current row of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row
+
+        /*Broadcast the current-column element of the single row of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm12,zmm0)
+
+    if ( beta != 0.0 )
+    {
+      //acc += beta * C. Unaligned loads: C is written below with
+      //unaligned stores, so no 64-byte alignment may be assumed here.
+      zmm3 = _mm512_set1_ps(beta);
+
+      zmm0 = _mm512_loadu_ps(cbuf);
+      zmm1 = _mm512_loadu_ps(cbuf + 16);
+      zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+      zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+      zmm0 = _mm512_loadu_ps(cbuf + 32);
+      zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+    }
+
+    // Post Ops: walk the post-op list; each section ends by jumping onward.
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x48F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+        zmm3 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm3, zmm10 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_add_ps( zmm1, zmm10 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[0,32-47]
+      zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x48F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[0, 32-47]
+      RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x48F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 32-47]
+      GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x48F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[0, 32-47]
+      GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x48F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[0, 32-47]
+      CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x48F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    _mm512_storeu_ps(cbuf + 32, zmm10);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_5x32)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_5x32F_DISABLE,
+              &&POST_OPS_BIAS_5x32F,
+              &&POST_OPS_RELU_5x32F,
+              &&POST_OPS_RELU_SCALE_5x32F,
+              &&POST_OPS_GELU_TANH_5x32F,
+              &&POST_OPS_GELU_ERF_5x32F,
+              &&POST_OPS_CLIP_5x32F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5;
+    __m512 zmm8, zmm9, zmm12, zmm13;
+    __m512 zmm16, zmm17, zmm20, zmm21;
+    __m512 zmm24, zmm25, zmm28, zmm29;
+
+    /* zero the accumulator registers */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm12, zmm13);
+    ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm20, zmm21);
+    ZERO_ACC_ZMM_4_REG(zmm24, zmm25, zmm28, zmm29);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from row0 of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row 
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+       /*Broadcast col0 elements of 12 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1  
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2 
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm2 = _mm512_set1_ps(*(abuf + 4*rs_a)); //broadcast c0r4
+        
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm2, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm2, zmm25);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+      
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm12,zmm13,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm16,zmm17,zmm20,zmm21,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm24,zmm25,zmm28,zmm29,zmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf+16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf+16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf+16);
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm3, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm3, zmm25);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_5x32F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_add_ps( zmm1, zmm24 );
+
+        // c[4, 16-31]
+        zmm25 = _mm512_add_ps( zmm2, zmm25 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+        zmm4 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 ) );
+        zmm5 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 4 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_add_ps( zmm5, zmm24 );
+
+        // c[4, 16-31]
+        zmm25 = _mm512_add_ps( zmm5, zmm25 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_5x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[3,0-15]
+      zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+      // c[3,16-31]
+      zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+      // c[4,0-15]
+      zmm24 = _mm512_max_ps( zmm1, zmm24 );
+
+      // c[4,16-31]
+      zmm25 = _mm512_max_ps( zmm1, zmm25 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_5x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[3, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+      // c[3, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+      // c[4, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm24)
+
+      // c[4, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm25)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_5x32F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 0-15]
+      GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 16-31]
+      GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 0-15]
+      GELU_TANH_F32S_AVX512(zmm24, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[4, 16-31]
+      GELU_TANH_F32S_AVX512(zmm25, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_5x32F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[3, 0-15]
+      GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+      // c[3, 16-31]
+      GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+      // c[4, 0-15]
+      GELU_ERF_F32S_AVX512(zmm24, zmm0, zmm1, zmm2)
+
+      // c[4, 16-31]
+      GELU_ERF_F32S_AVX512(zmm25, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_5x32F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[3, 0-15]
+      CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+      // c[3, 16-31]
+      CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+      // c[4, 0-15]
+      CLIP_F32S_AVX512(zmm24, zmm0, zmm1)
+
+      // c[4, 16-31]
+      CLIP_F32S_AVX512(zmm25, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_5x32F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8); 
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm20);
+    _mm512_storeu_ps(cbuf + 16, zmm21);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm24);
+    _mm512_storeu_ps(cbuf + 16, zmm25);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_4x32)
+{
+    // 4x32 f32 M-fringe tile: acc = alpha * (A * B) [+ beta * C], after which
+    // the chained post-ops run on the accumulators and the tile is stored to C.
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_4x32F_DISABLE,
+              &&POST_OPS_BIAS_4x32F,
+              &&POST_OPS_RELU_4x32F,
+              &&POST_OPS_RELU_SCALE_4x32F,
+              &&POST_OPS_GELU_TANH_4x32F,
+              &&POST_OPS_GELU_ERF_4x32F,
+              &&POST_OPS_CLIP_4x32F
+            };
+    uint64_t k_iter = k0; // trip count of the k loop
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5;
+    __m512 zmm8, zmm9, zmm12, zmm13;
+    __m512 zmm16, zmm17, zmm20, zmm21;
+
+    /* zero the accumulators: two zmm (32 floats) per row of the tile */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm12, zmm13);
+    ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm20, zmm21);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Broadcast the current-column element of each of the 4 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm12,zmm13,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm16,zmm17,zmm20,zmm21,zmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // acc += beta * C. C is stored with storeu below, so read it with
+        // loadu too: nothing here guarantees 64-byte alignment of C rows.
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+    }
+
+    // Post Ops: dispatch presumably goes through post_ops_labels (see macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_4x32F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm2, zmm21 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+        zmm4 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 3 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+        // c[3, 16-31]
+        zmm21 = _mm512_add_ps( zmm4, zmm21 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_4x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      // c[3,0-15]
+      zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+      // c[3,16-31]
+      zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_4x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      // c[3, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+      // c[3, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_4x32F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 0-15]
+      GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[3, 16-31]
+      GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_4x32F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      // c[3, 0-15]
+      GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+      // c[3, 16-31]
+      GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_4x32F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      // c[3, 0-15]
+      CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+      // c[3, 16-31]
+      CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_4x32F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm20);
+    _mm512_storeu_ps(cbuf + 16, zmm21);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_3x32)
+{
+    // 3x32 f32 M-fringe tile: acc = alpha * (A * B) [+ beta * C], after which
+    // the chained post-ops run on the accumulators and the tile is stored to C.
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_3x32F_DISABLE,
+              &&POST_OPS_BIAS_3x32F,
+              &&POST_OPS_RELU_3x32F,
+              &&POST_OPS_RELU_SCALE_3x32F,
+              &&POST_OPS_GELU_TANH_3x32F,
+              &&POST_OPS_GELU_ERF_3x32F,
+              &&POST_OPS_CLIP_3x32F
+            };
+    uint64_t k_iter = k0; // trip count of the k loop
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5;
+    __m512 zmm8, zmm9, zmm12, zmm13;
+    __m512 zmm16, zmm17, zmm20, zmm21;
+
+    /* zero the accumulators; zmm20/zmm21 only pad the 4-register macros */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm12, zmm13);
+    ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm20, zmm21);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Broadcast the current-column element of each of the 3 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm12,zmm13,zmm0)
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm16,zmm17,zmm20,zmm21,zmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // acc += beta * C. C is stored with storeu below, so read it with
+        // loadu too: nothing here guarantees 64-byte alignment of C rows.
+        // (No pointer advance after the last row; _cbuf is not used again.)
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+    }
+
+    // Post Ops: dispatch presumably goes through post_ops_labels (see macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_3x32F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm2, zmm17 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+        zmm3 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 2 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+        // c[2, 16-31]
+        zmm17 = _mm512_add_ps( zmm3, zmm17 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_3x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      // c[2,0-15]
+      zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+      // c[2,16-31]
+      zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_3x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      // c[2, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+      // c[2, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_3x32F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 0-15]
+      GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[2, 16-31]
+      GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_3x32F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      // c[2, 0-15]
+      GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+      // c[2, 16-31]
+      GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_3x32F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      // c[2, 0-15]
+      CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+      // c[2, 16-31]
+      CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_3x32F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm16);
+    _mm512_storeu_ps(cbuf + 16, zmm17);
+}
+
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_2x32)
+{
+    // 2x32 f32 M-fringe tile: acc = alpha * (A * B) [+ beta * C], after which
+    // the chained post-ops run on the accumulators and the tile is stored to C.
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_2x32F_DISABLE,
+              &&POST_OPS_BIAS_2x32F,
+              &&POST_OPS_RELU_2x32F,
+              &&POST_OPS_RELU_SCALE_2x32F,
+              &&POST_OPS_GELU_TANH_2x32F,
+              &&POST_OPS_GELU_ERF_2x32F,
+              &&POST_OPS_CLIP_2x32F
+            };
+    uint64_t k_iter = k0; // trip count of the k loop
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5;
+    __m512 zmm8, zmm9, zmm12, zmm13;
+
+    /* zero the accumulators: two zmm (32 floats) per row of the tile */
+    ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm12, zmm13);
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+    float *_cbuf = NULL;
+
+    for(dim_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Broadcast the current-column element of each of the 2 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    zmm0 = _mm512_set1_ps(alpha);
+
+    ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm12,zmm13,zmm0)
+
+    if ( beta != 0.0 )
+    {
+        _cbuf = cbuf;
+        // acc += beta * C. C is stored with storeu below, so read it with
+        // loadu too: nothing here guarantees 64-byte alignment of C rows.
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+    }
+
+    // Post Ops: dispatch presumably goes through post_ops_labels (see macros)
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_2x32F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 =
+          _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+      }
+      else
+      {
+        // If original output was columns major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 0 ) );
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+            post_ops_attr.post_op_c_i + 1 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+        // c[1, 16-31]
+        zmm13 = _mm512_add_ps( zmm2, zmm13 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_2x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      // c[1,0-15]
+      zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+      // c[1,16-31]
+      zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_2x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 =
+        _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      // c[1, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+      // c[1, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_2x32F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 0-15]
+      GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[1, 16-31]
+      GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_2x32F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      // c[1, 0-15]
+      GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+      // c[1, 16-31]
+      GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_2x32F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      // c[1, 0-15]
+      CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+      // c[1, 16-31]
+      CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_2x32F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+    cbuf += rs_c;
+    _mm512_storeu_ps(cbuf, zmm12);
+    _mm512_storeu_ps(cbuf + 16, zmm13);
+}
+
+/* 1x32 m-fringe kernel: c[0,0-31] = alpha * sum_k( a[0,k] * b[k,0-31] ),
+ * plus beta * c[0,0-31] when beta != 0, followed by the post-op chain.
+ * Arguments come from LPGEMM_M_FRINGE_KERN, which is defined elsewhere. */
+LPGEMM_M_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_1x32)
+{
+    // Label table; presumably indexed by the POST_OP_LABEL_* jump macros.
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_1x32F_DISABLE,
+              &&POST_OPS_BIAS_1x32F,
+              &&POST_OPS_RELU_1x32F,
+              &&POST_OPS_RELU_SCALE_1x32F,
+              &&POST_OPS_GELU_TANH_1x32F,
+              &&POST_OPS_GELU_ERF_1x32F,
+              &&POST_OPS_CLIP_1x32F
+            };
+    // Unsigned local copy of the k-dimension trip count.
+    uint64_t k_iter = k0;
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5;
+    __m512 zmm8, zmm9;
+
+    /* Zero the accumulators for c[0,0-15] and c[0,16-31]. */
+    zmm8 = _mm512_setzero_ps();
+    zmm9 = _mm512_setzero_ps();
+
+    float *abuf = (float *)a;
+    float *bbuf = (float *)b;
+    float *cbuf = (float *)c;
+
+    for(uint64_t k = 0; k < k_iter; k++)
+    {
+        /*Load 32 elements from the current row of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Broadcast the current column element of the single row of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+    }//kloop
+
+    /* Scale the accumulated products by alpha. */
+    zmm0 = _mm512_set1_ps(alpha);
+    zmm8 = _mm512_mul_ps(zmm8, zmm0);
+    zmm9 = _mm512_mul_ps(zmm9, zmm0);
+
+    if ( beta != 0.0 )
+    {
+        // Add beta * c; c may not be 64-byte aligned, so use unaligned loads.
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(cbuf);
+        zmm1 = _mm512_loadu_ps(cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+    }
+
+    // Post Ops
+    lpgemm_post_op* post_ops_list_temp = post_ops_list;
+    POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_1x32F:
+    {
+      if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+           ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+      {
+        zmm1 = _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+                                post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+        zmm2 = _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+                                post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm2, zmm9 );
+      }
+      else
+      {
+        // If original output was column major, then by the time
+        // kernel sees it, the matrix would be accessed as if it were
+        // transposed. Due to this the bias array will be accessed by
+        // the ic index, and each bias element corresponds to an
+        // entire row of the transposed output array, instead of an
+        // entire column.
+        zmm1 = _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+                                  post_ops_attr.post_op_c_i + 0 ) );
+
+        // c[0,0-15]
+        zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_add_ps( zmm1, zmm9 );
+      }
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_1x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+
+      // c[0,0-15]
+      zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+      // c[0, 16-31]
+      zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_RELU_SCALE_1x32F:
+    {
+      zmm1 = _mm512_setzero_ps();
+      zmm2 = _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+      __mmask16 relu_cmp_mask;
+
+      // c[0, 0-15]
+      RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+      // c[0, 16-31]
+      RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_TANH_1x32F:
+    {
+      __m512i zmm6;
+      // c[0, 0-15]
+      GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      // c[0, 16-31]
+      GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_GELU_ERF_1x32F:
+    {
+      // c[0, 0-15]
+      GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+      // c[0, 16-31]
+      GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_CLIP_1x32F:
+    {
+      zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+      zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+      // c[0, 0-15]
+      CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+      // c[0, 16-31]
+      CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+      POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+    }
+POST_OPS_1x32F_DISABLE:
+    ;
+
+    _mm512_storeu_ps(cbuf, zmm8);
+    _mm512_storeu_ps(cbuf + 16, zmm9);
+}
+#endif
diff --git a/kernels/zen4/lpgemm/f32f32f32/lpgemm_kernel_macros_f32.h b/kernels/zen4/lpgemm/f32f32f32/lpgemm_kernel_macros_f32.h
new file mode 100644
index 0000000000..f24bca9e1f
--- /dev/null
+++ b/kernels/zen4/lpgemm/f32f32f32/lpgemm_kernel_macros_f32.h
@@ -0,0 +1,78 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+  Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_F32_SGEMM_KERN_MACROS_H
+#define LPGEMM_F32_SGEMM_KERN_MACROS_H
+
+#include "../gelu_avx512.h"
+#include "../math_utils_avx512.h"
+
+/* ReLU scale (Parametric ReLU): f(x) = x when x > 0, f(x) = a*x when x <= 0.
+ * Uses zmm1 (compared against), zmm2 (scale) and relu_cmp_mask from the caller's scope. */
+#define RELU_SCALE_OP_F32S_AVX512(reg) \
+	/* Generate mask of elements <= zmm1 (callers set zmm1 to zero). */ \
+	relu_cmp_mask = _mm512_cmple_ps_mask( reg, zmm1 ); \
+	/* Multiply only the masked elements by zmm2; others keep their value. */ \
+	reg = _mm512_mask_mul_ps( reg, relu_cmp_mask, reg, zmm2 );
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) )
+ * Forwards its arguments unchanged to GELU_TANH_F32_AVX512_DEF (defined elsewhere). */
+#define GELU_TANH_F32S_AVX512(reg, r, r2, x, z, dn, x_tanh, q) \
+	GELU_TANH_F32_AVX512_DEF(reg, r, r2, x, z, dn, x_tanh, q);
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 ))
+ * Forwards its arguments unchanged to GELU_ERF_F32_AVX512_DEF (defined elsewhere). */
+#define GELU_ERF_F32S_AVX512(reg, r, x, x_erf) \
+	GELU_ERF_F32_AVX512_DEF(reg, r, x, x_erf);
+
+/* Clip: reg = min( max( reg, min ), max ), element-wise on 16 floats. */
+#define CLIP_F32S_AVX512(reg, min, max) \
+	reg = _mm512_min_ps( _mm512_max_ps( reg, min ), max );
+
+/* Assign _mm512_setzero_ps() to each of the four given accumulators. */
+#define ZERO_ACC_ZMM_4_REG(acc0,acc1,acc2,acc3) \
+      acc0 = _mm512_setzero_ps(); \
+      acc1 = _mm512_setzero_ps(); \
+      acc2 = _mm512_setzero_ps(); \
+      acc3 = _mm512_setzero_ps();
+
+/* Multiply each of the four accumulators in place by the vector `scale`. */
+#define ALPHA_MUL_ACC_ZMM_4_REG(acc0,acc1,acc2,acc3,scale) \
+      acc0 = _mm512_mul_ps(acc0,scale); \
+      acc1 = _mm512_mul_ps(acc1,scale); \
+      acc2 = _mm512_mul_ps(acc2,scale); \
+      acc3 = _mm512_mul_ps(acc3,scale);
+ 
+#endif //LPGEMM_F32_SGEMM_KERN_MACROS_H
+
diff --git a/kernels/zen4/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx512.c b/kernels/zen4/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx512.c
new file mode 100644
index 0000000000..d1d14209ba
--- /dev/null
+++ b/kernels/zen4/lpgemm/f32f32f32/lpgemm_m_kernel_f32_avx512.c
@@ -0,0 +1,2233 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+  Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#include "immintrin.h"
+#include "xmmintrin.h"
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_kernel_macros_f32.h"
+
+#define MR 6  // Rows of C per main-loop tile (used as the row step: c + m * MR * rs_c).
+#define NR 64 // Columns of C per tile; presumably matches the 4 x 16-float ZMM loads per row.
+
+LPGEMM_MAIN_KERN(float,float,float,f32f32f32of32_avx512_6x64m)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_6x64F_DISABLE,
+              &&POST_OPS_BIAS_6x64F,
+              &&POST_OPS_RELU_6x64F,
+              &&POST_OPS_RELU_SCALE_6x64F,
+              &&POST_OPS_GELU_TANH_6x64F,
+              &&POST_OPS_GELU_ERF_6x64F,
+              &&POST_OPS_CLIP_6x64F
+            };
+    uint64_t n_left = n0 % 64;  //n0 is expected to be n0<=NR
+
+    // First check whether this is a edge case in the n dimension.
+    // If so, dispatch other 12x?m kernels, as needed.
+    if ( n_left )
+    {
+        float*  cij = (float* )c;
+        float*  bj  = (float* )b;
+        float*  ai  = (float* )a;
+
+        if ( 48 <= n_left )
+        {
+            const dim_t nr_cur = 48;
+
+            lpgemm_rowvar_f32f32f32of32_avx512_6x48m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 48;
+        }
+
+        if ( 32 <= n_left )
+        {
+            const dim_t nr_cur = 32;
+
+            lpgemm_rowvar_f32f32f32of32_avx512_6x32m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 32;
+        }
+
+        if ( 16 <= n_left )
+        {
+            const dim_t nr_cur = 16;
+
+            lpgemm_rowvar_f32f32f32of32_6x16m
+            (
+              m0, nr_cur, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c, cs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 16;
+        }
+
+        if ( 8 <= n_left )
+        {
+            const dim_t nr_cur = 8;
+
+            lpgemm_rowvar_f32f32f32of32_6x8m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 8;
+        }
+  
+        if ( 4 <= n_left )
+        {
+            const dim_t nr_cur = 4;
+
+            lpgemm_rowvar_f32f32f32of32_6x4m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 4;
+        }
+
+        if ( 2 <= n_left )
+        {
+            const dim_t nr_cur = 2;
+  
+            lpgemm_rowvar_f32f32f32of32_6x2m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+            cij += nr_cur*cs_c; bj += nr_cur*cs_b; n_left -= nr_cur;
+			post_ops_attr.post_op_c_j += 2;
+        }
+
+        if ( 1 == n_left )
+        {
+            lpgemm_rowvar_f32f32f32of32_6x1m
+            (
+              m0, k0,
+              ai,  rs_a, cs_a, ps_a,
+              bj,  rs_b, cs_b,
+              cij, rs_c,
+              alpha, beta,
+              post_ops_list, post_ops_attr
+            );
+        }
+
+        return;
+    }
+
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    // Query the panel stride of A and convert it to units of bytes.
+    if ( m_iter == 0 ){    goto consider_edge_cases; }
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
+    __m512 zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, zmm14, zmm15;
+    __m512 zmm16, zmm17, zmm18, zmm19, zmm20, zmm21, zmm22, zmm23;
+    __m512 zmm24, zmm25, zmm26, zmm27, zmm28, zmm29, zmm30, zmm31;
+
+    /*Produce MRxNR outputs */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm11);
+      ZERO_ACC_ZMM_4_REG(zmm12, zmm13, zmm14, zmm15);
+      ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm18, zmm19);
+      ZERO_ACC_ZMM_4_REG(zmm20, zmm21, zmm22, zmm23);
+      ZERO_ACC_ZMM_4_REG(zmm24, zmm25, zmm26, zmm27);
+      ZERO_ACC_ZMM_4_REG(zmm28, zmm29, zmm30, zmm31);
+
+      _mm256_zeroupper();
+
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // Move to next MRxKC in MCxKC (where MC>=MR)
+      bbuf = (float *)b;  //Same KCxNR is used across different MRxKC in MCxKC
+      cbuf = (float *)c + m * MR * rs_c; // Move to next MRXNR in output
+      
+      /*_mm_prefetch( (MR X NR) from C*/
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 5*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /*Load 32 elements from row0 of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row 
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+        /*Load Next 32 elements from row0 of B*/
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 from current row 
+        zmm7 = _mm512_loadu_ps (bbuf + 48); //load 48-63 from current row
+
+        /*Broadcast col0 elements of 12 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1  
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2 
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm7, zmm2, zmm11);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm7, zmm3, zmm15);
+
+        zmm2 = _mm512_set1_ps(*(abuf + 4*rs_a)); //broadcast c0r4
+        zmm3 = _mm512_set1_ps(*(abuf + 5*rs_a)); //broadcast c0r5
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm7, zmm4, zmm19);
+
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+        zmm22 = _mm512_fmadd_ps(zmm6, zmm5, zmm22);
+        zmm23 = _mm512_fmadd_ps(zmm7, zmm5, zmm23);
+
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm2, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm2, zmm25);
+        zmm26 = _mm512_fmadd_ps(zmm6, zmm2, zmm26);
+        zmm27 = _mm512_fmadd_ps(zmm7, zmm2, zmm27);
+
+        zmm28 = _mm512_fmadd_ps(zmm0, zmm3, zmm28);
+        zmm29 = _mm512_fmadd_ps(zmm1, zmm3, zmm29);
+        zmm30 = _mm512_fmadd_ps(zmm6, zmm3, zmm30);
+        zmm31 = _mm512_fmadd_ps(zmm7, zmm3, zmm31);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+      }//kloop
+
+      zmm0 = _mm512_set1_ps(alpha);
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm11,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm12,zmm13,zmm14,zmm15,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm16,zmm17,zmm18,zmm19,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm20,zmm21,zmm22,zmm23,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm24,zmm25,zmm26,zmm27,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm28,zmm29,zmm30,zmm31,zmm0)
+
+      if ( beta != 0 )
+      {
+        _cbuf = cbuf;
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        zmm3 = _mm512_set1_ps(beta);        
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        zmm11 = _mm512_fmadd_ps(zmm1, zmm3, zmm11);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+        zmm15 = _mm512_fmadd_ps(zmm1, zmm3, zmm15);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+        zmm19 = _mm512_fmadd_ps(zmm1, zmm3, zmm19);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm22 = _mm512_fmadd_ps(zmm0, zmm3, zmm22);
+        zmm23 = _mm512_fmadd_ps(zmm1, zmm3, zmm23);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm3, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm3, zmm25);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm26 = _mm512_fmadd_ps(zmm0, zmm3, zmm26);
+        zmm27 = _mm512_fmadd_ps(zmm1, zmm3, zmm27);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm28 = _mm512_fmadd_ps(zmm0, zmm3, zmm28);
+        zmm29 = _mm512_fmadd_ps(zmm1, zmm3, zmm29);
+
+        zmm0 = _mm512_load_ps(_cbuf + 32);
+        zmm1 = _mm512_load_ps(_cbuf + 48);
+        zmm30 = _mm512_fmadd_ps(zmm0, zmm3, zmm30);
+        zmm31 = _mm512_fmadd_ps(zmm1, zmm3, zmm31);
+      }
+
+      // Post Ops
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x64F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          zmm1 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+          zmm2 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+          zmm3 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+          zmm4 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+          // c[0,0-15]
+          zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+          // c[0, 16-31]
+          zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+          // c[0,32-47]
+          zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+          // c[0,48-63]
+          zmm11 = _mm512_add_ps( zmm4, zmm11 );
+
+          // c[1,0-15]
+          zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+          // c[1, 16-31]
+          zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+          // c[1,32-47]
+          zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+          // c[1,48-63]
+          zmm15 = _mm512_add_ps( zmm4, zmm15 );
+
+          // c[2,0-15]
+          zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+          // c[2, 16-31]
+          zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+          // c[2,32-47]
+          zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+          // c[2,48-63]
+          zmm19 = _mm512_add_ps( zmm4, zmm19 );
+
+          // c[3,0-15]
+          zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+          // c[3, 16-31]
+          zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+          // c[3,32-47]
+          zmm22 = _mm512_add_ps( zmm3, zmm22 );
+
+          // c[3,48-63]
+          zmm23 = _mm512_add_ps( zmm4, zmm23 );
+
+          // c[4,0-15]
+          zmm24 = _mm512_add_ps( zmm1, zmm24 );
+
+          // c[4, 16-31]
+          zmm25 = _mm512_add_ps( zmm2, zmm25 );
+
+          // c[4,32-47]
+          zmm26 = _mm512_add_ps( zmm3, zmm26 );
+
+          // c[4,48-63]
+          zmm27 = _mm512_add_ps( zmm4, zmm27 );
+
+          // c[5,0-15]
+          zmm28 = _mm512_add_ps( zmm1, zmm28 );
+
+          // c[5, 16-31]
+          zmm29 = _mm512_add_ps( zmm2, zmm29 );
+
+          // c[5,32-47]
+          zmm30 = _mm512_add_ps( zmm3, zmm30 );
+
+          // c[5,48-63]
+          zmm31 = _mm512_add_ps( zmm4, zmm31 );
+        }
+        else
+        {
+          // If original output was columns major, then by the time
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          zmm1 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 ) );
+          zmm2 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 ) );
+          zmm3 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 ) );
+          zmm4 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 ) );
+          zmm5 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 ) );
+          zmm6 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 ) );
+
+          // c[0,0-15]
+          zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+          // c[0, 16-31]
+          zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+          // c[0,32-47]
+          zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+          // c[0,48-63]
+          zmm11 = _mm512_add_ps( zmm1, zmm11 );
+
+          // c[1,0-15]
+          zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+          // c[1, 16-31]
+          zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+          // c[1,32-47]
+          zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+          // c[1,48-63]
+          zmm15 = _mm512_add_ps( zmm2, zmm15 );
+
+          // c[2,0-15]
+          zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+          // c[2, 16-31]
+          zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+          // c[2,32-47]
+          zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+          // c[2,48-63]
+          zmm19 = _mm512_add_ps( zmm3, zmm19 );
+
+          // c[3,0-15]
+          zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+          // c[3, 16-31]
+          zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+          // c[3,32-47]
+          zmm22 = _mm512_add_ps( zmm4, zmm22 );
+
+          // c[3,48-63]
+          zmm23 = _mm512_add_ps( zmm4, zmm23 );
+
+          // c[4,0-15]
+          zmm24 = _mm512_add_ps( zmm5, zmm24 );
+
+          // c[4, 16-31]
+          zmm25 = _mm512_add_ps( zmm5, zmm25 );
+
+          // c[4,32-47]
+          zmm26 = _mm512_add_ps( zmm5, zmm26 );
+
+          // c[4,48-63]
+          zmm27 = _mm512_add_ps( zmm5, zmm27 );
+
+          // c[5,0-15]
+          zmm28 = _mm512_add_ps( zmm6, zmm28 );
+
+          // c[5, 16-31]
+          zmm29 = _mm512_add_ps( zmm6, zmm29 );
+
+          // c[5,32-47]
+          zmm30 = _mm512_add_ps( zmm6, zmm30 );
+
+          // c[5,48-63]
+          zmm31 = _mm512_add_ps( zmm6, zmm31 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x64F:
+      {
+        zmm1 = _mm512_setzero_ps();
+
+        // c[0,0-15]
+        zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+        // c[0,48-63]
+        zmm11 = _mm512_max_ps( zmm1, zmm11 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+        // c[1,16-31]
+        zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+        // c[1,48-63]
+        zmm15 = _mm512_max_ps( zmm1, zmm15 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+        // c[2,16-31]
+        zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+        // c[2,48-63]
+        zmm19 = _mm512_max_ps( zmm1, zmm19 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+        // c[3,16-31]
+        zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_max_ps( zmm1, zmm22 );
+
+        // c[3,48-63]
+        zmm23 = _mm512_max_ps( zmm1, zmm23 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_max_ps( zmm1, zmm24 );
+
+        // c[4,16-31]
+        zmm25 = _mm512_max_ps( zmm1, zmm25 );
+
+        // c[4,32-47]
+        zmm26 = _mm512_max_ps( zmm1, zmm26 );
+
+        // c[4,48-63]
+        zmm27 = _mm512_max_ps( zmm1, zmm27 );
+
+        // c[5,0-15]
+        zmm28 = _mm512_max_ps( zmm1, zmm28 );
+
+        // c[5,16-31]
+        zmm29 = _mm512_max_ps( zmm1, zmm29 );
+
+        // c[5,32-47]
+        zmm30 = _mm512_max_ps( zmm1, zmm30 );
+
+        // c[5,48-63]
+        zmm31 = _mm512_max_ps( zmm1, zmm31 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x64F:
+      {
+        zmm1 = _mm512_setzero_ps();
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+        __mmask16 relu_cmp_mask;
+
+        // c[0, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+        // c[0, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+        // c[0, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+        // c[0, 48-63]
+        RELU_SCALE_OP_F32S_AVX512(zmm11)
+
+        // c[1, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+        // c[1, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+        // c[1, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+        // c[1, 48-63]
+        RELU_SCALE_OP_F32S_AVX512(zmm15)
+
+        // c[2, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+        // c[2, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+        // c[2, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+        // c[2, 48-63]
+        RELU_SCALE_OP_F32S_AVX512(zmm19)
+
+        // c[3, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+        // c[3, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+        // c[3, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm22)
+
+        // c[3, 48-63]
+        RELU_SCALE_OP_F32S_AVX512(zmm23)
+
+        // c[4, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm24)
+
+        // c[4, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm25)
+
+        // c[4, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm26)
+
+        // c[4, 48-63]
+        RELU_SCALE_OP_F32S_AVX512(zmm27)
+
+        // c[5, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm28)
+
+        // c[5, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm29)
+
+        // c[5, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm30)
+
+        // c[5, 48-63]
+        RELU_SCALE_OP_F32S_AVX512(zmm31)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x64F:
+      {
+        __m512i zmm6;
+        // c[0, 0-15]
+        GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[0, 16-31]
+        GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[0, 32-47]
+        GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[0, 48-63]
+        GELU_TANH_F32S_AVX512(zmm11, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 0-15]
+        GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 16-31]
+        GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 32-47]
+        GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 48-63]
+        GELU_TANH_F32S_AVX512(zmm15, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 0-15]
+        GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 16-31]
+        GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 32-47]
+        GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 48-63]
+        GELU_TANH_F32S_AVX512(zmm19, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 0-15]
+        GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 16-31]
+        GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 32-47]
+        GELU_TANH_F32S_AVX512(zmm22, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 48-63]
+        GELU_TANH_F32S_AVX512(zmm23, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 0-15]
+        GELU_TANH_F32S_AVX512(zmm24, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 16-31]
+        GELU_TANH_F32S_AVX512(zmm25, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 32-47]
+        GELU_TANH_F32S_AVX512(zmm26, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 48-63]
+        GELU_TANH_F32S_AVX512(zmm27, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 0-15]
+        GELU_TANH_F32S_AVX512(zmm28, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 16-31]
+        GELU_TANH_F32S_AVX512(zmm29, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 32-47]
+        GELU_TANH_F32S_AVX512(zmm30, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 48-63]
+        GELU_TANH_F32S_AVX512(zmm31, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x64F:
+      {
+        // c[0, 0-15]
+        GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+        // c[0, 16-31]
+        GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+        // c[0, 32-47]
+        GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+        // c[0, 48-63]
+        GELU_ERF_F32S_AVX512(zmm11, zmm0, zmm1, zmm2)
+
+        // c[1, 0-15]
+        GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+        // c[1, 16-31]
+        GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+        // c[1, 32-47]
+        GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+        // c[1, 48-63]
+        GELU_ERF_F32S_AVX512(zmm15, zmm0, zmm1, zmm2)
+
+        // c[2, 0-15]
+        GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+        // c[2, 16-31]
+        GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+        // c[2, 32-47]
+        GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+        // c[2, 48-63]
+        GELU_ERF_F32S_AVX512(zmm19, zmm0, zmm1, zmm2)
+
+        // c[3, 0-15]
+        GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+        // c[3, 16-31]
+        GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+        // c[3, 32-47]
+        GELU_ERF_F32S_AVX512(zmm22, zmm0, zmm1, zmm2)
+
+        // c[3, 48-63]
+        GELU_ERF_F32S_AVX512(zmm23, zmm0, zmm1, zmm2)
+
+        // c[4, 0-15]
+        GELU_ERF_F32S_AVX512(zmm24, zmm0, zmm1, zmm2)
+
+        // c[4, 16-31]
+        GELU_ERF_F32S_AVX512(zmm25, zmm0, zmm1, zmm2)
+
+        // c[4, 32-47]
+        GELU_ERF_F32S_AVX512(zmm26, zmm0, zmm1, zmm2)
+
+        // c[4, 48-63]
+        GELU_ERF_F32S_AVX512(zmm27, zmm0, zmm1, zmm2)
+
+        // c[5, 0-15]
+        GELU_ERF_F32S_AVX512(zmm28, zmm0, zmm1, zmm2)
+
+        // c[5, 16-31]
+        GELU_ERF_F32S_AVX512(zmm29, zmm0, zmm1, zmm2)
+
+        // c[5, 32-47]
+        GELU_ERF_F32S_AVX512(zmm30, zmm0, zmm1, zmm2)
+
+        // c[5, 48-63]
+        GELU_ERF_F32S_AVX512(zmm31, zmm0, zmm1, zmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x64F:
+      {
+        zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+        zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+        // c[0, 0-15]
+        CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+        // c[0, 16-31]
+        CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+        // c[0, 32-47]
+        CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+        // c[0, 48-63]
+        CLIP_F32S_AVX512(zmm11, zmm0, zmm1)
+
+        // c[1, 0-15]
+        CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+        // c[1, 16-31]
+        CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+        // c[1, 32-47]
+        CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+        // c[1, 48-63]
+        CLIP_F32S_AVX512(zmm15, zmm0, zmm1)
+
+        // c[2, 0-15]
+        CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+        // c[2, 16-31]
+        CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+        // c[2, 32-47]
+        CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+        // c[2, 48-63]
+        CLIP_F32S_AVX512(zmm19, zmm0, zmm1)
+
+        // c[3, 0-15]
+        CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+        // c[3, 16-31]
+        CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+        // c[3, 32-47]
+        CLIP_F32S_AVX512(zmm22, zmm0, zmm1)
+
+        // c[3, 48-63]
+        CLIP_F32S_AVX512(zmm23, zmm0, zmm1)
+
+        // c[4, 0-15]
+        CLIP_F32S_AVX512(zmm24, zmm0, zmm1)
+
+        // c[4, 16-31]
+        CLIP_F32S_AVX512(zmm25, zmm0, zmm1)
+
+        // c[4, 32-47]
+        CLIP_F32S_AVX512(zmm26, zmm0, zmm1)
+
+        // c[4, 48-63]
+        CLIP_F32S_AVX512(zmm27, zmm0, zmm1)
+
+        // c[5, 0-15]
+        CLIP_F32S_AVX512(zmm28, zmm0, zmm1)
+
+        // c[5, 16-31]
+        CLIP_F32S_AVX512(zmm29, zmm0, zmm1)
+
+        // c[5, 32-47]
+        CLIP_F32S_AVX512(zmm30, zmm0, zmm1)
+
+        // c[5, 48-63]
+        CLIP_F32S_AVX512(zmm31, zmm0, zmm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x64F_DISABLE:
+      ;
+
+      _mm512_storeu_ps(cbuf, zmm8); 
+      _mm512_storeu_ps(cbuf + 16, zmm9);
+      _mm512_storeu_ps(cbuf + 32, zmm10);
+      _mm512_storeu_ps(cbuf + 48, zmm11);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm12);
+      _mm512_storeu_ps(cbuf + 16, zmm13);
+      _mm512_storeu_ps(cbuf + 32, zmm14);
+      _mm512_storeu_ps(cbuf + 48, zmm15);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm16);
+      _mm512_storeu_ps(cbuf + 16, zmm17);
+      _mm512_storeu_ps(cbuf + 32, zmm18);
+      _mm512_storeu_ps(cbuf + 48, zmm19);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm20);
+      _mm512_storeu_ps(cbuf + 16, zmm21);
+      _mm512_storeu_ps(cbuf + 32, zmm22);
+      _mm512_storeu_ps(cbuf + 48, zmm23);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm24);
+      _mm512_storeu_ps(cbuf + 16, zmm25);
+      _mm512_storeu_ps(cbuf + 32, zmm26);
+      _mm512_storeu_ps(cbuf + 48, zmm27);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm28);
+      _mm512_storeu_ps(cbuf + 16, zmm29);
+      _mm512_storeu_ps(cbuf + 32, zmm30);
+      _mm512_storeu_ps(cbuf + 48, zmm31);
+
+      post_ops_attr.post_op_c_i += MR;
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float*  restrict cij = (float *)c + i_edge*rs_c;
+        float*  restrict ai  = (float *)a + m_iter*ps_a;
+        float*  restrict bj  = (float *)b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] =
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_avx512_1x64,
+          lpgemm_rowvar_f32f32f32of32_avx512_2x64,
+          lpgemm_rowvar_f32f32f32of32_avx512_3x64,
+          lpgemm_rowvar_f32f32f32of32_avx512_4x64,
+          lpgemm_rowvar_f32f32f32of32_avx512_5x64
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij,rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr
+        );
+        return;
+    }
+}
+
+// n-fringe f32 kernel for a 48-column panel: walks m0 in 6-row steps and, for
+// each 6x48 tile, accumulates A*B over k0, scales by alpha, adds beta*C when
+// beta != 0, runs the post-op chain (labels listed in post_ops_labels) and
+// stores the tile. Rows left over (m0 % 6) go to the 1x48..5x48 kernels.
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_6x48m)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_6x48F_DISABLE,
+              &&POST_OPS_BIAS_6x48F,
+              &&POST_OPS_RELU_6x48F,
+              &&POST_OPS_RELU_SCALE_6x48F,
+              &&POST_OPS_GELU_TANH_6x48F,
+              &&POST_OPS_GELU_ERF_6x48F,
+              &&POST_OPS_CLIP_6x48F
+            };
+    // Unsigned 64-bit local copies of the k extent and of m0 split into
+    // full 6-row panels (m_iter) and leftover rows (m_left).
+    uint64_t k_iter = k0;
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    // Fewer than 6 rows: skip the main loop and go straight to the m-fringe.
+    if ( m_iter == 0 ){    goto consider_edge_cases; }
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6;
+    __m512 zmm8, zmm9, zmm10, zmm12, zmm13, zmm14;
+    __m512 zmm16, zmm17, zmm18, zmm20, zmm21, zmm22;
+    __m512 zmm24, zmm25, zmm26, zmm28, zmm29, zmm30, zmm31;
+
+    /*Produce MRxNR outputs */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm10, zmm12);
+      ZERO_ACC_ZMM_4_REG(zmm13, zmm14,zmm16, zmm17);
+      ZERO_ACC_ZMM_4_REG(zmm18, zmm20, zmm21, zmm22);
+      ZERO_ACC_ZMM_4_REG(zmm24, zmm25, zmm26, zmm28);
+      ZERO_ACC_ZMM_4_REG(zmm29, zmm30, zmm31, zmm2); // zmm31/zmm2 only fill the 4-slot macro
+
+      _mm256_zeroupper();
+
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // Move to next MRxKC in MCxKC (where MC>=MR)
+      bbuf = (float *)b;  //Same KCxNR is used across different MRxKC in MCxKC
+      cbuf = (float *)c + m * MR * rs_c; // Move to next MRXNR in output
+
+      /* Prefetch the start of C rows 0-4 of this tile (row 5 is not prefetched). */
+      _mm_prefetch((cbuf + 0*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 1*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 2*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 3*rs_c), _MM_HINT_T0);
+      _mm_prefetch((cbuf + 4*rs_c), _MM_HINT_T0);
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /* Load the 48 B values of the current k row as three 16-float vectors */
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+        zmm6 = _mm512_loadu_ps (bbuf + 32); //load 32-47 values from current row
+
+        /* Broadcast the current-column element of A rows 0-3 (rows 4-5 follow) */
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+        zmm10 = _mm512_fmadd_ps(zmm6, zmm2, zmm10);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        zmm14 = _mm512_fmadd_ps(zmm6, zmm3, zmm14);
+
+        zmm2 = _mm512_set1_ps(*(abuf + 4*rs_a)); //broadcast c0r4 (zmm2 reused)
+        zmm3 = _mm512_set1_ps(*(abuf + 5*rs_a)); //broadcast c0r5 (zmm3 reused)
+
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        zmm18 = _mm512_fmadd_ps(zmm6, zmm4, zmm18);
+
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+        zmm22 = _mm512_fmadd_ps(zmm6, zmm5, zmm22);
+
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm2, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm2, zmm25);
+        zmm26 = _mm512_fmadd_ps(zmm6, zmm2, zmm26);
+
+        zmm28 = _mm512_fmadd_ps(zmm0, zmm3, zmm28);
+        zmm29 = _mm512_fmadd_ps(zmm1, zmm3, zmm29);
+        zmm30 = _mm512_fmadd_ps(zmm6, zmm3, zmm30);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+      }//kloop
+
+      zmm0 = _mm512_set1_ps(alpha);
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm10,zmm12,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm13,zmm14,zmm16,zmm17,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm18,zmm20,zmm21,zmm22,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm24,zmm25,zmm26,zmm28,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm29,zmm30,zmm31,zmm2,zmm0)
+
+      if ( beta != 0 )
+      {
+        _cbuf = cbuf;
+        // acc += beta * C. Unaligned loads are used because this tile is
+        // written back with storeu; no 64-byte alignment of C is established.
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm10 = _mm512_fmadd_ps(zmm0, zmm3, zmm10);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm14 = _mm512_fmadd_ps(zmm0, zmm3, zmm14);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm18 = _mm512_fmadd_ps(zmm0, zmm3, zmm18);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm22 = _mm512_fmadd_ps(zmm0, zmm3, zmm22);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm3, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm3, zmm25);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm26 = _mm512_fmadd_ps(zmm0, zmm3, zmm26);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_loadu_ps(_cbuf);
+        zmm1 = _mm512_loadu_ps(_cbuf + 16);
+        zmm28 = _mm512_fmadd_ps(zmm0, zmm3, zmm28);
+        zmm29 = _mm512_fmadd_ps(zmm1, zmm3, zmm29);
+
+        zmm0 = _mm512_loadu_ps(_cbuf + 32);
+        zmm30 = _mm512_fmadd_ps(zmm0, zmm3, zmm30);
+      }
+
+      // Post Ops
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x48F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          zmm1 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+          zmm2 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+          zmm3 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+          // c[0,0-15]
+          zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+          // c[0, 16-31]
+          zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+          // c[0,32-47]
+          zmm10 = _mm512_add_ps( zmm3, zmm10 );
+
+          // c[1,0-15]
+          zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+          // c[1, 16-31]
+          zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+          // c[1,32-47]
+          zmm14 = _mm512_add_ps( zmm3, zmm14 );
+
+          // c[2,0-15]
+          zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+          // c[2, 16-31]
+          zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+          // c[2,32-47]
+          zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+          // c[3,0-15]
+          zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+          // c[3, 16-31]
+          zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+          // c[3,32-47]
+          zmm22 = _mm512_add_ps( zmm3, zmm22 );
+
+          // c[4,0-15]
+          zmm24 = _mm512_add_ps( zmm1, zmm24 );
+
+          // c[4, 16-31]
+          zmm25 = _mm512_add_ps( zmm2, zmm25 );
+
+          // c[4,32-47]
+          zmm26 = _mm512_add_ps( zmm3, zmm26 );
+
+          // c[5,0-15]
+          zmm28 = _mm512_add_ps( zmm1, zmm28 );
+
+          // c[5, 16-31]
+          zmm29 = _mm512_add_ps( zmm2, zmm29 );
+
+          // c[5,32-47]
+          zmm30 = _mm512_add_ps( zmm3, zmm30 );
+        }
+        else
+        {
+          // If original output was columns major, then by the time
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          zmm1 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 ) );
+          zmm2 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 ) );
+          zmm3 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 ) );
+          zmm4 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 ) );
+          zmm5 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 ) );
+          zmm6 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 ) );
+
+          // c[0,0-15]
+          zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+          // c[0, 16-31]
+          zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+          // c[0,32-47]
+          zmm10 = _mm512_add_ps( zmm1, zmm10 );
+
+          // c[1,0-15]
+          zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+          // c[1, 16-31]
+          zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+          // c[1,32-47]
+          zmm14 = _mm512_add_ps( zmm2, zmm14 );
+
+          // c[2,0-15]
+          zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+          // c[2, 16-31]
+          zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+          // c[2,32-47]
+          zmm18 = _mm512_add_ps( zmm3, zmm18 );
+
+          // c[3,0-15]
+          zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+          // c[3, 16-31]
+          zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+          // c[3,32-47]
+          zmm22 = _mm512_add_ps( zmm4, zmm22 );
+
+          // c[4,0-15]
+          zmm24 = _mm512_add_ps( zmm5, zmm24 );
+
+          // c[4, 16-31]
+          zmm25 = _mm512_add_ps( zmm5, zmm25 );
+
+          // c[4,32-47]
+          zmm26 = _mm512_add_ps( zmm5, zmm26 );
+
+          // c[5,0-15]
+          zmm28 = _mm512_add_ps( zmm6, zmm28 );
+
+          // c[5, 16-31]
+          zmm29 = _mm512_add_ps( zmm6, zmm29 );
+
+          // c[5,32-47]
+          zmm30 = _mm512_add_ps( zmm6, zmm30 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x48F:
+      {
+        zmm1 = _mm512_setzero_ps();
+
+        // c[0,0-15]
+        zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+        // c[0,32-47]
+        zmm10 = _mm512_max_ps( zmm1, zmm10 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+        // c[1,16-31]
+        zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+        // c[1,32-47]
+        zmm14 = _mm512_max_ps( zmm1, zmm14 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+        // c[2,16-31]
+        zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+        // c[2,32-47]
+        zmm18 = _mm512_max_ps( zmm1, zmm18 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+        // c[3,16-31]
+        zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+        // c[3,32-47]
+        zmm22 = _mm512_max_ps( zmm1, zmm22 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_max_ps( zmm1, zmm24 );
+
+        // c[4,16-31]
+        zmm25 = _mm512_max_ps( zmm1, zmm25 );
+
+        // c[4,32-47]
+        zmm26 = _mm512_max_ps( zmm1, zmm26 );
+
+        // c[5,0-15]
+        zmm28 = _mm512_max_ps( zmm1, zmm28 );
+
+        // c[5,16-31]
+        zmm29 = _mm512_max_ps( zmm1, zmm29 );
+
+        // c[5,32-47]
+        zmm30 = _mm512_max_ps( zmm1, zmm30 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x48F:
+      {
+        zmm1 = _mm512_setzero_ps();
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+        __mmask16 relu_cmp_mask;
+
+        // c[0, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+        // c[0, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+        // c[0, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm10)
+
+        // c[1, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+        // c[1, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+        // c[1, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm14)
+
+        // c[2, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+        // c[2, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+        // c[2, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm18)
+
+        // c[3, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+        // c[3, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+        // c[3, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm22)
+
+        // c[4, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm24)
+
+        // c[4, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm25)
+
+        // c[4, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm26)
+
+        // c[5, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm28)
+
+        // c[5, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm29)
+
+        // c[5, 32-47]
+        RELU_SCALE_OP_F32S_AVX512(zmm30)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x48F:
+      {
+        __m512i zmm6; // integer scratch; shadows the outer __m512 zmm6 in this block
+        // c[0, 0-15]
+        GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[0, 16-31]
+        GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[0, 32-47]
+        GELU_TANH_F32S_AVX512(zmm10, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 0-15]
+        GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 16-31]
+        GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 32-47]
+        GELU_TANH_F32S_AVX512(zmm14, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 0-15]
+        GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 16-31]
+        GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 32-47]
+        GELU_TANH_F32S_AVX512(zmm18, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 0-15]
+        GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 16-31]
+        GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 32-47]
+        GELU_TANH_F32S_AVX512(zmm22, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 0-15]
+        GELU_TANH_F32S_AVX512(zmm24, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 16-31]
+        GELU_TANH_F32S_AVX512(zmm25, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 32-47]
+        GELU_TANH_F32S_AVX512(zmm26, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 0-15]
+        GELU_TANH_F32S_AVX512(zmm28, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 16-31]
+        GELU_TANH_F32S_AVX512(zmm29, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 32-47]
+        GELU_TANH_F32S_AVX512(zmm30, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x48F:
+      {
+        // c[0, 0-15]
+        GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+        // c[0, 16-31]
+        GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+        // c[0, 32-47]
+        GELU_ERF_F32S_AVX512(zmm10, zmm0, zmm1, zmm2)
+
+        // c[1, 0-15]
+        GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+        // c[1, 16-31]
+        GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+        // c[1, 32-47]
+        GELU_ERF_F32S_AVX512(zmm14, zmm0, zmm1, zmm2)
+
+        // c[2, 0-15]
+        GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+        // c[2, 16-31]
+        GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+        // c[2, 32-47]
+        GELU_ERF_F32S_AVX512(zmm18, zmm0, zmm1, zmm2)
+
+        // c[3, 0-15]
+        GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+        // c[3, 16-31]
+        GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+        // c[3, 32-47]
+        GELU_ERF_F32S_AVX512(zmm22, zmm0, zmm1, zmm2)
+
+        // c[4, 0-15]
+        GELU_ERF_F32S_AVX512(zmm24, zmm0, zmm1, zmm2)
+
+        // c[4, 16-31]
+        GELU_ERF_F32S_AVX512(zmm25, zmm0, zmm1, zmm2)
+
+        // c[4, 32-47]
+        GELU_ERF_F32S_AVX512(zmm26, zmm0, zmm1, zmm2)
+
+        // c[5, 0-15]
+        GELU_ERF_F32S_AVX512(zmm28, zmm0, zmm1, zmm2)
+
+        // c[5, 16-31]
+        GELU_ERF_F32S_AVX512(zmm29, zmm0, zmm1, zmm2)
+
+        // c[5, 32-47]
+        GELU_ERF_F32S_AVX512(zmm30, zmm0, zmm1, zmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x48F:
+      {
+        zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+        zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+        // c[0, 0-15]
+        CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+        // c[0, 16-31]
+        CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+        // c[0, 32-47]
+        CLIP_F32S_AVX512(zmm10, zmm0, zmm1)
+
+        // c[1, 0-15]
+        CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+        // c[1, 16-31]
+        CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+        // c[1, 32-47]
+        CLIP_F32S_AVX512(zmm14, zmm0, zmm1)
+
+        // c[2, 0-15]
+        CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+        // c[2, 16-31]
+        CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+        // c[2, 32-47]
+        CLIP_F32S_AVX512(zmm18, zmm0, zmm1)
+
+        // c[3, 0-15]
+        CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+        // c[3, 16-31]
+        CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+        // c[3, 32-47]
+        CLIP_F32S_AVX512(zmm22, zmm0, zmm1)
+
+        // c[4, 0-15]
+        CLIP_F32S_AVX512(zmm24, zmm0, zmm1)
+
+        // c[4, 16-31]
+        CLIP_F32S_AVX512(zmm25, zmm0, zmm1)
+
+        // c[4, 32-47]
+        CLIP_F32S_AVX512(zmm26, zmm0, zmm1)
+
+        // c[5, 0-15]
+        CLIP_F32S_AVX512(zmm28, zmm0, zmm1)
+
+        // c[5, 16-31]
+        CLIP_F32S_AVX512(zmm29, zmm0, zmm1)
+
+        // c[5, 32-47]
+        CLIP_F32S_AVX512(zmm30, zmm0, zmm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x48F_DISABLE:
+      ;
+
+      _mm512_storeu_ps(cbuf, zmm8);
+      _mm512_storeu_ps(cbuf + 16, zmm9);
+      _mm512_storeu_ps(cbuf + 32, zmm10);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm12);
+      _mm512_storeu_ps(cbuf + 16, zmm13);
+      _mm512_storeu_ps(cbuf + 32, zmm14);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm16);
+      _mm512_storeu_ps(cbuf + 16, zmm17);
+      _mm512_storeu_ps(cbuf + 32, zmm18);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm20);
+      _mm512_storeu_ps(cbuf + 16, zmm21);
+      _mm512_storeu_ps(cbuf + 32, zmm22);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm24);
+      _mm512_storeu_ps(cbuf + 16, zmm25);
+      _mm512_storeu_ps(cbuf + 32, zmm26);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm28);
+      _mm512_storeu_ps(cbuf + 16, zmm29);
+      _mm512_storeu_ps(cbuf + 32, zmm30);
+
+      post_ops_attr.post_op_c_i += MR;
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] =
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_avx512_1x48,
+          lpgemm_rowvar_f32f32f32of32_avx512_2x48,
+          lpgemm_rowvar_f32f32f32of32_avx512_3x48,
+          lpgemm_rowvar_f32f32f32of32_avx512_4x48,
+          lpgemm_rowvar_f32f32f32of32_avx512_5x48
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij,rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr
+        );
+        return;
+    }
+}
+
+LPGEMM_N_FRINGE_KERN(float,float,float,f32f32f32of32_avx512_6x32m)
+{
+    static void* post_ops_labels[] =
+            {
+              &&POST_OPS_6x32F_DISABLE,
+              &&POST_OPS_BIAS_6x32F,
+              &&POST_OPS_RELU_6x32F,
+              &&POST_OPS_RELU_SCALE_6x32F,
+              &&POST_OPS_GELU_TANH_6x32F,
+              &&POST_OPS_GELU_ERF_6x32F,
+              &&POST_OPS_CLIP_6x32F
+            };
+    // Typecast local copies of integers in case dim_t and inc_t are a
+    // different size than is expected by load instructions.
+    uint64_t k_iter = k0;
+
+    uint64_t m_iter = m0 / 6;
+    uint64_t m_left = m0 % 6;
+
+    // Fewer than 6 rows: skip the main loop and go straight to the m-fringe.
+    if ( m_iter == 0 ){    goto consider_edge_cases; }
+
+    __m512 zmm0, zmm1, zmm2, zmm3, zmm4, zmm5;
+    __m512 zmm6, zmm8, zmm9, zmm12, zmm13;
+    __m512 zmm16, zmm17, zmm20, zmm21;
+    __m512 zmm24, zmm25, zmm28, zmm29;
+
+    /*Produce MRxNR outputs */
+    for(dim_t m=0; m < m_iter; m++)
+    {
+      /* zero the accumulator registers */
+      ZERO_ACC_ZMM_4_REG(zmm8, zmm9, zmm12, zmm13);
+      ZERO_ACC_ZMM_4_REG(zmm16, zmm17, zmm20, zmm21);
+      ZERO_ACC_ZMM_4_REG(zmm24, zmm25, zmm28, zmm29);
+
+      _mm256_zeroupper();
+
+      float *abuf, *bbuf, *cbuf, *_cbuf;
+
+      abuf = (float *)a + m * ps_a; // Move to next MRxKC in MCxKC (where MC>=MR)
+      bbuf = (float *)b;  //Same KCxNR is used across different MRxKC in MCxKC
+      cbuf = (float *)c + m * MR * rs_c; // Move to next MRXNR in output
+
+      for(dim_t k = 0; k < k_iter; k++)
+      {
+        /*Load 32 elements from row0 of B*/
+        zmm0 = _mm512_loadu_ps (bbuf );     //load 0-15 values from current row 
+        zmm1 = _mm512_loadu_ps (bbuf + 16); //load 16-31 values from current row
+
+       /*Broadcast col0 elements of 12 rows of A*/
+        zmm2 = _mm512_set1_ps(*(abuf + 0*rs_a)); //broadcast c0r0
+        zmm3 = _mm512_set1_ps(*(abuf + 1*rs_a)); //broadcast c0r1  
+        zmm4 = _mm512_set1_ps(*(abuf + 2*rs_a)); //broadcast c0r2 
+        zmm5 = _mm512_set1_ps(*(abuf + 3*rs_a)); //broadcast c0r3
+
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm2, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm2, zmm9);
+
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+
+        zmm2 = _mm512_set1_ps(*(abuf + 4*rs_a)); //broadcast c0r4
+        zmm3 = _mm512_set1_ps(*(abuf + 5*rs_a)); //broadcast c0r5
+        
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm4, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm4, zmm17);
+        
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm5, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm5, zmm21);
+
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm2, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm2, zmm25);
+
+        zmm28 = _mm512_fmadd_ps(zmm0, zmm3, zmm28);
+        zmm29 = _mm512_fmadd_ps(zmm1, zmm3, zmm29);
+
+        bbuf += rs_b;  //move b pointer to next row
+        abuf += cs_a;  //move a pointer to next col
+
+      }//kloop
+
+      zmm0 = _mm512_set1_ps(alpha);
+      
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm8,zmm9,zmm12,zmm13,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm16,zmm17,zmm20,zmm21,zmm0)
+      ALPHA_MUL_ACC_ZMM_4_REG(zmm24,zmm25,zmm28,zmm29,zmm0)
+
+      if ( beta != 0 )
+      {
+        _cbuf = cbuf; 
+        //load c and multiply with beta and 
+        //add to accumulator and store back
+        zmm3 = _mm512_set1_ps(beta);
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm8 = _mm512_fmadd_ps(zmm0, zmm3, zmm8);
+        zmm9 = _mm512_fmadd_ps(zmm1, zmm3, zmm9);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm12 = _mm512_fmadd_ps(zmm0, zmm3, zmm12);
+        zmm13 = _mm512_fmadd_ps(zmm1, zmm3, zmm13);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm16 = _mm512_fmadd_ps(zmm0, zmm3, zmm16);
+        zmm17 = _mm512_fmadd_ps(zmm1, zmm3, zmm17);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm20 = _mm512_fmadd_ps(zmm0, zmm3, zmm20);
+        zmm21 = _mm512_fmadd_ps(zmm1, zmm3, zmm21);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm24 = _mm512_fmadd_ps(zmm0, zmm3, zmm24);
+        zmm25 = _mm512_fmadd_ps(zmm1, zmm3, zmm25);
+        _cbuf += rs_c;
+
+        zmm0 = _mm512_load_ps(_cbuf);
+        zmm1 = _mm512_load_ps(_cbuf + 16);
+        zmm28 = _mm512_fmadd_ps(zmm0, zmm3, zmm28);
+        zmm29 = _mm512_fmadd_ps(zmm1, zmm3, zmm29);
+      }
+
+      // Post Ops
+      lpgemm_post_op* post_ops_list_temp = post_ops_list;
+      POST_OP_LABEL_LASTK_SAFE_JUMP
+
+POST_OPS_BIAS_6x32F:
+      {
+        if ( ( *( char* )post_ops_list_temp->op_args2 == 'r' ) ||
+             ( *( char* )post_ops_list_temp->op_args2 == 'R' ) )
+        {
+          zmm1 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+          zmm2 =
+            _mm512_loadu_ps( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+          // c[0,0-15]
+          zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+          // c[0, 16-31]
+          zmm9 = _mm512_add_ps( zmm2, zmm9 );
+
+          // c[1,0-15]
+          zmm12 = _mm512_add_ps( zmm1, zmm12 );
+
+          // c[1, 16-31]
+          zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+          // c[2,0-15]
+          zmm16 = _mm512_add_ps( zmm1, zmm16 );
+
+          // c[2, 16-31]
+          zmm17 = _mm512_add_ps( zmm2, zmm17 );
+
+          // c[3,0-15]
+          zmm20 = _mm512_add_ps( zmm1, zmm20 );
+
+          // c[3, 16-31]
+          zmm21 = _mm512_add_ps( zmm2, zmm21 );
+
+          // c[4,0-15]
+          zmm24 = _mm512_add_ps( zmm1, zmm24 );
+
+          // c[4, 16-31]
+          zmm25 = _mm512_add_ps( zmm2, zmm25 );
+
+          // c[5,0-15]
+          zmm28 = _mm512_add_ps( zmm1, zmm28 );
+
+          // c[5, 16-31]
+          zmm29 = _mm512_add_ps( zmm2, zmm29 );
+        }
+        else
+        {
+          // If original output was columns major, then by the time
+          // kernel sees it, the matrix would be accessed as if it were
+          // transposed. Due to this the bias array will be accessed by
+          // the ic index, and each bias element corresponds to an
+          // entire row of the transposed output array, instead of an
+          // entire column.
+          zmm1 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 0 ) );
+          zmm2 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 1 ) );
+          zmm3 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 2 ) );
+          zmm4 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 3 ) );
+          zmm5 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 4 ) );
+          zmm6 =
+            _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args1 +
+              post_ops_attr.post_op_c_i + 5 ) );
+
+          // c[0,0-15]
+          zmm8 = _mm512_add_ps( zmm1, zmm8 );
+
+          // c[0, 16-31]
+          zmm9 = _mm512_add_ps( zmm1, zmm9 );
+
+          // c[1,0-15]
+          zmm12 = _mm512_add_ps( zmm2, zmm12 );
+
+          // c[1, 16-31]
+          zmm13 = _mm512_add_ps( zmm2, zmm13 );
+
+          // c[2,0-15]
+          zmm16 = _mm512_add_ps( zmm3, zmm16 );
+
+          // c[2, 16-31]
+          zmm17 = _mm512_add_ps( zmm3, zmm17 );
+
+          // c[3,0-15]
+          zmm20 = _mm512_add_ps( zmm4, zmm20 );
+
+          // c[3, 16-31]
+          zmm21 = _mm512_add_ps( zmm4, zmm21 );
+
+          // c[4,0-15]
+          zmm24 = _mm512_add_ps( zmm5, zmm24 );
+
+          // c[4, 16-31]
+          zmm25 = _mm512_add_ps( zmm5, zmm25 );
+
+          // c[5,0-15]
+          zmm28 = _mm512_add_ps( zmm6, zmm28 );
+
+          // c[5, 16-31]
+          zmm29 = _mm512_add_ps( zmm6, zmm29 );
+        }
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_6x32F:
+      {
+        zmm1 = _mm512_setzero_ps();
+
+        // c[0,0-15]
+        zmm8 = _mm512_max_ps( zmm1, zmm8 );
+
+        // c[0, 16-31]
+        zmm9 = _mm512_max_ps( zmm1, zmm9 );
+
+        // c[1,0-15]
+        zmm12 = _mm512_max_ps( zmm1, zmm12 );
+
+        // c[1,16-31]
+        zmm13 = _mm512_max_ps( zmm1, zmm13 );
+
+        // c[2,0-15]
+        zmm16 = _mm512_max_ps( zmm1, zmm16 );
+
+        // c[2,16-31]
+        zmm17 = _mm512_max_ps( zmm1, zmm17 );
+
+        // c[3,0-15]
+        zmm20 = _mm512_max_ps( zmm1, zmm20 );
+
+        // c[3,16-31]
+        zmm21 = _mm512_max_ps( zmm1, zmm21 );
+
+        // c[4,0-15]
+        zmm24 = _mm512_max_ps( zmm1, zmm24 );
+
+        // c[4,16-31]
+        zmm25 = _mm512_max_ps( zmm1, zmm25 );
+
+        // c[5,0-15]
+        zmm28 = _mm512_max_ps( zmm1, zmm28 );
+
+        // c[5,16-31]
+        zmm29 = _mm512_max_ps( zmm1, zmm29 );
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_RELU_SCALE_6x32F:
+      {
+        zmm1 = _mm512_setzero_ps();
+        zmm2 =
+          _mm512_set1_ps( *( ( float* )post_ops_list_temp->op_args2 ) );
+
+        __mmask16 relu_cmp_mask;
+
+        // c[0, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm8)
+
+        // c[0, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm9)
+
+        // c[1, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm12)
+
+        // c[1, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm13)
+
+        // c[2, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm16)
+
+        // c[2, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm17)
+
+        // c[3, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm20)
+
+        // c[3, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm21)
+
+        // c[4, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm24)
+
+        // c[4, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm25)
+
+        // c[5, 0-15]
+        RELU_SCALE_OP_F32S_AVX512(zmm28)
+
+        // c[5, 16-31]
+        RELU_SCALE_OP_F32S_AVX512(zmm29)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_TANH_6x32F:
+      {
+        __m512i zmm6;
+        // c[0, 0-15]
+        GELU_TANH_F32S_AVX512(zmm8, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[0, 16-31]
+        GELU_TANH_F32S_AVX512(zmm9, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 0-15]
+        GELU_TANH_F32S_AVX512(zmm12, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[1, 16-31]
+        GELU_TANH_F32S_AVX512(zmm13, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 0-15]
+        GELU_TANH_F32S_AVX512(zmm16, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[2, 16-31]
+        GELU_TANH_F32S_AVX512(zmm17, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 0-15]
+        GELU_TANH_F32S_AVX512(zmm20, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[3, 16-31]
+        GELU_TANH_F32S_AVX512(zmm21, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 0-15]
+        GELU_TANH_F32S_AVX512(zmm24, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[4, 16-31]
+        GELU_TANH_F32S_AVX512(zmm25, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 0-15]
+        GELU_TANH_F32S_AVX512(zmm28, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        // c[5, 16-31]
+        GELU_TANH_F32S_AVX512(zmm29, zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_GELU_ERF_6x32F:
+      {
+        // c[0, 0-15]
+        GELU_ERF_F32S_AVX512(zmm8, zmm0, zmm1, zmm2)
+
+        // c[0, 16-31]
+        GELU_ERF_F32S_AVX512(zmm9, zmm0, zmm1, zmm2)
+
+        // c[1, 0-15]
+        GELU_ERF_F32S_AVX512(zmm12, zmm0, zmm1, zmm2)
+
+        // c[1, 16-31]
+        GELU_ERF_F32S_AVX512(zmm13, zmm0, zmm1, zmm2)
+
+        // c[2, 0-15]
+        GELU_ERF_F32S_AVX512(zmm16, zmm0, zmm1, zmm2)
+
+        // c[2, 16-31]
+        GELU_ERF_F32S_AVX512(zmm17, zmm0, zmm1, zmm2)
+
+        // c[3, 0-15]
+        GELU_ERF_F32S_AVX512(zmm20, zmm0, zmm1, zmm2)
+
+        // c[3, 16-31]
+        GELU_ERF_F32S_AVX512(zmm21, zmm0, zmm1, zmm2)
+
+        // c[4, 0-15]
+        GELU_ERF_F32S_AVX512(zmm24, zmm0, zmm1, zmm2)
+
+        // c[4, 16-31]
+        GELU_ERF_F32S_AVX512(zmm25, zmm0, zmm1, zmm2)
+
+        // c[5, 0-15]
+        GELU_ERF_F32S_AVX512(zmm28, zmm0, zmm1, zmm2)
+
+        // c[5, 16-31]
+        GELU_ERF_F32S_AVX512(zmm29, zmm0, zmm1, zmm2)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_CLIP_6x32F:
+      {
+        zmm0 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args2 );
+        zmm1 = _mm512_set1_ps( *( float* )post_ops_list_temp->op_args3 );
+
+        // c[0, 0-15]
+        CLIP_F32S_AVX512(zmm8, zmm0, zmm1)
+
+        // c[0, 16-31]
+        CLIP_F32S_AVX512(zmm9, zmm0, zmm1)
+
+        // c[1, 0-15]
+        CLIP_F32S_AVX512(zmm12, zmm0, zmm1)
+
+        // c[1, 16-31]
+        CLIP_F32S_AVX512(zmm13, zmm0, zmm1)
+
+        // c[2, 0-15]
+        CLIP_F32S_AVX512(zmm16, zmm0, zmm1)
+
+        // c[2, 16-31]
+        CLIP_F32S_AVX512(zmm17, zmm0, zmm1)
+
+        // c[3, 0-15]
+        CLIP_F32S_AVX512(zmm20, zmm0, zmm1)
+
+        // c[3, 16-31]
+        CLIP_F32S_AVX512(zmm21, zmm0, zmm1)
+
+        // c[4, 0-15]
+        CLIP_F32S_AVX512(zmm24, zmm0, zmm1)
+
+        // c[4, 16-31]
+        CLIP_F32S_AVX512(zmm25, zmm0, zmm1)
+
+        // c[5, 0-15]
+        CLIP_F32S_AVX512(zmm28, zmm0, zmm1)
+
+        // c[5, 16-31]
+        CLIP_F32S_AVX512(zmm29, zmm0, zmm1)
+
+        POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+      }
+POST_OPS_6x32F_DISABLE:
+      ;
+
+      _mm512_storeu_ps(cbuf, zmm8); 
+      _mm512_storeu_ps(cbuf + 16, zmm9);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm12);
+      _mm512_storeu_ps(cbuf + 16, zmm13);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm16);
+      _mm512_storeu_ps(cbuf + 16, zmm17);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm20);
+      _mm512_storeu_ps(cbuf + 16, zmm21);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm24);
+      _mm512_storeu_ps(cbuf + 16, zmm25);
+      cbuf += rs_c;
+      _mm512_storeu_ps(cbuf, zmm28);
+      _mm512_storeu_ps(cbuf + 16, zmm29);
+
+      post_ops_attr.post_op_c_i += MR;
+    }//mloop
+
+    consider_edge_cases:
+
+    // Handle edge cases in the m dimension, if they exist.
+    if( m_left )
+    {
+        const dim_t      i_edge = m0 - ( dim_t )m_left;
+
+        float*  restrict cij = (float *) c + i_edge*rs_c;
+        float*  restrict ai  = (float *) a + m_iter*ps_a;
+        float*  restrict bj  = (float *) b;
+
+        lpgemm_m_fringe_f32_ker_ft ker_fps[6] =
+        {
+          NULL,
+          lpgemm_rowvar_f32f32f32of32_avx512_1x32,
+          lpgemm_rowvar_f32f32f32of32_avx512_2x32,
+          lpgemm_rowvar_f32f32f32of32_avx512_3x32,
+          lpgemm_rowvar_f32f32f32of32_avx512_4x32,
+          lpgemm_rowvar_f32f32f32of32_avx512_5x32
+        };
+
+        lpgemm_m_fringe_f32_ker_ft ker_fp = ker_fps[ m_left ];
+
+        ker_fp
+        (
+          k0,
+          ai, rs_a, cs_a,
+          bj, rs_b, cs_b,
+          cij,rs_c,
+          alpha, beta,
+          post_ops_list, post_ops_attr
+        );
+        return;
+    }
+}
+#endif
diff --git a/kernels/zen4/lpgemm/gelu_avx512.h b/kernels/zen4/lpgemm/gelu_avx512.h
new file mode 100644
index 0000000000..814f136f50
--- /dev/null
+++ b/kernels/zen4/lpgemm/gelu_avx512.h
@@ -0,0 +1,65 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef AOCL_LPGEMM_GELU_DEF_AVX512_H
+#define AOCL_LPGEMM_GELU_DEF_AVX512_H
+
+/*
+ * TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) )
+ *
+ * val is updated in place with GeLU(val); every other argument is a scratch
+ * register that is overwritten (t_q is the integer scratch handed on to
+ * TANHF_AVX512). Arguments are expanded several times, so pass plain
+ * variables.
+ */
+#define GELU_TANH_F32_AVX512_DEF(val, t_r, t_r2, t_x, t_z, t_dn, t_tanh, t_q) \
+\
+	/* t_r2 = val^3 */ \
+	t_r2 = _mm512_mul_ps (_mm512_mul_ps (val, val), val); \
+	/* t_tanh = 0.797884 * (val + 0.044715 * val^3) */ \
+	t_tanh = _mm512_mul_ps ( \
+		_mm512_fmadd_ps (_mm512_set1_ps (0.044715), t_r2, val), \
+		_mm512_set1_ps (0.797884)); \
+\
+	/* t_tanh = tanhf(t_tanh) */ \
+	TANHF_AVX512(t_tanh, t_r, t_r2, t_x, t_z, t_dn, t_q); \
+\
+	/* val = 0.5 * val * (1 + t_tanh) */ \
+	t_tanh = _mm512_mul_ps (_mm512_add_ps (t_tanh, _mm512_set1_ps (1)), val); \
+	val = _mm512_mul_ps (t_tanh, _mm512_set1_ps (0.5));
+
+
+/*
+ * ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 ))
+ *
+ * val is updated in place with GeLU(val); t_r, t_x and t_erf are scratch
+ * registers that are overwritten. Arguments are expanded several times, so
+ * pass plain variables.
+ */
+#define GELU_ERF_F32_AVX512_DEF(val, t_r, t_x, t_erf) \
+\
+  /* t_erf = val * 0.707107 */ \
+  t_erf = _mm512_mul_ps (val, _mm512_set1_ps (0.707107)); \
+\
+  /* t_erf = erf(t_erf) */ \
+  ERF_AVX512(t_erf, t_r, t_x); \
+\
+  /* val = 0.5 * val * (1 + t_erf) */ \
+  t_erf = _mm512_mul_ps (_mm512_add_ps (t_erf, _mm512_set1_ps (1)), val); \
+  val = _mm512_mul_ps (t_erf, _mm512_set1_ps (0.5));
+
+#endif // AOCL_LPGEMM_GELU_DEF_AVX512_H
diff --git a/kernels/zen4/lpgemm/lpgemm_util_l1_ops_avx512.c b/kernels/zen4/lpgemm/lpgemm_util_l1_ops_avx512.c
new file mode 100644
index 0000000000..36ad94569d
--- /dev/null
+++ b/kernels/zen4/lpgemm/lpgemm_util_l1_ops_avx512.c
@@ -0,0 +1,295 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "math_utils_avx512.h"
+#include "gelu_avx512.h"
+
+// TANH GeLU (x) = 0.5 * x * ( 1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) )
+//
+// Expands to a single parenthesized expression: it no longer assigns to
+// in_val behind the caller's back and carries no trailing ';' or dangling
+// line continuation. The arithmetic is done in double except for the
+// tanhf() call itself. in_val is evaluated several times, so pass an
+// operand without side effects.
+#define GELU_TANH_NONVEC(in_val) \
+	( \
+	  0.5 * ( ( double )( in_val ) ) * \
+	  ( \
+	    1 + tanhf \
+	    ( \
+	      0.797884 * \
+	      ( \
+	        ( double )( in_val ) + \
+	        ( \
+	          0.044715 * \
+	          ( ( double )( in_val ) * ( double )( in_val ) * ( double )( in_val ) ) \
+	        ) \
+	      ) \
+	    ) \
+	  ) \
+	)
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 ))
+ *
+ * Expands to a single parenthesized expression: it no longer assigns to
+ * in_val and carries no trailing ';' or dangling line continuation.
+ * in_val is evaluated twice, so pass an operand without side effects.
+ */
+#define GELU_ERF_NONVEC(in_val) \
+	( \
+	  0.5 * ( double )( in_val ) * \
+	  ( 1 + erff( ( double )( in_val ) * 0.707107 ) ) \
+	)
+
+// In-place tanh-form GeLU over the n elements of x (stride incx).
+// Unit stride uses 16-wide AVX-512 vectors plus one masked tail step;
+// any other stride falls back to the scalar formula.
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_tanh_avx512)
+{
+	if ( incx != 1 )
+	{
+		// Strided input: apply the scalar formula element by element.
+		const dim_t stop = n * incx;
+		dim_t pos = 0;
+		while ( pos < stop )
+		{
+			float elem = x[ pos ];
+			x[ pos ] = GELU_TANH_NONVEC(elem);
+			pos += incx;
+		}
+	}
+	else
+	{
+		// Split n into a multiple of 16 and a tail of fewer than 16.
+		const dim_t tail_len = n % 16;
+		const dim_t body_len = n - tail_len;
+
+		__m512 vals, gelu_r, gelu_r2, gelu_x, gelu_z, gelu_dn, gelu_tanh;
+		__m512i gelu_q;
+		dim_t pos = 0;
+
+		// Full 16-element vectors.
+		while ( pos < body_len )
+		{
+			vals = _mm512_loadu_ps( x + pos );
+
+			GELU_TANH_F32_AVX512_DEF(vals, gelu_r, gelu_r2, gelu_x,
+							gelu_z, gelu_dn, gelu_tanh, gelu_q);
+
+			_mm512_storeu_ps( x + pos, vals );
+			pos += 16;
+		}
+
+		// Tail: only the low tail_len lanes are loaded and stored. With
+		// tail_len == 0 the mask is empty and nothing is written.
+		__mmask16 tail_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - tail_len ) );
+		vals = _mm512_maskz_loadu_ps( tail_mask, x + pos );
+
+		GELU_TANH_F32_AVX512_DEF(vals, gelu_r, gelu_r2, gelu_x,
+						gelu_z, gelu_dn, gelu_tanh, gelu_q);
+
+		_mm512_mask_storeu_ps( x + pos, tail_mask, vals );
+	}
+}
+
+// In-place erf-form GeLU over the n elements of x (stride incx).
+// Unit stride uses 16-wide AVX-512 vectors plus one masked tail step;
+// any other stride falls back to the scalar formula.
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_gelu_erf_avx512)
+{
+	if ( incx != 1 )
+	{
+		// Strided input: apply the scalar formula element by element.
+		const dim_t stop = n * incx;
+		dim_t pos = 0;
+		while ( pos < stop )
+		{
+			float elem = x[ pos ];
+			x[ pos ] = GELU_ERF_NONVEC(elem);
+			pos += incx;
+		}
+	}
+	else
+	{
+		// Split n into a multiple of 16 and a tail of fewer than 16.
+		const dim_t tail_len = n % 16;
+		const dim_t body_len = n - tail_len;
+
+		__m512 vals, gelu_r, gelu_x, gelu_erf;
+		dim_t pos = 0;
+
+		// Full 16-element vectors.
+		while ( pos < body_len )
+		{
+			vals = _mm512_loadu_ps( x + pos );
+
+			GELU_ERF_F32_AVX512_DEF(vals, gelu_r, gelu_x, gelu_erf);
+
+			_mm512_storeu_ps( x + pos, vals );
+			pos += 16;
+		}
+
+		// Tail: only the low tail_len lanes are loaded and stored. With
+		// tail_len == 0 the mask is empty and nothing is written.
+		__mmask16 tail_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - tail_len ) );
+		vals = _mm512_maskz_loadu_ps( tail_mask, x + pos );
+
+		GELU_ERF_F32_AVX512_DEF(vals, gelu_r, gelu_x, gelu_erf);
+
+		_mm512_mask_storeu_ps( x + pos, tail_mask, vals );
+	}
+}
+
+// In-place softmax over the n elements of x (stride incx):
+//     x[i] = exp( x[i] ) / sum_j exp( x[j] )
+//
+// Pass 1 replaces every element by its exponential and accumulates the sum
+// of the exponentials in double precision; pass 2 divides every element by
+// that sum. (Previously pass 1 only computed the sum, so pass 2 divided the
+// raw inputs and the result was x[i] / sum_j exp( x[j] ).)
+//
+// NOTE(review): no max-subtraction is performed, so large inputs can
+// overflow exp() to +inf and yield NaN after the division - consider
+// shifting by max(x) in a follow-up.
+LPGEMM_UTIL_L1_OP_KERNEL(float,f32_softmax_avx512)
+{
+	if ( incx == 1 )
+	{
+		// exp_sum[0] is the running total; exp_sum[1] is scratch for the
+		// per-vector partial sum and, later, for the 128-bit broadcast load.
+		double exp_sum[2] = { 0.0 };
+
+		dim_t n_part16 = ( n / 16 ) * 16;
+		dim_t n_part16_rem = n - n_part16;
+
+		dim_t idx = 0;
+		__m512 zmm0, zmm10, zmm11, zmm12, zmm13, zmm10out;
+		__m512i zmm10outi;
+		__m256 ymm0, ymm1;
+		__m512d zmm10out_d0, zmm10out_d1;
+		__m256d ymm0d, ymm1d;
+		__m128d xmm0d, xmm1d;
+
+		// Pass 1: exponentiate in place and reduce the exponentials.
+		for ( idx = 0; idx < n_part16; idx += 16 )
+		{
+			zmm0 = _mm512_loadu_ps( x + idx );
+
+			EXPF_AVX512(zmm0, zmm10, zmm11, zmm12, zmm13, zmm10outi); // zmm10outi holds the output bits
+			zmm10out = _mm512_castsi512_ps( zmm10outi );
+
+			// Keep exp(x) so that pass 2 divides the exponentials.
+			_mm512_storeu_ps( x + idx, zmm10out );
+
+			// Reduction to be done as double data type.
+			ymm0 = _mm512_castps512_ps256( zmm10out );
+			ymm1 = _mm512_extractf32x8_ps( zmm10out, 0x1 );
+			zmm10out_d0 = _mm512_cvtps_pd( ymm0 );
+			zmm10out_d1 = _mm512_cvtps_pd( ymm1 );
+			zmm10out_d0 = _mm512_add_pd( zmm10out_d0, zmm10out_d1 );
+
+			ymm0d = _mm512_castpd512_pd256( zmm10out_d0 );
+			ymm1d = _mm512_extractf64x4_pd( zmm10out_d0, 0x1 );
+			ymm0d = _mm256_add_pd( ymm0d, ymm1d );
+
+			xmm0d = _mm256_castpd256_pd128( ymm0d );
+			xmm1d = _mm256_extractf128_pd( ymm0d, 0x1 );
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+
+			xmm1d = _mm_permute_pd( xmm0d, 0x01);
+			xmm0d = _mm_add_pd( xmm0d, xmm1d );
+			exp_sum[1] = _mm_cvtsd_f64( xmm0d );
+			exp_sum[0] += exp_sum[1];
+		}
+
+		// Process remainder using masked load. With n_part16_rem == 0 the
+		// mask is empty: nothing is stored and 0 is added to the sum.
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n_part16_rem ) );
+		zmm0 = _mm512_maskz_loadu_ps( load_mask, x + idx );
+
+		EXPF_AVX512(zmm0, zmm10, zmm11, zmm12, zmm13, zmm10outi); // zmm10outi holds the output bits
+		zmm10out = _mm512_castsi512_ps( zmm10outi );
+
+		// Ensure only n_part16_rem elements are valid, zero out rest.
+		// This is required since exp(0) = 1. The inactive lanes are
+		// cleared by xor-ing them with themselves.
+		__mmask16 zero_mask = _kxor_mask16( load_mask, _cvtu32_mask16( 0xFFFF ) );
+		zmm10out = _mm512_mask_xor_ps( zmm10out, zero_mask, zmm10out, zmm10out );
+
+		// Keep exp(x) of the valid tail elements for pass 2.
+		_mm512_mask_storeu_ps( x + idx, load_mask, zmm10out );
+
+		// Reduction to be done as double data type.
+		ymm0 = _mm512_castps512_ps256( zmm10out );
+		ymm1 = _mm512_extractf32x8_ps( zmm10out, 0x1 );
+		zmm10out_d0 = _mm512_cvtps_pd( ymm0 );
+		zmm10out_d1 = _mm512_cvtps_pd( ymm1 );
+		zmm10out_d0 = _mm512_add_pd( zmm10out_d0, zmm10out_d1 );
+
+		ymm0d = _mm512_castpd512_pd256( zmm10out_d0 );
+		ymm1d = _mm512_extractf64x4_pd( zmm10out_d0, 0x1 );
+		ymm0d = _mm256_add_pd( ymm0d, ymm1d );
+
+		xmm0d = _mm256_castpd256_pd128( ymm0d );
+		xmm1d = _mm256_extractf128_pd( ymm0d, 0x1 );
+		xmm0d = _mm_add_pd( xmm0d, xmm1d );
+
+		xmm1d = _mm_permute_pd( xmm0d, 0x01);
+		xmm0d = _mm_add_pd( xmm0d, xmm1d );
+		exp_sum[1] = _mm_cvtsd_f64( xmm0d );
+		exp_sum[0] += exp_sum[1];
+
+		// Broadcast the double exp sum.
+		__m512d exp_red_zmm0;
+		exp_sum[1] = exp_sum[0];
+		xmm0d = _mm_loadu_pd( exp_sum );
+		exp_red_zmm0 = _mm512_broadcastsd_pd( xmm0d );
+
+		// Pass 2: divide the stored exponentials by their sum.
+		for ( idx = 0; idx < n_part16; idx += 16 )
+		{
+			zmm0 = _mm512_loadu_ps( x + idx );
+
+			// Convert to double
+			ymm0 = _mm512_castps512_ps256( zmm0 );
+			ymm1 = _mm512_extractf32x8_ps( zmm0, 0x1 );
+			zmm10out_d0 = _mm512_cvtps_pd( ymm0 );
+			zmm10out_d1 = _mm512_cvtps_pd( ymm1 );
+
+			// Divide at double level
+			zmm10out_d0 = _mm512_div_pd( zmm10out_d0, exp_red_zmm0 );
+			zmm10out_d1 = _mm512_div_pd( zmm10out_d1, exp_red_zmm0 );
+
+			ymm0 = _mm512_cvtpd_ps( zmm10out_d0 );
+			ymm1 = _mm512_cvtpd_ps( zmm10out_d1 );
+
+			_mm256_storeu_ps( x + idx, ymm0 );
+			_mm256_storeu_ps( x + idx + 8, ymm1 );
+		}
+
+		// Process remainder using masked load (load_mask still selects
+		// the low n_part16_rem lanes).
+		zmm0 = _mm512_maskz_loadu_ps( load_mask, x + idx );
+
+		// Convert to double
+		ymm0 = _mm512_castps512_ps256( zmm0 );
+		ymm1 = _mm512_extractf32x8_ps( zmm0, 0x1 );
+		zmm10out_d0 = _mm512_cvtps_pd( ymm0 );
+		zmm10out_d1 = _mm512_cvtps_pd( ymm1 );
+
+		// Divide at double level
+		zmm10out_d0 = _mm512_div_pd( zmm10out_d0, exp_red_zmm0 );
+		zmm10out_d1 = _mm512_div_pd( zmm10out_d1, exp_red_zmm0 );
+
+		ymm0 = _mm512_cvtpd_ps( zmm10out_d0 );
+		ymm1 = _mm512_cvtpd_ps( zmm10out_d1 );
+
+		zmm0 = _mm512_castps256_ps512( ymm0 );
+		zmm0 = _mm512_insertf32x8( zmm0, ymm1, 0x1 );
+
+		_mm512_mask_storeu_ps( x + idx, load_mask, zmm0 );
+	}
+	// For non unit increment, use non-vectorized code.
+	else
+	{
+		double exp_sum = 0.0;
+
+		dim_t n_incx = n * incx;
+
+		// Pass 1: exponentiate in place and reduce the exponentials.
+		for ( dim_t idx = 0; idx < n_incx; idx += incx )
+		{
+			float exp_val = expf( *( x + idx ) );
+			*( x + idx ) = exp_val;
+			exp_sum += ( double )exp_val;
+		}
+		// Pass 2: divide the stored exponentials by their sum.
+		for ( dim_t idx = 0; idx < n_incx; idx += incx )
+		{
+			float temp_val = *( x + idx );
+			*( x + idx ) = ( float )( ( double ) temp_val / exp_sum );
+		}
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/math_utils_avx512.h b/kernels/zen4/lpgemm/math_utils_avx512.h
new file mode 100644
index 0000000000..82c9c5650b
--- /dev/null
+++ b/kernels/zen4/lpgemm/math_utils_avx512.h
@@ -0,0 +1,119 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+#ifndef AOCL_LPGEMM_MATH_UTILS_AVX512_H
+#define AOCL_LPGEMM_MATH_UTILS_AVX512_H
+
+// Constants for the exp function.
+// lpgemm_exp_c0..c5 are the coefficients that POLY_EVAL_6_AVX512 combines
+// as c0 + c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5.
+#define lpgemm_exp_c0 0x1.0000014439a91p0
+#define lpgemm_exp_c1 0x1.62e43170e3344p-1
+#define lpgemm_exp_c2 0x1.ebf906bc4c115p-3
+#define lpgemm_exp_c3 0x1.c6ae2bb88c0c8p-5
+#define lpgemm_exp_c4 0x1.3d1079db4ef69p-7
+#define lpgemm_exp_c5 0x1.5f8905cb0cc4ep-10
+
+// Factor EXPF_AVX512 multiplies its input by before range reduction.
+// NOTE(review): despite the name, the value (~1.442695) looks like
+// log2(e) = 1/ln(2) rather than ln(2) - confirm.
+#define TBL_LN2 0x1.71547652b82fep+0
+// Added to, then subtracted from, the scaled input in EXPF_AVX512 (1.5 * 2^23);
+// presumably the usual trick for rounding a float to an integer - confirm.
+#define EXPF_HUGE 0x1.8p+23
+// EXPF_AVX512 forces the result to 0 for inputs below EXPF_MIN and to
+// +infinity for inputs at or above EXPF_MAX.
+#define EXPF_MIN -88.7228393f
+#define EXPF_MAX 88.7228393f
+// +infinity, and the value whose 32-bit pattern is the float sign bit
+// (used by TANHF_AVX512 through _mm512_set1_epi32).
+// NOTE(review): both expansions are unparenthesized and the short lowercase
+// names can easily collide with other identifiers in an including file.
+#define inf 1.0/0.0
+#define sign -2147483648
+
+// Constants for the erf function.
+// POLY_EVAL_HORNER_16_0_AVX512 evaluates them in Horner form as
+// r * ( c0 + c1*r + c2*r^2 + ... + c15*r^15 ).
+// The literals carry more hex digits than a double can hold and are
+// narrowed to float by _mm512_set1_ps at the point of use.
+#define lpgemm_erf_c0 0x1.20dd7890d27e1cec99fce48c29cp0
+#define lpgemm_erf_c1 -0x1.ab4bed70f238422edeeba9c558p-16
+#define lpgemm_erf_c2 -0x1.80a1bd5878e0b0689c5ff4fcdd4p-2
+#define lpgemm_erf_c3 -0x1.07cb4cde6a7d9528c8a732990e4p-8
+#define lpgemm_erf_c4 0x1.092cba598f96f00ddc5854cf7cp-3
+#define lpgemm_erf_c5 -0x1.51f0ce4ac87c55f11f685864714p-5
+#define lpgemm_erf_c6 0x1.4101f320bf8bc4d41c228faaa6cp-5
+#define lpgemm_erf_c7 -0x1.2300882a7d1b712726997de80ep-4
+#define lpgemm_erf_c8 0x1.d45745fff0e4b6d0604a9ab6284p-5
+#define lpgemm_erf_c9 -0x1.9eb1491956e31ded96176d7c8acp-6
+#define lpgemm_erf_c10 0x1.b9183fc75d326b9044bc63c9694p-8
+#define lpgemm_erf_c11 -0x1.10e8f8c89ad8645e7d769cd596cp-10
+#define lpgemm_erf_c12 0x1.224ffc80cc19957a48ecedad6c8p-14
+#define lpgemm_erf_c13 0x1.12a30f42c71308321e7e7cb0174p-18
+#define lpgemm_erf_c14 -0x1.155445e2e006723066d72d22ddcp-20
+#define lpgemm_erf_c15 0x1.c6a4181da4ef76f22bd39bb5dcp-25
+
+// Vectorized EXP, TANH and ERF approximations for AVX512
+
+/*
+ * Evaluates the degree-5 polynomial
+ *     c0 + c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5
+ * with the lpgemm_exp_c* coefficients.
+ *   r  : in = polynomial argument, out = polynomial value (overwritten last).
+ *   r2 : scratch; holds r^2 and then r^4.
+ *   z  : scratch; holds the partial sum of the terms up to r^3.
+ * Statement-style macro: arguments are expanded several times, so pass plain
+ * variables. The final line continuation splices the following (blank) line
+ * into the definition.
+ */
+#define POLY_EVAL_6_AVX512(r, r2, z) \
+    r2 = _mm512_mul_ps (r, r); \
+    z = _mm512_fmadd_ps (r2, _mm512_fmadd_ps (r, _mm512_set1_ps(lpgemm_exp_c3), _mm512_set1_ps(lpgemm_exp_c2)), \
+        _mm512_fmadd_ps (r, _mm512_set1_ps(lpgemm_exp_c1), _mm512_set1_ps(lpgemm_exp_c0))); \
+    r2 = _mm512_mul_ps (r2, r2); \
+    r = _mm512_fmadd_ps (r2, _mm512_fmadd_ps (r, _mm512_set1_ps(lpgemm_exp_c5), _mm512_set1_ps(lpgemm_exp_c4)), z); \
+
+/*
+ * Vector expf approximation.
+ *   x            : __m512 input (read only).
+ *   r, r2, z, dn : __m512 scratch registers, overwritten.
+ *   q            : __m512i output holding the bit pattern of the float result;
+ *                  reinterpret it as __m512 to use it.
+ * Steps, in order:
+ *   1. z  = x * TBL_LN2.
+ *   2. dn = z + EXPF_HUGE, then r = z - (dn - EXPF_HUGE), i.e. z minus the
+ *      value that survives the add/subtract of EXPF_HUGE.
+ *   3. r  = POLY_EVAL_6_AVX512(r).
+ *   4. q  = bits(r) + (bits(dn) << 23); presumably this adds the integer part
+ *      of z to the exponent field of the polynomial value - confirm.
+ *   5. Lanes where EXPF_MIN <= x is false (x below EXPF_MIN, or NaN) are
+ *      set to 0.
+ *   6. Lanes where x >= EXPF_MAX are set to +infinity (the `inf` macro).
+ * Relies on vector-type casts such as (__m512i)(r), i.e. reinterpreting
+ * casts between vector types of equal size.
+ */
+#define EXPF_AVX512(x, r, r2, z, dn, q) \
+    z = _mm512_mul_ps (x, _mm512_set1_ps(TBL_LN2));	\
+	dn = _mm512_add_ps (z , _mm512_set1_ps(EXPF_HUGE));  \
+    r = _mm512_sub_ps (z , _mm512_sub_ps (dn , _mm512_set1_ps(EXPF_HUGE)));  \
+\
+    POLY_EVAL_6_AVX512 (r, r2, z); \
+\
+    q = _mm512_add_epi32((__m512i) (r), _mm512_sllv_epi32 ((__m512i)dn, _mm512_set1_epi32 (23)) ); \
+    q = _mm512_mask_and_epi32 ((__m512i) q, _mm512_cmpnle_ps_mask ( _mm512_set1_ps(EXPF_MIN), x), (__m512i)q, _mm512_set1_epi32(0)); \
+    q = _mm512_mask_xor_epi32 ((__m512i)_mm512_set1_ps(inf), _mm512_cmpnle_ps_mask ( _mm512_set1_ps(EXPF_MAX), x), (__m512i)q, _mm512_set1_epi32(0));
+
+/*
+ * Vector tanhf approximation, computed in place on x_tanh.
+ *   x_tanh           : __m512, in = argument, out = tanh(argument).
+ *   r, r2, x, z, dn  : __m512 scratch registers, overwritten.
+ *   q                : __m512i scratch (output of EXPF_AVX512).
+ * Steps, in order:
+ *   1. x = -2 * |x_tanh|.
+ *   2. q = exp(x) via EXPF_AVX512.
+ *   3. z = (q - 1) / (q + 1), then negated, giving
+ *      (1 - exp(-2|t|)) / (1 + exp(-2|t|)) = tanh(|t|).
+ *   4. The sign bit of the original x_tanh (selected with the `sign` macro)
+ *      is xor-ed back onto z.
+ */
+#define TANHF_AVX512(x_tanh, r, r2, x, z, dn, q) \
+    x = _mm512_mul_ps (_mm512_abs_ps (x_tanh), _mm512_set1_ps(-2) ); \
+\
+    EXPF_AVX512(x, r, r2, z, dn, q); \
+\
+    z =  _mm512_add_ps ((__m512)q, _mm512_set1_ps(-1)); \
+    z = _mm512_div_ps (z, _mm512_add_ps (z, _mm512_set1_ps(2))); \
+    z = _mm512_mul_ps (z, _mm512_set1_ps(-1)); \
+    x_tanh = (__m512)(_mm512_xor_epi32 (_mm512_and_epi32 ((__m512i)x_tanh, (_mm512_set1_epi32(sign))), (__m512i)z)) ;
+
+/*
+ * Horner evaluation of a degree-16 polynomial without a constant term:
+ *     x = r * ( c0 + r*( c1 + r*( c2 + ... + r*( c14 + r*c15 ) ) ) )
+ * using the lpgemm_erf_c* coefficients (15 nested fmadds and one final
+ * multiply by r).
+ *   r : __m512 input (read only).
+ *   x : __m512 output.
+ * The final line continuation splices the following (blank) line into the
+ * definition.
+ */
+#define POLY_EVAL_HORNER_16_0_AVX512(r,x) \
+    x = _mm512_mul_ps (_mm512_fmadd_ps ( \
+    _mm512_fmadd_ps(_mm512_fmadd_ps (_mm512_fmadd_ps (_mm512_fmadd_ps (_mm512_fmadd_ps ( _mm512_fmadd_ps ( \
+    _mm512_fmadd_ps (_mm512_fmadd_ps (_mm512_fmadd_ps (_mm512_fmadd_ps (_mm512_fmadd_ps (_mm512_fmadd_ps ( \
+    _mm512_fmadd_ps ( _mm512_fmadd_ps (r, _mm512_set1_ps(lpgemm_erf_c15), _mm512_set1_ps(lpgemm_erf_c14)), r, _mm512_set1_ps(lpgemm_erf_c13)), \
+    r, _mm512_set1_ps(lpgemm_erf_c12)), r,  _mm512_set1_ps(lpgemm_erf_c11)), r, _mm512_set1_ps(lpgemm_erf_c10)), r, _mm512_set1_ps(lpgemm_erf_c9)), \
+    r, _mm512_set1_ps(lpgemm_erf_c8)), r, _mm512_set1_ps(lpgemm_erf_c7)), r, _mm512_set1_ps(lpgemm_erf_c6)), r, _mm512_set1_ps(lpgemm_erf_c5)), r, \
+    _mm512_set1_ps(lpgemm_erf_c4)), r, _mm512_set1_ps(lpgemm_erf_c3)), r, _mm512_set1_ps(lpgemm_erf_c2)), r, _mm512_set1_ps(lpgemm_erf_c1)), r, \
+    _mm512_set1_ps(lpgemm_erf_c0)), r); \
+
+/*
+ * Vector erff approximation, computed in place on x_erf.
+ *   x_erf : __m512, in = argument, out = erf(argument).
+ *   r, x  : __m512 scratch registers, overwritten.
+ * Steps, in order:
+ *   1. r = |x_erf| (sign bit cleared with 0x7FFFFFFF).
+ *   2. x = POLY_EVAL_HORNER_16_0_AVX512(r).
+ *   3. Lanes where r is not below 3.9192059040069580078125 are replaced by
+ *      1.0; the other lanes keep the polynomial value.
+ *   4. The sign bit of the original x_erf is or-ed back onto x.
+ */
+#define ERF_AVX512(x_erf, r, x) \
+    r = (__m512)_mm512_and_epi32 ((__m512i)x_erf, _mm512_set1_epi32(0x7FFFFFFF)); \
+\
+    POLY_EVAL_HORNER_16_0_AVX512(r,x); \
+\
+    x = (__m512)_mm512_mask_xor_epi32 ((__m512i)_mm512_set1_ps(1), _mm512_cmpnle_ps_mask \
+        ( _mm512_set1_ps(3.9192059040069580078125f), r), (__m512i)x, _mm512_set1_epi32(0)); \
+    x_erf = (__m512)_mm512_or_epi32(_mm512_and_epi32 ((__m512i)x_erf, _mm512_set1_epi32(~(0x7FFFFFFF))), (__m512i)x);
+
+#endif // AOCL_LPGEMM_MATH_UTILS_AVX512_H
diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c
new file mode 100644
index 0000000000..a2e487bcb3
--- /dev/null
+++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_6x64rowmajor_s8_amd512vnni.c
@@ -0,0 +1,1380 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s32/lpgemm_s32_kern_macros.h"
+#include "../u8s8s32/lpgemm_s32_memcpy_macros.h"
+
+// 6x64 int8o32 kernel
+LPGEMM_MAIN_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x64_DISABLE,
+						  &&POST_OPS_BIAS_6x64,
+						  &&POST_OPS_RELU_6x64,
+						  &&POST_OPS_RELU_SCALE_6x64,
+						  &&POST_OPS_GELU_TANH_6x64,
+						  &&POST_OPS_GELU_ERF_6x64,
+						  &&POST_OPS_CLIP_6x64,
+						  &&POST_OPS_DOWNSCALE_6x64
+						};
+
+	dim_t MR = 6;
+	dim_t NR = 64;
+
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	if ( n0 < NR )
+	{
+		dim_t n0_rem = n0 % 16;
+
+		// Split into multiple smaller fringe kernels, so as to maximize
+		// vectorization. Any n0 < NR(64) can be expressed as n0 = 48 + n`
+		// or n0 = 32 + n` or n0 = 16 + n`, where n` < 16.
+		dim_t n0_48 = n0 / 48;
+		dim_t n0_32 = n0 / 32;
+		dim_t n0_16 = n0 / 16;
+
+		// KC when not multiple of 4 will have padding to make it multiple of
+		// 4 in packed buffer. Also the k0 cannot be passed as the updated
+		// value since A matrix is not packed and requires original k0.
+		dim_t k0_updated = k0;
+		if ( k_partial_pieces > 0 )
+		{
+			k0_updated += ( 4 - k_partial_pieces );
+		}
+
+		if ( n0_48 == 1 )
+		{
+			lpgemm_rowvar_s8s8s32os32_6x48
+			(
+			  m0, k0,
+			  a, rs_a, cs_a, ps_a,
+			  b, ( ( rs_b / 4 ) * 3 ), cs_b,
+			  c, rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+
+			b = b + ( 48 * k0_updated ); // k0x48 packed contiguously.
+			c = c + 48;
+			post_ops_attr.post_op_c_j += 48;
+			post_ops_attr.b_sum_offset += 48;
+		}
+		else if ( n0_32 == 1 )
+		{
+			lpgemm_rowvar_s8s8s32os32_6x32
+			(
+			  m0, k0,
+			  a, rs_a, cs_a, ps_a,
+			  b, ( ( rs_b / 4 ) * 2 ), cs_b,
+			  c, rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+
+			b = b + ( 32 * k0_updated ); // k0x32 packed contiguously.
+			c = c + 32;
+			post_ops_attr.post_op_c_j += 32;
+			post_ops_attr.b_sum_offset += 32;
+		}
+		else if ( n0_16 == 1 )
+		{
+			lpgemm_rowvar_s8s8s32os32_6x16
+			(
+			  m0, k0,
+			  a, rs_a, cs_a, ps_a,
+			  b, ( ( rs_b / 4 ) * 1 ), cs_b,
+			  c, rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+
+			b = b + ( 16 * k0_updated ); // k0x16 packed contiguously.
+			c = c + 16;
+			post_ops_attr.post_op_c_j += 16;
+			post_ops_attr.b_sum_offset += 16;
+		}
+
+		if ( n0_rem > 0 )
+		{
+			lpgemm_rowvar_s8s8s32os32_6xlt16
+			(
+			  m0, k0,
+			  a, rs_a, cs_a, ps_a,
+			  b, ( ( rs_b / 4 ) * 1 ), cs_b,
+			  c, rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+
+			// No leftover fringe after this point.
+		}
+
+		return;
+	}
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+		__m512i c_int32_0p1 = _mm512_setzero_epi32();
+		__m512i c_int32_0p2 = _mm512_setzero_epi32();
+		__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+		__m512i c_int32_1p1 = _mm512_setzero_epi32();
+		__m512i c_int32_1p2 = _mm512_setzero_epi32();
+		__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+		__m512i c_int32_2p1 = _mm512_setzero_epi32();
+		__m512i c_int32_2p2 = _mm512_setzero_epi32();
+		__m512i c_int32_2p3 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+		__m512i c_int32_3p1 = _mm512_setzero_epi32();
+		__m512i c_int32_3p2 = _mm512_setzero_epi32();
+		__m512i c_int32_3p3 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+		__m512i c_int32_4p1 = _mm512_setzero_epi32();
+		__m512i c_int32_4p2 = _mm512_setzero_epi32();
+		__m512i c_int32_4p3 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+		__m512i c_int32_5p1 = _mm512_setzero_epi32();
+		__m512i c_int32_5p2 = _mm512_setzero_epi32();
+		__m512i c_int32_5p3 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// The instructions are arranged in a mixed way to reduce data
+			// chain dependencies.
+
+			// Load 4 rows with 64 elements each from B to 4 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+			b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_1 = _mm512_add_epi8 (a_int32_1, vec_uint8);
+
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+			c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+			c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_1 = _mm512_add_epi8 (a_int32_1, vec_uint8);
+
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+			c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+			c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_1 = _mm512_add_epi8 (a_int32_1, vec_uint8);
+
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+			c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-63] = a[5,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_1, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_1, b1 );
+			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_1, b2 );
+			c_int32_5p3 = _mm512_dpbusd_epi32( c_int32_5p3, a_int32_1, b3 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+			b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_1 = _mm512_add_epi8 (a_int32_1, vec_uint8);
+
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+			c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+			c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_1 = _mm512_add_epi8 (a_int32_1, vec_uint8);
+
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+			c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+			c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_1 = _mm512_add_epi8 (a_int32_1, vec_uint8);
+
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+			c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-63] = a[5,kr:kr+4]*b[kr:kr+4,0-63]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_1, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_1, b1 );
+			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_1, b2 );
+			c_int32_5p3 = _mm512_dpbusd_epi32( c_int32_5p3, a_int32_1, b3 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate 
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+			c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+			c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 );
+
+			b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+			c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+			c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+			c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+			c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+			c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 );
+			c_int32_5p1 = _mm512_sub_epi32( c_int32_5p1 , b0 );
+
+			b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+			c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+			c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+			c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+			c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 );
+			c_int32_4p2 = _mm512_sub_epi32( c_int32_4p2 , b0 );
+			c_int32_5p2 = _mm512_sub_epi32( c_int32_5p2 , b0 );
+
+			b0 = _mm512_loadu_si512( bsumptr + 48 );
+
+			c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 );
+			c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 );
+			c_int32_2p3 = _mm512_sub_epi32( c_int32_2p3 , b0 );
+			c_int32_3p3 = _mm512_sub_epi32( c_int32_3p3 , b0 );
+			c_int32_4p3 = _mm512_sub_epi32( c_int32_4p3 , b0 );
+			c_int32_5p3 = _mm512_sub_epi32( c_int32_5p3 , b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+			c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+			c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+			c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+			c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+			c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+			c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+			c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+			c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+			c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+			c_int32_4p3 = _mm512_mullo_epi32( selector1, c_int32_4p3 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+			c_int32_5p2 = _mm512_mullo_epi32( selector1, c_int32_5p2 );
+			c_int32_5p3 = _mm512_mullo_epi32( selector1, c_int32_5p3 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			// For the downscaled api (C-s8), the output C matrix values need
+			// to be upscaled to s32 to be used for beta scaling.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,5,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,5,selector1,selector2);
+			}
+		}
+
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x64:
+		{
+			selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+			// c[0,32-47]
+			c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+			// c[0,48-63]
+			c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[1, 16-31]
+			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+			// c[1,32-47]
+			c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+			// c[1,48-63]
+			c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[2, 16-31]
+			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+			// c[2,32-47]
+			c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+			// c[2,48-63]
+			c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[3, 16-31]
+			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+			// c[3,32-47]
+			c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+			// c[3,48-63]
+			c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[4, 16-31]
+			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+			// c[4,32-47]
+			c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+			// c[4,48-63]
+			c_int32_4p3 = _mm512_add_epi32( a_int32_1, c_int32_4p3 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[5, 16-31]
+			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
+
+			// c[5,32-47]
+			c_int32_5p2 = _mm512_add_epi32( a_int32_0, c_int32_5p2 );
+
+			// c[5,48-63]
+			c_int32_5p3 = _mm512_add_epi32( a_int32_1, c_int32_5p3 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x64:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+			// c[0,32-47]
+			c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+			// c[0,48-63]
+			c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[1,16-31]
+			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+			// c[1,32-47]
+			c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+			// c[1,48-63]
+			c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[2,16-31]
+			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+			// c[2,32-47]
+			c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+			// c[2,48-63]
+			c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[3,16-31]
+			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+			// c[3,32-47]
+			c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+			// c[3,48-63]
+			c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[4,16-31]
+			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+			// c[4,32-47]
+			c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+			// c[4,48-63]
+			c_int32_4p3 = _mm512_max_epi32( selector1, c_int32_4p3 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[5,16-31]
+			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
+
+			// c[5,32-47]
+			c_int32_5p2 = _mm512_max_epi32( selector1, c_int32_5p2 );
+
+			// c[5,48-63]
+			c_int32_5p3 = _mm512_max_epi32( selector1, c_int32_5p3 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x64:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+			// c[0, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+			// c[0, 48-63]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+			// c[1, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+			// c[1, 48-63]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+			// c[2, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+			// c[2, 48-63]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+			// c[3, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+			// c[3, 48-63]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+			// c[4, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+			// c[4, 48-63]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p3)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
+
+			// c[5, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p2)
+
+			// c[5, 48-63]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p3)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x64:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 48-63]
+			GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 48-63]
+			GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 48-63]
+			GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 48-63]
+			GELU_TANH_S32_AVX512(c_int32_3p3, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 48-63]
+			GELU_TANH_S32_AVX512(c_int32_4p3, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_5p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_5p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 48-63]
+			GELU_TANH_S32_AVX512(c_int32_5p3, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x64:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+			// c[0, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+			// c[0, 48-63]
+			GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+			// c[1, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+			// c[1, 48-63]
+			GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+			// c[2, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+			// c[2, 48-63]
+			GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+			// c[3, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+			// c[3, 48-63]
+			GELU_ERF_S32_AVX512(c_int32_3p3, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+			// c[4, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+			// c[4, 48-63]
+			GELU_ERF_S32_AVX512(c_int32_4p3, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_5p1, y, r, x, x_erf)
+
+			// c[5, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_5p2, y, r, x, x_erf)
+
+			// c[5, 48-63]
+			GELU_ERF_S32_AVX512(c_int32_5p3, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x64:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[0, 32-47]
+			CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+			// c[0, 48-63]
+			CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[1, 32-47]
+			CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+			// c[1, 48-63]
+			CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[2, 32-47]
+			CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+			// c[2, 48-63]
+			CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[3, 32-47]
+			CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+			// c[3, 48-63]
+			CLIP_S32_AVX512(c_int32_3p3, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[4, 32-47]
+			CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+			// c[4, 48-63]
+			CLIP_S32_AVX512(c_int32_4p3, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			// c[5, 32-47]
+			CLIP_S32_AVX512(c_int32_5p2, min, max)
+
+			// c[5, 48-63]
+			CLIP_S32_AVX512(c_int32_5p3, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x64:
+		{
+			selector1 =
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			a_int32_0 =
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+			a_int32_1 =
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+			// c[0, 16-31]
+			CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+			// c[0, 32-47]
+			CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+			// c[0, 48-63]
+			CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+			// c[1, 16-31]
+			CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+			// c[1, 32-47]
+			CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+			// c[1, 48-63]
+			CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+			// c[2, 16-31]
+			CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+			// c[2, 32-47]
+			CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+			// c[2, 48-63]
+			CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+			// c[3, 16-31]
+			CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+			// c[3, 32-47]
+			CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+			// c[3, 48-63]
+			CVT_MULRND_CVT32(c_int32_3p3,a_int32_1);
+
+			// c[4, 0-15]
+			CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+			// c[4, 16-31]
+			CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+			// c[4, 32-47]
+			CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+			// c[4, 48-63]
+			CVT_MULRND_CVT32(c_int32_4p3,a_int32_1);
+
+			// c[5, 0-15]
+			CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+			// c[5, 16-31]
+			CVT_MULRND_CVT32(c_int32_5p1,selector2);
+
+			// c[5, 32-47]
+			CVT_MULRND_CVT32(c_int32_5p2,a_int32_0);
+
+			// c[5, 48-63]
+			CVT_MULRND_CVT32(c_int32_5p3,a_int32_1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_6x64_DISABLE:
+		;
+
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+			// c[0,32-47]
+			CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+			// c[0,48-63]
+			CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+			// c[1,32-47]
+			CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+			// c[1,48-63]
+			CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+			// c[2,32-47]
+			CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+			// c[2,48-63]
+			CVT_STORE_S32_S8(c_int32_2p3,2,3);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+			// c[3,32-47]
+			CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+			// c[3,48-63]
+			CVT_STORE_S32_S8(c_int32_3p3,3,3);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+			// c[4,32-47]
+			CVT_STORE_S32_S8(c_int32_4p2,4,2);
+
+			// c[4,48-63]
+			CVT_STORE_S32_S8(c_int32_4p3,4,3);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+
+			// c[5,32-47]
+			CVT_STORE_S32_S8(c_int32_5p2,5,2);
+
+			// c[5,48-63]
+			CVT_STORE_S32_S8(c_int32_5p3,5,3);
+		}
+		// Case where the output C matrix is s32 or is the temp buffer used to
+		// store intermediate s32 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[0,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 );
+
+			// c[0,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[1,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 );
+
+			// c[1,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+
+			// c[2,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 );
+
+			// c[2,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_int32_2p3 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[3,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 );
+
+			// c[3,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[4,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 );
+
+			// c[4,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+
+			// c[5,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 );
+
+			// c[5,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_int32_5p3 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			// When the A matrix is packed, cs_a is set to 24, since the next
+			// column in a given row is accessed after 4*6 elements, where 6
+			// is MR and 4 elements are broadcast each time from A (vnni).
+			// In the fringe case, where m < MR, the next column is m'*4
+			// elements away, so cs_a must be adjusted as below before
+			// calling the m fringe kernels.
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_s8s8s32os32_5x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_s8s8s32os32_4x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_s8s8s32os32_3x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_s8s8s32os32_2x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_s8s8s32os32_1x64
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c
new file mode 100644
index 0000000000..a338484df6
--- /dev/null
+++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_m_fringe_s8_amd512vnni.c
@@ -0,0 +1,3349 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s32/lpgemm_s32_kern_macros.h"
+#include "../u8s8s32/lpgemm_s32_memcpy_macros.h"
+
+// 5x64 int8o32 kernel
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5x64_DISABLE,
+						  &&POST_OPS_BIAS_5x64,
+						  &&POST_OPS_RELU_5x64,
+						  &&POST_OPS_RELU_SCALE_5x64,
+						  &&POST_OPS_GELU_TANH_5x64,
+						  &&POST_OPS_GELU_ERF_5x64,
+						  &&POST_OPS_CLIP_5x64,
+						  &&POST_OPS_DOWNSCALE_5x64
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+	__m512i c_int32_2p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+	__m512i c_int32_3p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+	__m512i c_int32_4p1 = _mm512_setzero_epi32();
+	__m512i c_int32_4p2 = _mm512_setzero_epi32();
+	__m512i c_int32_4p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+		c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast the k remainder of a[0,:] (masked load; bytes past the remainder are zeroed).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast the k remainder of a[1,:] (masked load; bytes past the remainder are zeroed).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast the k remainder of a[2,:] (masked load; bytes past the remainder are zeroed).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast the k remainder of a[3,:] (masked load; bytes past the remainder are zeroed).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+
+		// Broadcast the k remainder of a[4,:] (masked load; bytes past the remainder are zeroed).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+		c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
+	}
+
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		//Subtract B matrix sum column values to compensate
+		//for addition of 128 to A matrix elements
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+		c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+		c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+		c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+		c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+		c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 );
+		c_int32_4p2 = _mm512_sub_epi32( c_int32_4p2 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 48 );
+
+		c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 );
+		c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 );
+		c_int32_2p3 = _mm512_sub_epi32( c_int32_2p3 , b0 );
+		c_int32_3p3 = _mm512_sub_epi32( c_int32_3p3 , b0 );
+		c_int32_4p3 = _mm512_sub_epi32( c_int32_4p3 , b0 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+		c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+		c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+		c_int32_4p3 = _mm512_mullo_epi32( selector1, c_int32_4p3 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,4,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,4,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x64:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		// c[4, 16-31]
+		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+		// c[4,48-63]
+		c_int32_4p3 = _mm512_add_epi32( a_int32_1, c_int32_4p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		// c[4,16-31]
+		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+		// c[4,48-63]
+		c_int32_4p3 = _mm512_max_epi32( selector1, c_int32_4p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		// c[3, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+		// c[4, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+		// c[4, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x64:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_3p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_4p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x64:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		// c[3, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_3p3, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+		// c[4, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+		// c[4, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_4p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x64:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		// c[3, 48-63]
+		CLIP_S32_AVX512(c_int32_3p3, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+		// c[4, 32-47]
+		CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+		// c[4, 48-63]
+		CLIP_S32_AVX512(c_int32_4p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x64:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[2, 48-63]
+		CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[3, 48-63]
+		CVT_MULRND_CVT32(c_int32_3p3,a_int32_1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[4, 32-47]
+		CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+		// c[4, 48-63]
+		CVT_MULRND_CVT32(c_int32_4p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x64_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[2,48-63]
+		CVT_STORE_S32_S8(c_int32_2p3,2,3);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+		// c[3,48-63]
+		CVT_STORE_S32_S8(c_int32_3p3,3,3);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+		// c[4,16-31]
+		CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+		// c[4,32-47]
+		CVT_STORE_S32_S8(c_int32_4p2,4,2);
+
+		// c[4,48-63]
+		CVT_STORE_S32_S8(c_int32_4p3,4,3);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+
+		// c[3,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
+
+		// c[4,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 );
+
+		// c[4,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 );
+	}
+}
+
+// 4x64 s8s8s32os32 M-fringe kernel: int8 A and B, int32 accumulation, computing a 4-row x 64-column tile of C.
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x64)
+{
+	static void* post_ops_labels[] = // Label addresses (GCC "labels as values"), one per post-op handler below; presumably indexed by the POST_OP_LABEL_* jump macros - confirm.
+						{
+						  &&POST_OPS_4x64_DISABLE,
+						  &&POST_OPS_BIAS_4x64,
+						  &&POST_OPS_RELU_4x64,
+						  &&POST_OPS_RELU_SCALE_4x64,
+						  &&POST_OPS_GELU_TANH_4x64,
+						  &&POST_OPS_GELU_ERF_4x64,
+						  &&POST_OPS_CLIP_4x64,
+						  &&POST_OPS_DOWNSCALE_4x64
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-elements.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements (0-3), handled after the main loop.
+
+	// B matrix storage: four 64-byte vectors, one per 16-column slice of the tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage: two registers used alternately for the broadcast rows of A;
+	// they are also reused as scratch by the BIAS and DOWNSCALE post-ops below.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8); // 128 in every byte; added to A so it can be the unsigned operand of dpbusd.
+
+	// Registers to use for accumulating C (c_int32_<row>p<16-column slice>).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+	__m512i c_int32_2p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+	__m512i c_int32_3p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI: add 128 to every byte (wraps mod 256).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4] (issued between the row-0 dot-products; presumably to overlap the load with them - confirm).
+		a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI.
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI.
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+	}
+	// Handle k remainder: same pattern as the main loop, but only k_partial_pieces (1-3) bytes of each A row are read.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one bit per leftover byte of A.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked-load the leftover bytes of row 0 of A (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI. NOTE(review): the zero-filled bytes become 128 here; presumably the matching k rows of packed B are zero so they add nothing - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[0,0-63] += a[0,k remainder]*b[k remainder,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Masked-load and broadcast the leftover bytes of row 1 of A.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI.
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[1,0-63] += a[1,k remainder]*b[k remainder,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Masked-load and broadcast the leftover bytes of row 2 of A.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[2,0-63] += a[2,k remainder]*b[k remainder,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Masked-load and broadcast the leftover bytes of row 3 of A.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI.
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[3,0-63] += a[3,k remainder]*b[k remainder,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+	}
+
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract the per-column B sums (64 int32 values at b_col_sum_vec + b_sum_offset) from every row
+		// to compensate for the 128 added to each A element; presumably the sums are pre-scaled by 128 - confirm.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+		c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+		c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+		c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 48 );
+
+		c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 );
+		c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 );
+		c_int32_2p3 = _mm512_sub_epi32( c_int32_2p3 , b0 );
+		c_int32_3p3 = _mm512_sub_epi32( c_int32_3p3 , b0 );
+
+	}
+
+	// Broadcast alpha and beta; selector1/selector2 are reused as scratch registers by the post-ops and the store path.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+		c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). The *_BETA_OP4 macros are defined elsewhere; presumably the S8 variant reads the existing C values from the int8 downscale buffer - confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,3,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,3,selector1,selector2);
+		}
+	}
+
+	// Post Ops: start at the head of the post-op list. The jump macros are defined elsewhere; presumably they dispatch through post_ops_labels - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x64: // Adds the same 64 per-column int32 bias values (op_args1, offset post_op_c_j) to all 4 rows.
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x64: // Replaces every accumulator lane with max(lane, 0).
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x64: // Scale is the int32 at op_args2. The macro is defined elsewhere; presumably it uses selector1 (zero), selector2 (scale) and relu_cmp_mask by name - confirm.
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		// c[3, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x64: // Applies GELU_TANH_S32_AVX512 (defined elsewhere) to every accumulator; the locals below are passed to it as scratch.
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_3p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x64: // Applies GELU_ERF_S32_AVX512 (defined elsewhere) to every accumulator; the locals below are passed to it as scratch.
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		// c[3, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_3p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x64: // Bounds are read as int32 from op_args2 (min) and op_args3 (max); CLIP_S32_AVX512 is defined elsewhere.
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		// c[3, 48-63]
+		CLIP_S32_AVX512(c_int32_3p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x64: // Loads 64 per-column float scale factors (raw bits held in integer registers) and applies CVT_MULRND_CVT32 (defined elsewhere) to every accumulator.
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[2, 48-63]
+		CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[3, 48-63]
+		CVT_MULRND_CVT32(c_int32_3p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x64_DISABLE: // First entry of post_ops_labels; presumably where the jump macros land once no post-op remains - confirm. Falls through to the stores.
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in all 16 lanes). It is not referenced directly below; presumably CVT_STORE_S32_S8 uses it by name - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[2,48-63]
+		CVT_STORE_S32_S8(c_int32_2p3,2,3);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+		// c[3,48-63]
+		CVT_STORE_S32_S8(c_int32_3p3,3,3);
+	}
+	else
+	{
+		// Store the int32 results to c: row i starts at c + rs_c * i, and each vector covers 16 consecutive elements.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+
+		// c[3,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 );
+	}
+}
+
+// 3x64 int8o32 kernel: accumulates a 3-row x 64-column tile of C in int32 from int8 A and B, then applies alpha/beta scaling, the post-op chain and the final store.
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x64)
+{
+	static void* post_ops_labels[] = // Label addresses (GCC "labels as values"); presumably indexed by the POST_OP_LABEL_* jump macros below - confirm against their definition.
+						{
+						  &&POST_OPS_3x64_DISABLE,
+						  &&POST_OPS_BIAS_3x64,
+						  &&POST_OPS_RELU_3x64,
+						  &&POST_OPS_RELU_SCALE_3x64,
+						  &&POST_OPS_GELU_TANH_3x64,
+						  &&POST_OPS_GELU_ERF_3x64,
+						  &&POST_OPS_CLIP_3x64,
+						  &&POST_OPS_DOWNSCALE_3x64
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-values.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-values (0-3), handled after the main loop.
+
+	// B matrix storage: one register per 16-column slice of the 64-column tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage: two registers used alternately for the 3 rows; also reused as scratch by the bias and downscale post-ops.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+    	uint8_t cvt_uint8 = 128; // Offset added to every A byte before the multiply.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C, named c_int32_<row>p<16-column slice>; all start at zero.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+	__m512i c_int32_2p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) // Main loop: one group of 4 k-values per iteration; A broadcasts are interleaved with the multiplies.
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a *  0 ) + ( cs_a * kr ) ) );
+
+        	// Convert signed int8 to uint8 for VNNI by adding 128 to every byte (dpbusd treats its first multiplicand as unsigned).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+        	// Convert signed int8 to uint8 for VNNI (add 128 to every byte).
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4]; a_int32_0 is reused because row 0 is finished for this iteration.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+        	// Convert signed int8 to uint8 for VNNI (add 128 to every byte).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+	}
+	// Handle k remainder (1 to 3 leftover k-values), using the same schedule as the main loop.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one bit per A byte to load.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Load the remaining k_partial_pieces bytes of row 0 of A (other bytes are zeroed by the masked load) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	// Convert signed int8 to uint8 for VNNI. NOTE(review): the zero-filled bytes also become 128; presumably B holds zeros at the matching k positions - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[0,0-63] += a[0,k remainder]*b[k remainder,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Load and broadcast the remaining k_partial_pieces bytes of row 1 of A.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	// Convert signed int8 to uint8 for VNNI (add 128 to every byte).
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[1,0-63] += a[1,k remainder]*b[k remainder,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Load and broadcast the remaining k_partial_pieces bytes of row 2 of A (a_int32_0 is reused).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	// Convert signed int8 to uint8 for VNNI (add 128 to every byte).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[2,0-63] += a[2,k remainder]*b[k remainder,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+	}
+
+	if ( post_ops_attr.is_last_k == 1 ) // The compensation is applied once, only when is_last_k is set.
+	{
+		// Subtract the B matrix column-sum values to compensate
+		// for the addition of 128 to the A matrix elements.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; // NOTE(review): subtracted as-is, so presumably already scaled by 128 - confirm where it is computed.
+
+		b0 = _mm512_loadu_si512( bsumptr); // b0 is reused as scratch for the column sums of columns 0-15.
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 ); // Columns 16-31.
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 ); // Columns 32-47.
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+		c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 48 ); // Columns 48-63.
+
+		c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 );
+		c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 );
+		c_int32_2p3 = _mm512_sub_epi32( c_int32_2p3 , b0 );
+	}
+
+	// Load alpha and beta, broadcast to every 32-bit lane.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 ) // The multiply is skipped when alpha is 1.
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+	}
+
+	// Scale C by beta (skipped entirely when beta is 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) && // Downscale buffer present and first k block: use the S8 variant of the beta macro.
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,2,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,2,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled block below handles one post-op type and ends with a jump macro (presumably to the next entry's label - confirm in the macro).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x64: // Add int32 bias values, read from op_args1 at column offset post_op_c_j, to every row.
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x64: // ReLU: replace every accumulator lane with max(0, value).
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x64: // Scaled ReLU; selector1 (zero), selector2 (scale from op_args2) and relu_cmp_mask are presumably read by the macro by name - confirm.
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x64: // GELU (tanh form) applied to each accumulator through GELU_TANH_S32_AVX512; the locals below are scratch passed to the macro.
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x64: // GELU (erf form) applied to each accumulator through GELU_ERF_S32_AVX512; the locals below are scratch passed to the macro.
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x64: // Clip every accumulator with the bounds broadcast below (min from op_args2, max from op_args3).
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x64: // Loads 64 float scale factors as raw bits into integer registers; CVT_MULRND_CVT32 presumably reinterprets them as float - confirm in the macro.
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[2, 48-63]
+		CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x64_DISABLE: // First entry of post_ops_labels; execution continues with the store below.
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 ); // Not referenced explicitly below; presumably read by CVT_STORE_S32_S8 - confirm.
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[2,48-63]
+		CVT_STORE_S32_S8(c_int32_2p3,2,3);
+	}
+	else
+	{
+		// Store the results directly into c, one 512-bit register per 16-column slice.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
+	}
+}
+
+// 2x64 int8o32 kernel: accumulates a 2-row x 64-column tile of C in int32 from int8 A and B, then applies alpha/beta scaling, the post-op chain and the final store.
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x64)
+{
+	static void* post_ops_labels[] = // Label addresses (GCC "labels as values"); presumably indexed by the POST_OP_LABEL_* jump macros below - confirm against their definition.
+						{
+						  &&POST_OPS_2x64_DISABLE,
+						  &&POST_OPS_BIAS_2x64,
+						  &&POST_OPS_RELU_2x64,
+						  &&POST_OPS_RELU_SCALE_2x64,
+						  &&POST_OPS_GELU_TANH_2x64,
+						  &&POST_OPS_GELU_ERF_2x64,
+						  &&POST_OPS_CLIP_2x64,
+						  &&POST_OPS_DOWNSCALE_2x64
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-values.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-values (0-3), handled after the main loop.
+
+	// B matrix storage: one register per 16-column slice of the 64-column tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage: one register per row; also reused as scratch by the bias and downscale post-ops.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+    	uint8_t cvt_uint8 = 128; // Offset added to every A byte before the multiply.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C, named c_int32_<row>p<16-column slice>; all start at zero.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 ) // Main loop: one group of 4 k-values per iteration; the row-1 broadcast is interleaved with the row-0 multiplies.
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+        	// Convert signed int8 to uint8 for VNNI by adding 128 to every byte (dpbusd treats its first multiplicand as unsigned).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+        	// Convert signed int8 to uint8 for VNNI (add 128 to every byte).
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+	}
+	// Handle k remainder (1 to 3 leftover k-values), using the same schedule as the main loop.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one bit per A byte to load.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Load the remaining k_partial_pieces bytes of row 0 of A (other bytes are zeroed by the masked load) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	// Convert signed int8 to uint8 for VNNI. NOTE(review): the zero-filled bytes also become 128; presumably B holds zeros at the matching k positions - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[0,0-63] += a[0,k remainder]*b[k remainder,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Load and broadcast the remaining k_partial_pieces bytes of row 1 of A.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	// Convert signed int8 to uint8 for VNNI (add 128 to every byte).
+		a_int32_1 = _mm512_add_epi8( a_int32_1, vec_uint8 );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul on the k remainder.
+		// c[1,0-63] += a[1,k remainder]*b[k remainder,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+	}
+
+	if ( post_ops_attr.is_last_k == 1 ) // The compensation is applied once, only when is_last_k is set.
+	{
+		// Subtract the B matrix column-sum values to compensate
+		// for the addition of 128 to the A matrix elements.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; // NOTE(review): subtracted as-is, so presumably already scaled by 128 - confirm where it is computed.
+
+		b0 = _mm512_loadu_si512( bsumptr); // b0 is reused as scratch for the column sums of columns 0-15.
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 ); // Columns 16-31.
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 ); // Columns 32-47.
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 48 ); // Columns 48-63.
+
+		c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 );
+		c_int32_1p3 = _mm512_sub_epi32( c_int32_1p3 , b0 );
+	}
+
+	// Load alpha and beta, broadcast to every 32-bit lane.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 ) // The multiply is skipped when alpha is 1.
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+	}
+
+	// Scale C by beta (skipped entirely when beta is 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) && // Downscale buffer present and first k block: use the S8 variant of the beta macro.
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled block below handles one post-op type and ends with a jump macro (presumably to the next entry's label - confirm in the macro).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x64: // Add int32 bias values, read from op_args1 at column offset post_op_c_j, to every row.
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x64: // ReLU: replace every accumulator lane with max(0, value).
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x64: // Scaled ReLU; selector1 (zero), selector2 (scale from op_args2) and relu_cmp_mask are presumably read by the macro by name - confirm.
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x64: // GELU (tanh form) applied to each accumulator through GELU_TANH_S32_AVX512; the locals below are scratch passed to the macro.
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x64: // GELU (erf form) applied to each accumulator through GELU_ERF_S32_AVX512; the locals below are scratch passed to the macro.
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x64: // Clip every accumulator with the bounds broadcast below (min from op_args2, max from op_args3).
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x64: // Loads 64 float scale factors as raw bits into integer registers; CVT_MULRND_CVT32 presumably reinterprets them as float - confirm in the macro.
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x64_DISABLE: // First entry of post_ops_labels; execution continues with the store below.
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 ); // Not referenced explicitly below; presumably read by CVT_STORE_S32_S8 - confirm.
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+	}
+	else
+	{
+		// Store the results directly into c, one 512-bit register per 16-column slice.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+	}
+}
+
+// 1x64 s8s8s32os32 m-fringe kernel: accumulates a single row of C (64 int32 columns held in four zmm registers) with VNNI dpbusd, then applies alpha/beta, the post-op chain and the final store.
+LPGEMM_M_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x64_DISABLE,
+						  &&POST_OPS_BIAS_1x64,
+						  &&POST_OPS_RELU_1x64,
+						  &&POST_OPS_RELU_SCALE_1x64,
+						  &&POST_OPS_GELU_TANH_1x64,
+						  &&POST_OPS_GELU_ERF_1x64,
+						  &&POST_OPS_CLIP_1x64,
+						  &&POST_OPS_DOWNSCALE_1x64
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: b0..b3 hold the four 16-column panels (cs_b * 0..3) of the current k group.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage. a_int32_1 is not used by the k loops; the bias and downscale post-ops use it as scratch.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C: c[0,0-15], c[0,16-31], c[0,32-47], c[0,48-63].
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4]: 4 bytes of A are read as one 32-bit word and replicated to every lane.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+        	// Convert signed int8 to uint8 for VNNI by adding 128 to every byte (dpbusd treats this operand as unsigned).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+                // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+	}
+	// Handle k remainder (k0 % 4 trailing values): A is loaded under a byte mask so the missing bytes are zeroed by the maskz load.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4], where only the first k_partial_pieces bytes come from memory.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        	// Convert signed int8 to uint8 for VNNI (same +128 byte offset as in the main loop).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+                // c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract the B column-sum vector to compensate for the 128 added to the A elements above.
+		// NOTE(review): b_col_sum_vec is presumably pre-scaled by 128 where it is produced - confirm.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 48 );
+
+		c_int32_0p3 = _mm512_sub_epi32( c_int32_0p3 , b0 );
+	}
+
+	// Load alpha and beta, broadcast to all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0)
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63] (S8 variant; presumably reads int8 C from the downscale buffer - confirm in the macro)
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: dispatch goes through the post_ops_labels table of label addresses (the jump macros are defined elsewhere).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x64:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x64:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x64:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x64:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x64:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15] (the float scale factors were loaded bit-for-bit into integer registers above)
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x64_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32); mask_all1 is presumably consumed inside CVT_STORE_S32_S8 - confirm.
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+	}
+	else
+	{
+		// Store the accumulated results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c
new file mode 100644
index 0000000000..c009bdeaf3
--- /dev/null
+++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_mn_fringe_s8_amd512vnni.c
@@ -0,0 +1,8056 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s32/lpgemm_s32_kern_macros.h"
+#include "../u8s8s32/lpgemm_s32_memcpy_macros.h"
+
+// 5xlt16 s8s8s32os32 mn-fringe kernel: 5 rows of C by n0_rem columns (fewer than 16, per the kernel name), one zmm accumulator per row; column tails are handled with a 16-bit lane mask built from n0_rem.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5xLT16_DISABLE,
+						  &&POST_OPS_BIAS_5xLT16,
+						  &&POST_OPS_RELU_5xLT16,
+						  &&POST_OPS_RELU_SCALE_5xLT16,
+						  &&POST_OPS_GELU_TANH_5xLT16,
+						  &&POST_OPS_GELU_ERF_5xLT16,
+						  &&POST_OPS_CLIP_5xLT16,
+						  &&POST_OPS_DOWNSCALE_5xLT16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	{
+		// Registers to use for accumulating C: c_int32_<row>p0 holds row <row>, columns 0-15.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4]: 4 bytes of A are read as one 32-bit word and replicated to every lane.
+			__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Convert signed int8 to uint8 for VNNI by adding 128 to every byte (dpbusd treats this operand as unsigned).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+            		// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+            		// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+            		// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+            		// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		}
+		// Handle k remainder (k0 % 4 trailing values): A is loaded under a byte mask so the missing bytes are zeroed by the maskz load.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4], where only the first k_partial_pieces bytes come from memory.
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Convert signed int8 to uint8 for VNNI (+128 per byte).
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			// Subtract the B column-sum vector to compensate for the 128 added to the A elements above.
+			// NOTE(review): this is an unmasked 16-lane load even though fewer than 16 columns are live; presumably the buffer is padded - confirm.
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			__m512i b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+			c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+
+		}
+
+		// Load alpha and beta, broadcast to all 16 int32 lanes.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		}
+
+		// Scale C by beta (skipped entirely when beta == 0); both variants take a lane mask covering the first n0_rem columns.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_4p0, 4, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, 0, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, 0, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_4p0, 0, 4, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops: dispatch goes through the post_ops_labels table of label addresses (the jump macros are defined elsewhere).
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15] (the same bias vector is added to every row)
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_5xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_5xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_5xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_5xLT16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_5xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_5xLT16:
+		{
+			// The float scale factors are loaded bit-for-bit through an integer masked load; no value conversion happens here.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			// c[4, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_5xLT16_DISABLE:
+		;
+
+		// Store the results: int8 via the downscale path on the last k iteration when buf_downscale is set, otherwise int32 into c.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32). Despite its name, mask_all1 here covers only the first n0_rem lanes; presumably the store macro reads it - confirm.
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results as int32; load_mask is used as the store mask so only the first n0_rem lanes of each row are written.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 2 ), load_mask, c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 3 ), load_mask, c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 4 ), load_mask, c_int32_4p0 );
+		}
+	}
+}
+
+// 4xlt16 int8o32 fringe kernel
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4xLT16_DISABLE,
+						  &&POST_OPS_BIAS_4xLT16,
+						  &&POST_OPS_RELU_4xLT16,
+						  &&POST_OPS_RELU_SCALE_4xLT16,
+						  &&POST_OPS_GELU_TANH_4xLT16,
+						  &&POST_OPS_GELU_ERF_4xLT16,
+						  &&POST_OPS_CLIP_4xLT16,
+						  &&POST_OPS_DOWNSCALE_4xLT16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			__m512i b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, 0, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, 0, 3, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_4xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_4xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_4xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_4xLT16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_4xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_4xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_4xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 2 ), load_mask, c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 3 ), load_mask, c_int32_3p0 );
+		}
+	}
+}
+
+// 3xlt16 int8o32 fringe kernel
+//
+// Computes a 3-row tile of C whose column count is n0_rem (fewer than 16, per
+// the "lt16" kernel name; every column mask below is built from n0_rem).
+// The function signature is produced by LPGEMM_MN_LT_NR0_FRINGE_KERN, which
+// is defined elsewhere; the body reads k0, a, rs_a, cs_a, b, rs_b, cs_b, c,
+// rs_c, alpha, beta, n0_rem, post_ops_list and post_ops_attr.
+//
+// Flow of the body:
+//   1. Accumulate A*B into three int32 vectors with _mm512_dpbusd_epi32,
+//      consuming k in groups of 4 bytes plus one masked remainder group.
+//      Every A byte has 128 added first (signed int8 -> uint8 for VNNI).
+//   2. When post_ops_attr.is_last_k is set, subtract the values read from
+//      post_ops_attr.b_col_sum_vec to compensate for the +128 bias.
+//   3. Scale by alpha (skipped when alpha == 1) and add beta * C (skipped
+//      when beta == 0).
+//   4. Run the post-op chain through the labels listed in post_ops_labels.
+//   5. Store the tile with a masked store, either through CVT_STORE_S32_S8
+//      (downscale path) or as int32 into c.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3xlt16)
+{
+	// Addresses of the post-op labels below (GNU labels-as-values).
+	// NOTE(review): presumably consumed by the POST_OP_LABEL_* macros, so the
+	// entry order has to match the post-op codes they index with - confirm.
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3xLT16_DISABLE,
+						  &&POST_OPS_BIAS_3xLT16,
+						  &&POST_OPS_RELU_3xLT16,
+						  &&POST_OPS_RELU_SCALE_3xLT16,
+						  &&POST_OPS_GELU_TANH_3xLT16,
+						  &&POST_OPS_GELU_ERF_3xLT16,
+						  &&POST_OPS_CLIP_3xLT16,
+						  &&POST_OPS_DOWNSCALE_3xLT16
+						};
+	// Number of complete 4-element k groups, and the leftover k count (0-3).
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// 0x80 in every byte lane; added to A to bias each signed int8 by +128.
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		// Main k loop: each iteration consumes one 4-byte group of k.
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// One full 64-byte vector of B for this k group, shared by all rows.
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			// load_mask enables only the first k_partial_pieces bytes of A; the
+			// zero-masking load clears every other byte of a_kfringe_buf.
+			// NOTE(review): the cleared bytes become 128 after the +128 bias
+			// below, so this presumably relies on the matching B bytes being
+			// zero - confirm against the B packing routine.
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			// B is still read as a full, unmasked 64-byte vector here.
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			// NOTE(review): this is an unmasked 16-lane load although only
+			// n0_rem columns are valid; it assumes 16 int32 entries are readable
+			// at bsumptr - confirm how b_col_sum_vec is sized.
+			__m512i b0 = _mm512_loadu_si512( bsumptr );
+
+			// The same per-column correction applies to every row.
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		}
+
+		// Load alpha and beta
+		// selector1/selector2 are overwritten and reused as scratch registers by
+		// the post-op blocks further down.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			// With a downscale buffer on the first k pass the S8 beta macro is
+			// used; in every other case the S32 beta macro is used. Both macros
+			// are defined elsewhere.
+			// NOTE(review): presumably the S8 variant reads the existing C values
+			// from the int8 downscale buffer instead of c - confirm.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// Column mask: low n0_rem lanes enabled.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				// Column mask: low n0_rem lanes enabled.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, 0, 2, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops
+		// NOTE(review): the POST_OP_LABEL_* macros are defined elsewhere;
+		// presumably they jump to the label of the current / next entry in the
+		// post-op list and fall to the DISABLE label when none applies - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3xLT16:
+		{
+			// Masked load of n0_rem int32 values from op_args1 starting at
+			// column offset post_op_c_j; disabled lanes are zero.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_3xLT16:
+		{
+			// max(0, x) on every lane.
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_3xLT16:
+		{
+			// selector1 = 0, selector2 = scalar read from op_args2.
+			// NOTE(review): RELU_SCALE_OP_S32_AVX512 presumably picks up
+			// selector1, selector2 and relu_cmp_mask by name - confirm before
+			// renaming any of them.
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_3xLT16:
+		{
+			// Scratch registers handed to the GELU_TANH macro.
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_3xLT16:
+		{
+			// Scratch registers handed to the GELU_ERF macro.
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_3xLT16:
+		{
+			// Bounds are scalars read from op_args2 (min) and op_args3 (max).
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_3xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			// The float scale factors are loaded bit-for-bit through an integer
+			// masked load into selector1 (n0_rem lanes, offset post_op_c_j).
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_3xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Despite its name this is the n0_rem column mask, not all ones.
+			// NOTE(review): CVT_STORE_S32_S8 presumably refers to mask_all1 by
+			// name, which would explain the naming - confirm.
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+		}
+		else
+		{
+			// load_mask is used as the store mask here: only the low n0_rem
+			// int32 lanes of each row are written to c.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 2 ), load_mask, c_int32_2p0 );
+		}
+	}
+}
+
+// 2xlt16 int8o32 fringe kernel
+//
+// Computes a 2-row tile of C whose column count is n0_rem (fewer than 16, per
+// the "lt16" kernel name; every column mask below is built from n0_rem).
+// The function signature is produced by LPGEMM_MN_LT_NR0_FRINGE_KERN, which
+// is defined elsewhere; the body reads k0, a, rs_a, cs_a, b, rs_b, cs_b, c,
+// rs_c, alpha, beta, n0_rem, post_ops_list and post_ops_attr.
+//
+// Flow of the body:
+//   1. Accumulate A*B into two int32 vectors with _mm512_dpbusd_epi32,
+//      consuming k in groups of 4 bytes plus one masked remainder group.
+//      Every A byte has 128 added first (signed int8 -> uint8 for VNNI).
+//   2. When post_ops_attr.is_last_k is set, subtract the values read from
+//      post_ops_attr.b_col_sum_vec to compensate for the +128 bias.
+//   3. Scale by alpha (skipped when alpha == 1) and add beta * C (skipped
+//      when beta == 0).
+//   4. Run the post-op chain through the labels listed in post_ops_labels.
+//   5. Store the tile with a masked store, either through CVT_STORE_S32_S8
+//      (downscale path) or as int32 into c.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2xlt16)
+{
+	// Addresses of the post-op labels below (GNU labels-as-values).
+	// NOTE(review): presumably consumed by the POST_OP_LABEL_* macros, so the
+	// entry order has to match the post-op codes they index with - confirm.
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2xLT16_DISABLE,
+						  &&POST_OPS_BIAS_2xLT16,
+						  &&POST_OPS_RELU_2xLT16,
+						  &&POST_OPS_RELU_SCALE_2xLT16,
+						  &&POST_OPS_GELU_TANH_2xLT16,
+						  &&POST_OPS_GELU_ERF_2xLT16,
+						  &&POST_OPS_CLIP_2xLT16,
+						  &&POST_OPS_DOWNSCALE_2xLT16
+						};
+	// Number of complete 4-element k groups, and the leftover k count (0-3).
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// 0x80 in every byte lane; added to A to bias each signed int8 by +128.
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		// Main k loop: each iteration consumes one 4-byte group of k.
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// One full 64-byte vector of B for this k group, shared by both rows.
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			// load_mask enables only the first k_partial_pieces bytes of A; the
+			// zero-masking load clears every other byte of a_kfringe_buf.
+			// NOTE(review): the cleared bytes become 128 after the +128 bias
+			// below, so this presumably relies on the matching B bytes being
+			// zero - confirm against the B packing routine.
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			// B is still read as a full, unmasked 64-byte vector here.
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			// NOTE(review): this is an unmasked 16-lane load although only
+			// n0_rem columns are valid; it assumes 16 int32 entries are readable
+			// at bsumptr - confirm how b_col_sum_vec is sized.
+			__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+			// The same per-column correction applies to every row.
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		}
+
+		// Load alpha and beta
+		// selector1/selector2 are overwritten and reused as scratch registers by
+		// the post-op blocks further down.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			// With a downscale buffer on the first k pass the S8 beta macro is
+			// used; in every other case the S32 beta macro is used. Both macros
+			// are defined elsewhere.
+			// NOTE(review): presumably the S8 variant reads the existing C values
+			// from the int8 downscale buffer instead of c - confirm.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// Column mask: low n0_rem lanes enabled.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				// Column mask: low n0_rem lanes enabled.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops
+		// NOTE(review): the POST_OP_LABEL_* macros are defined elsewhere;
+		// presumably they jump to the label of the current / next entry in the
+		// post-op list and fall to the DISABLE label when none applies - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2xLT16:
+		{
+			// Masked load of n0_rem int32 values from op_args1 starting at
+			// column offset post_op_c_j; disabled lanes are zero.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_2xLT16:
+		{
+			// max(0, x) on every lane.
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_2xLT16:
+		{
+			// selector1 = 0, selector2 = scalar read from op_args2.
+			// NOTE(review): RELU_SCALE_OP_S32_AVX512 presumably picks up
+			// selector1, selector2 and relu_cmp_mask by name - confirm before
+			// renaming any of them.
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_2xLT16:
+		{
+			// Scratch registers handed to the GELU_TANH macro.
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_2xLT16:
+		{
+			// Scratch registers handed to the GELU_ERF macro.
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_2xLT16:
+		{
+			// Bounds are scalars read from op_args2 (min) and op_args3 (max).
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_2xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			// The float scale factors are loaded bit-for-bit through an integer
+			// masked load into selector1 (n0_rem lanes, offset post_op_c_j).
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_2xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Despite its name this is the n0_rem column mask, not all ones.
+			// NOTE(review): CVT_STORE_S32_S8 presumably refers to mask_all1 by
+			// name, which would explain the naming - confirm.
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+		}
+		else
+		{
+			// load_mask is used as the store mask here: only the low n0_rem
+			// int32 lanes of each row are written to c.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+		}
+	}
+}
+
+// 1xlt16 int8o32 fringe kernel
+//
+// Computes a single-row tile of C whose column count is n0_rem (fewer than
+// 16, per the "lt16" kernel name; every column mask below is built from
+// n0_rem). The function signature is produced by
+// LPGEMM_MN_LT_NR0_FRINGE_KERN, which is defined elsewhere; the body reads
+// k0, a, rs_a, cs_a, b, rs_b, cs_b, c, rs_c, alpha, beta, n0_rem,
+// post_ops_list and post_ops_attr.
+//
+// Flow of the body:
+//   1. Accumulate A*B into one int32 vector with _mm512_dpbusd_epi32,
+//      consuming k in groups of 4 bytes plus one masked remainder group.
+//      Every A byte has 128 added first (signed int8 -> uint8 for VNNI).
+//   2. When post_ops_attr.is_last_k is set, subtract the values read from
+//      post_ops_attr.b_col_sum_vec to compensate for the +128 bias.
+//   3. Scale by alpha (skipped when alpha == 1) and add beta * C (skipped
+//      when beta == 0).
+//   4. Run the post-op chain through the labels listed in post_ops_labels.
+//   5. Store the tile with a masked store, either through CVT_STORE_S32_S8
+//      (downscale path) or as int32 into c.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1xlt16)
+{
+	// Addresses of the post-op labels below (GNU labels-as-values).
+	// NOTE(review): presumably consumed by the POST_OP_LABEL_* macros, so the
+	// entry order has to match the post-op codes they index with - confirm.
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1xLT16_DISABLE,
+						  &&POST_OPS_BIAS_1xLT16,
+						  &&POST_OPS_RELU_1xLT16,
+						  &&POST_OPS_RELU_SCALE_1xLT16,
+						  &&POST_OPS_GELU_TANH_1xLT16,
+						  &&POST_OPS_GELU_ERF_1xLT16,
+						  &&POST_OPS_CLIP_1xLT16,
+						  &&POST_OPS_DOWNSCALE_1xLT16
+						};
+	// Number of complete 4-element k groups, and the leftover k count (0-3).
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// 0x80 in every byte lane; added to A to bias each signed int8 by +128.
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		// Main k loop: each iteration consumes one 4-byte group of k.
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// One full 64-byte vector of B for this k group.
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			// load_mask enables only the first k_partial_pieces bytes of A; the
+			// zero-masking load clears every other byte of a_kfringe_buf.
+			// NOTE(review): the cleared bytes become 128 after the +128 bias
+			// below, so this presumably relies on the matching B bytes being
+			// zero - confirm against the B packing routine.
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			// B is still read as a full, unmasked 64-byte vector here.
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			// NOTE(review): this is an unmasked 16-lane load although only
+			// n0_rem columns are valid; it assumes 16 int32 entries are readable
+			// at bsumptr - confirm how b_col_sum_vec is sized.
+			__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		}
+
+		// Load alpha and beta
+		// selector1/selector2 are overwritten and reused as scratch registers by
+		// the post-op blocks further down.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			// With a downscale buffer on the first k pass the S8 beta macro is
+			// used; in every other case the S32 beta macro is used. Both macros
+			// are defined elsewhere.
+			// NOTE(review): presumably the S8 variant reads the existing C values
+			// from the int8 downscale buffer instead of c - confirm.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// Column mask: low n0_rem lanes enabled.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				// Column mask: low n0_rem lanes enabled.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops
+		// NOTE(review): the POST_OP_LABEL_* macros are defined elsewhere;
+		// presumably they jump to the label of the current / next entry in the
+		// post-op list and fall to the DISABLE label when none applies - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1xLT16:
+		{
+			// Masked load of n0_rem int32 values from op_args1 starting at
+			// column offset post_op_c_j; disabled lanes are zero.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_1xLT16:
+		{
+			// max(0, x) on every lane.
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_1xLT16:
+		{
+			// selector1 = 0, selector2 = scalar read from op_args2.
+			// NOTE(review): RELU_SCALE_OP_S32_AVX512 presumably picks up
+			// selector1, selector2 and relu_cmp_mask by name - confirm before
+			// renaming any of them.
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_1xLT16:
+		{
+			// Scratch registers handed to the GELU_TANH macro.
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_1xLT16:
+		{
+			// Scratch registers handed to the GELU_ERF macro.
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_1xLT16:
+		{
+			// Bounds are scalars read from op_args2 (min) and op_args3 (max).
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_1xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			// The float scale factors are loaded bit-for-bit through an integer
+			// masked load into selector1 (n0_rem lanes, offset post_op_c_j).
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_1xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Despite its name this is the n0_rem column mask, not all ones.
+			// NOTE(review): CVT_STORE_S32_S8 presumably refers to mask_all1 by
+			// name, which would explain the naming - confirm.
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+		}
+		else
+		{
+			// load_mask is used as the store mask here: only the low n0_rem
+			// int32 lanes of the row are written to c.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+		}
+	}
+}
+
+// 5x16 int8o32 kernel
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x16)
+{ // Fringe kernel for a 5-row x 16-column tile of C: signed-int8 A and B, int32 accumulation, then alpha/beta, post-ops and store.
+	static void* post_ops_labels[] = // Addresses of the post-op handler labels below (GNU "labels as values").
+						{
+						  &&POST_OPS_5x16_DISABLE,
+						  &&POST_OPS_BIAS_5x16,
+						  &&POST_OPS_RELU_5x16,
+						  &&POST_OPS_RELU_SCALE_5x16,
+						  &&POST_OPS_GELU_TANH_5x16,
+						  &&POST_OPS_GELU_ERF_5x16,
+						  &&POST_OPS_CLIP_5x16,
+						  &&POST_OPS_DOWNSCALE_5x16
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-values.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-values (0-3), handled after the main loop.
+
+	uint8_t cvt_uint8 = 128; // Offset that shifts signed int8 into the unsigned range.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8); // 128 replicated into all 64 byte lanes.
+
+	// Accumulators for C: one zero-initialised vector of 16 int32 lanes per row of the 5x16 tile.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // NOTE(review): assumes packed B keeps 4 consecutive k-values of one column per 32-bit lane - confirm.
+
+		// Broadcast a[0,kr:kr+4]: four consecutive int8 values of row 0 are read as one 32-bit word and replicated to all 16 lanes.
+		__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI: adding 128 to every byte maps [-128,127] onto [0,255], as dpbusd treats this operand as unsigned.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the trailing k0 % 4 values (1-3) that do not fill a complete group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: only the valid A bytes are loaded, the rest read as zero.
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Full 64-byte load; NOTE(review): assumes packed B is padded to a whole group of 4 - confirm.
+
+		// Broadcast a[0, k remainder]: masked load of the valid bytes, then the low 32 bits are replicated to all 16 lanes.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI; the masked-out (zero) bytes also become 128 here.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract the per-column B sums to compensate
+		// for the 128 added to every A element above; done only when is_last_k is set.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; // NOTE(review): subtracted as-is, so presumably already scaled by 128 - confirm where it is filled.
+
+		__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+		c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped for alpha == 1, where the multiply would change nothing).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+	}
+
+	// Scale C by beta (skipped when beta == 0); the *_BETA_OP macros doing the work are defined outside this chunk.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] -- NOTE(review): the S8 variant presumably reads the existing C values as int8 from buf_downscale; confirm in the macro.
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S8_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S8_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+
+			// c[4:0-15]
+			S8_S32_BETA_OP(c_int32_4p0,0,4,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15] -- S32 variant, taken when buf_downscale is NULL or is_first_k is not TRUE.
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S32_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S32_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+
+			// c[4:0-15]
+			S32_S32_BETA_OP(c_int32_4p0,0,4,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops. NOTE(review): the POST_OP_LABEL_* macros presumably jump through post_ops_labels and advance post_ops_list_temp - confirm in their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x16:
+	{
+		selector1 = // 16 int32 values read from op_args1 at offset post_op_c_j, added to every row below.
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x16:
+	{
+		selector1 = _mm512_setzero_epi32(); // Zero vector: each lane below becomes max(0, value).
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // int32 value read through op_args2, broadcast to all lanes.
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // NOTE(review): not used directly here; presumably referenced by name, with selector1/selector2, inside RELU_SCALE_OP_S32_AVX512 - confirm.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Scratch registers handed to the GELU_TANH_S32_AVX512 macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x16:
+	{
+		__m512 x, r, y, x_erf; // Scratch registers handed to the GELU_ERF_S32_AVX512 macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // Lower bound, read through op_args2.
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // Upper bound, read through op_args3.
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x16:
+	{
+		selector1 = // 16 floats from scale_factor at offset post_op_c_j, held bit-for-bit in an integer register.
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15] -- NOTE(review): CVT_MULRND_CVT32 presumably reinterprets selector1 as floats before scaling; confirm in the macro.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: the comparison 0 < 10 holds in every lane.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 ); // NOTE(review): not referenced directly below; presumably consumed by CVT_STORE_S32_S8 - confirm.
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+	}
+	else
+	{
+		// Store the results unconverted: one 512-bit vector per row, row i written at c + rs_c * i.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+	}
+}
+
+// 4x16 int8o32 kernel: fringe kernel for a 4-row x 16-column tile of C (signed-int8 A and B, int32 accumulation, then alpha/beta, post-ops and store).
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x16)
+{
+	static void* post_ops_labels[] = // Addresses of the post-op handler labels below (GNU "labels as values").
+						{
+						  &&POST_OPS_4x16_DISABLE,
+						  &&POST_OPS_BIAS_4x16,
+						  &&POST_OPS_RELU_4x16,
+						  &&POST_OPS_RELU_SCALE_4x16,
+						  &&POST_OPS_GELU_TANH_4x16,
+						  &&POST_OPS_GELU_ERF_4x16,
+						  &&POST_OPS_CLIP_4x16,
+						  &&POST_OPS_DOWNSCALE_4x16
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-values.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-values (0-3), handled after the main loop.
+
+	uint8_t cvt_uint8 = 128; // Offset that shifts signed int8 into the unsigned range.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8); // 128 replicated into all 64 byte lanes.
+
+	// Accumulators for C: one zero-initialised vector of 16 int32 lanes per row of the 4x16 tile.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // NOTE(review): assumes packed B keeps 4 consecutive k-values of one column per 32-bit lane - confirm.
+
+		// Broadcast a[0,kr:kr+4]: four consecutive int8 values of row 0 are read as one 32-bit word and replicated to all 16 lanes.
+		__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI: adding 128 to every byte maps [-128,127] onto [0,255], as dpbusd treats this operand as unsigned.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the trailing k0 % 4 values (1-3) that do not fill a complete group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: only the valid A bytes are loaded, the rest read as zero.
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Full 64-byte load; NOTE(review): assumes packed B is padded to a whole group of 4 - confirm.
+
+		// Broadcast a[0, k remainder]: masked load of the valid bytes, then the low 32 bits are replicated to all 16 lanes.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+        // Convert signed int8 to uint8 for VNNI; the masked-out (zero) bytes also become 128 here.
+        a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract the per-column B sums to compensate
+		// for the 128 added to every A element above; done only when is_last_k is set.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; // NOTE(review): subtracted as-is, so presumably already scaled by 128 - confirm where it is filled.
+
+		__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped for alpha == 1, where the multiply would change nothing).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+	}
+
+	// Scale C by beta (skipped when beta == 0); the *_BETA_OP macros doing the work are defined outside this chunk.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] -- NOTE(review): the S8 variant presumably reads the existing C values as int8 from buf_downscale; confirm in the macro.
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S8_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S8_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15] -- S32 variant, taken when buf_downscale is NULL or is_first_k is not TRUE.
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S32_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S32_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops. NOTE(review): the POST_OP_LABEL_* macros presumably jump through post_ops_labels and advance post_ops_list_temp - confirm in their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x16:
+	{
+		selector1 = // 16 int32 values read from op_args1 at offset post_op_c_j, added to every row below.
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x16:
+	{
+		selector1 = _mm512_setzero_epi32(); // Zero vector: each lane below becomes max(0, value).
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // int32 value read through op_args2, broadcast to all lanes.
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // NOTE(review): not used directly here; presumably referenced by name, with selector1/selector2, inside RELU_SCALE_OP_S32_AVX512 - confirm.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Scratch registers handed to the GELU_TANH_S32_AVX512 macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x16:
+	{
+		__m512 x, r, y, x_erf; // Scratch registers handed to the GELU_ERF_S32_AVX512 macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // Lower bound, read through op_args2.
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // Upper bound, read through op_args3.
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x16:
+	{
+		selector1 = // 16 floats from scale_factor at offset post_op_c_j, held bit-for-bit in an integer register.
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15] -- NOTE(review): CVT_MULRND_CVT32 presumably reinterprets selector1 as floats before scaling; confirm in the macro.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: the comparison 0 < 10 holds in every lane.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 ); // NOTE(review): not referenced directly below; presumably consumed by CVT_STORE_S32_S8 - confirm.
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+	}
+	else
+	{
+		// Store the results unconverted: one 512-bit vector per row, row i written at c + rs_c * i.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+	}
+}
+
+// 3x16 int8o32 kernel: fringe kernel for a 3-row x 16-column tile of C (signed-int8 A and B, int32 accumulation, then alpha/beta, post-ops and store).
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x16)
+{
+	static void* post_ops_labels[] = // Addresses of the post-op handler labels below (GNU "labels as values").
+						{
+						  &&POST_OPS_3x16_DISABLE,
+						  &&POST_OPS_BIAS_3x16,
+						  &&POST_OPS_RELU_3x16,
+						  &&POST_OPS_RELU_SCALE_3x16,
+						  &&POST_OPS_GELU_TANH_3x16,
+						  &&POST_OPS_GELU_ERF_3x16,
+						  &&POST_OPS_CLIP_3x16,
+						  &&POST_OPS_DOWNSCALE_3x16
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-values.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-values (0-3), handled after the main loop.
+
+	uint8_t cvt_uint8 = 128; // Offset that shifts signed int8 into the unsigned range.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8); // 128 replicated into all 64 byte lanes.
+
+	// Accumulators for C: one zero-initialised vector of 16 int32 lanes per row of the 3x16 tile.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // NOTE(review): assumes packed B keeps 4 consecutive k-values of one column per 32-bit lane - confirm.
+
+		// Broadcast a[0,kr:kr+4]: four consecutive int8 values of row 0 are read as one 32-bit word and replicated to all 16 lanes.
+		__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI: adding 128 to every byte maps [-128,127] onto [0,255], as dpbusd treats this operand as unsigned.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the trailing k0 % 4 values (1-3) that do not fill a complete group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: only the valid A bytes are loaded, the rest read as zero.
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Full 64-byte load; NOTE(review): assumes packed B is padded to a whole group of 4 - confirm.
+
+		// Broadcast a[0, k remainder]: masked load of the valid bytes, then the low 32 bits are replicated to all 16 lanes.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI; the masked-out (zero) bytes also become 128 here.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract the per-column B sums to compensate
+		// for the 128 added to every A element above; done only when is_last_k is set.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset; // NOTE(review): subtracted as-is, so presumably already scaled by 128 - confirm where it is filled.
+
+		__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped for alpha == 1, where the multiply would change nothing).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+	}
+
+	// Scale C by beta (skipped when beta == 0); the *_BETA_OP macros doing the work are defined outside this chunk.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] -- NOTE(review): the S8 variant presumably reads the existing C values as int8 from buf_downscale; confirm in the macro.
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S8_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15] -- S32 variant, taken when buf_downscale is NULL or is_first_k is not TRUE.
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S32_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops. NOTE(review): the POST_OP_LABEL_* macros presumably jump through post_ops_labels and advance post_ops_list_temp - confirm in their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x16:
+	{
+		selector1 = // 16 int32 values read from op_args1 at offset post_op_c_j, added to every row below.
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x16:
+	{
+		selector1 = _mm512_setzero_epi32(); // Zero vector: each lane below becomes max(0, value).
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // int32 value read through op_args2, broadcast to all lanes.
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // NOTE(review): not used directly here; presumably referenced by name, with selector1/selector2, inside RELU_SCALE_OP_S32_AVX512 - confirm.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Scratch registers handed to the GELU_TANH_S32_AVX512 macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x16:
+	{
+		__m512 x, r, y, x_erf; // Scratch registers handed to the GELU_ERF_S32_AVX512 macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // Lower bound, read through op_args2.
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // Upper bound, read through op_args3.
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x16:
+	{
+		selector1 = // 16 floats from scale_factor at offset post_op_c_j, held bit-for-bit in an integer register.
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15] -- NOTE(review): CVT_MULRND_CVT32 presumably reinterprets selector1 as floats before scaling; confirm in the macro.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: the comparison 0 < 10 holds in every lane.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 ); // NOTE(review): not referenced directly below; presumably consumed by CVT_STORE_S32_S8 - confirm.
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+	}
+	else
+	{
+		// Store the results unconverted: one 512-bit vector per row, row i written at c + rs_c * i.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+	}
+}
+
+// 2x16 fringe kernel: 2 rows of A times 16 columns of B, int8 inputs accumulated in int32, then alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x16)
+{
+	static void* post_ops_labels[] = // Label addresses of the post-op handlers defined further down.
+						{
+						  &&POST_OPS_2x16_DISABLE,
+						  &&POST_OPS_BIAS_2x16,
+						  &&POST_OPS_RELU_2x16,
+						  &&POST_OPS_RELU_SCALE_2x16,
+						  &&POST_OPS_GELU_TANH_2x16,
+						  &&POST_OPS_GELU_ERF_2x16,
+						  &&POST_OPS_CLIP_2x16,
+						  &&POST_OPS_DOWNSCALE_2x16
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k-elements.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements after the full groups.
+
+	uint8_t cvt_uint8 = 128; // Added to every A byte below; compensated via the B column sums on the last k pass.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C (one 16-lane int32 vector per row).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+	}
+	// Handle k remainder (fewer than 4 k-elements left).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set.
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked-load only the leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Masked-load only the leftover bytes of row 1 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[1,0-15] += a[1,k remainder]*b[k remainder,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract B matrix sum column values to compensate
+		// for addition of 128 to A matrix elements (done only on the last k pass).
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] (S8 variant; presumably reads the int8 downscale buffer - TODO confirm in macro)
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: walk the post-op list; the jump macros presumably dispatch through post_ops_labels - TODO confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x16:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15])
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15] = max(0, c[1,0-15])
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // Not passed explicitly; presumably used by name inside RELU_SCALE_OP_S32_AVX512.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Scratch registers handed to the GELU macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x16:
+	{
+		__m512 x, r, y, x_erf; // Scratch registers handed to the GELU macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15] (selector1 holds the raw bits of 16 float scale factors)
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 is true in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+	}
+	else
+	{
+		// Store the int32 results directly to c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+	}
+}
+
+// 1x16 fringe kernel: 1 row of A times 16 columns of B, int8 inputs accumulated in int32, then alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x16)
+{
+	static void* post_ops_labels[] = // Label addresses of the post-op handlers defined further down.
+						{
+						  &&POST_OPS_1x16_DISABLE,
+						  &&POST_OPS_BIAS_1x16,
+						  &&POST_OPS_RELU_1x16,
+						  &&POST_OPS_RELU_SCALE_1x16,
+						  &&POST_OPS_GELU_TANH_1x16,
+						  &&POST_OPS_GELU_ERF_1x16,
+						  &&POST_OPS_CLIP_1x16,
+						  &&POST_OPS_DOWNSCALE_1x16
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k-elements.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements after the full groups.
+
+	uint8_t cvt_uint8 = 128; // Added to every A byte below; compensated via the B column sums on the last k pass.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Register to use for accumulating C (one 16-lane int32 vector).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		__m512i a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+	}
+	// Handle k remainder (fewer than 4 k-elements left).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set.
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked-load only the leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract B matrix sum column values to compensate
+		// for addition of 128 to A matrix elements (done only on the last k pass).
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		__m512i b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] (S8 variant; presumably reads the int8 downscale buffer - TODO confirm in macro)
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: walk the post-op list; the jump macros presumably dispatch through post_ops_labels - TODO confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x16:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15])
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // Not passed explicitly; presumably used by name inside RELU_SCALE_OP_S32_AVX512.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Scratch registers handed to the GELU macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x16:
+	{
+		__m512 x, r, y, x_erf; // Scratch registers handed to the GELU macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15] (selector1 holds the raw bits of 16 float scale factors)
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 is true in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+	}
+	else
+	{
+		// Store the int32 results directly to c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+	}
+}
+
+// 5x32 fringe kernel: 5 rows of A times 32 columns of B, int8 inputs accumulated in int32, then alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x32)
+{
+	static void* post_ops_labels[] = // Label addresses of the post-op handlers defined further down.
+						{
+						  &&POST_OPS_5x32_DISABLE,
+						  &&POST_OPS_BIAS_5x32,
+						  &&POST_OPS_RELU_5x32,
+						  &&POST_OPS_RELU_SCALE_5x32,
+						  &&POST_OPS_GELU_TANH_5x32,
+						  &&POST_OPS_GELU_ERF_5x32,
+						  &&POST_OPS_CLIP_5x32,
+						  &&POST_OPS_DOWNSCALE_5x32
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k-elements.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements after the full groups.
+
+	// B matrix storage (two 512-bit loads per k group, at cs_b * 0 and cs_b * 1).
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage (one broadcast 4-byte group, reused for each row in turn).
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128; // Added to every A byte below; compensated via the B column sums on the last k pass.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C (c_int32_<row>p<column-half>).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+	__m512i c_int32_4p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+	}
+	// Handle k remainder (fewer than 4 k-elements left).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Masked-load only the leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[0,0-31] += a[0,k remainder]*b[k remainder,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Masked-load only the leftover bytes of row 1 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[1,0-31] += a[1,k remainder]*b[k remainder,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Masked-load only the leftover bytes of row 2 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[2,0-31] += a[2,k remainder]*b[k remainder,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Masked-load only the leftover bytes of row 3 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[3,0-31] += a[3,k remainder]*b[k remainder,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+
+		// Masked-load only the leftover bytes of row 4 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte so the signed int8 values become the unsigned operand of dpbusd (VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul on the partial k group.
+		// c[4,0-31] += a[4,k remainder]*b[k remainder,0-31]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract B matrix sum column values to compensate
+		// for addition of 128 to A matrix elements (done only on the last k pass).
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr); // Sums for columns 0-15; b0 is reused as scratch here.
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+		c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 ); // Sums for columns 16-31.
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+		c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+		c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+	}
+
+	// Scale C by beta. NOTE(review): the *_BETA_OP2 macros take no accumulator argument, so they presumably name c_int32_<row>p0/p1 internally - confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S8_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S8_S32_BETA_OP2(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31]
+			S8_S32_BETA_OP2(0,4,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S32_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S32_S32_BETA_OP2(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31]
+			S32_S32_BETA_OP2(0,4,selector1,selector2);
+		}
+	}
+
+	// Post Ops: walk the post-op list; the jump macros presumably dispatch through post_ops_labels - TODO confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		// c[4, 16-31]
+		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15]); same for every register below.
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		// c[4,16-31]
+		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // Not passed explicitly; presumably used by name inside RELU_SCALE_OP_S32_AVX512.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Scratch registers handed to the GELU macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x32:
+	{
+		__m512 x, r, y, x_erf; // Scratch registers handed to the GELU macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15] (selector1/selector2 hold the raw bits of the float scale factors for each column half)
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 is true in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+		// c[4,16-31]
+		CVT_STORE_S32_S8(c_int32_4p1,4,1);
+	}
+	else
+	{
+		// Store the int32 results directly to c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
+	}
+}
+
+// 4x32 int8o32 kernel
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x32_DISABLE,
+						  &&POST_OPS_BIAS_4x32,
+						  &&POST_OPS_RELU_4x32,
+						  &&POST_OPS_RELU_SCALE_4x32,
+						  &&POST_OPS_GELU_TANH_4x32,
+						  &&POST_OPS_GELU_ERF_4x32,
+						  &&POST_OPS_CLIP_4x32,
+						  &&POST_OPS_DOWNSCALE_4x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		//Subtract B matrix sum column values to compensate
+		//for addition of 128 to A matrix elements
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+		c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+		}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S8_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S8_S32_BETA_OP2(0,3,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S32_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S32_S32_BETA_OP2(0,3,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+	}
+}
+
+// 3x32 int8o32 fringe kernel: accumulates a 3-row x 32-column int32 tile of C from int8 A and int8 B, applies alpha/beta and post-ops, then stores.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x32_DISABLE,
+						  &&POST_OPS_BIAS_3x32,
+						  &&POST_OPS_RELU_3x32,
+						  &&POST_OPS_RELU_SCALE_3x32,
+						  &&POST_OPS_GELU_TANH_3x32,
+						  &&POST_OPS_GELU_ERF_3x32,
+						  &&POST_OPS_CLIP_3x32,
+						  &&POST_OPS_DOWNSCALE_3x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: b0 feeds the column 0-15 accumulators, b1 the column 16-31 accumulators.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: 4 consecutive bytes of one A row, broadcast to every 32-bit lane.
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// int32 accumulators for C, named c_int32_<row>p<16-column panel>; start at zero.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast the 4 bytes of A row 0 at k-group kr to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[0,0-31] += a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast the 4 bytes of A row 1 at k-group kr to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[1,0-31] += a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast the 4 bytes of A row 2 at k-group kr to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[2,0-31] += a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+	}
+	// Handle k remainder: the k0 % 4 elements left after the full groups of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Masked-load the k_partial_pieces leftover bytes of A row 0 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte for dpbusd. NOTE(review): zero-filled bytes become 128 too; presumably B is zero-padded along k - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[0,0-31] += a[0,k remainder]*b[k remainder,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Masked-load the k_partial_pieces leftover bytes of A row 1 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[1,0-31] += a[1,k remainder]*b[k remainder,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Masked-load the k_partial_pieces leftover bytes of A row 2 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[2,0-31] += a[2,k remainder]*b[k remainder,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// On the last k iteration, subtract the per-column values read from b_col_sum_vec
+		// to compensate for the 128 added to every A matrix element above.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+	}
+
+	// Broadcast alpha into selector1 and beta into selector2.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale every accumulator by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+	}
+
+	// Scale C by beta (skipped when beta == 0); the *_BETA_OP2 macros are defined outside this view.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S8_S32_BETA_OP2(0,2,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S32_S32_BETA_OP2(0,2,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled block below handles one post-op kind; the jump macros are defined elsewhere (presumably dispatching via post_ops_labels - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15] += values loaded from op_args1 for columns 0-15
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31] += values loaded from op_args1 for columns 16-31
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] )
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15] (relu_cmp_mask, selector1, selector2 are presumably used inside the macro - confirm)
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15] (the float/int temporaries above are scratch passed to the macro)
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15] (the float temporaries above are scratch passed to the macro)
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15] (min is broadcast from op_args2, max from op_args3)
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15] (selector1 holds the raw bits of 16 float scale factors for columns 0-15)
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31] (selector2 holds the raw bits of 16 float scale factors for columns 16-31)
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed by CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+	}
+	else
+	{
+		// Store the int32 accumulators to c, one 16-lane vector per store.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+	}
+}
+
+// 2x32 int8o32 fringe kernel: accumulates a 2-row x 32-column int32 tile of C from int8 A and int8 B, applies alpha/beta and post-ops, then stores.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x32_DISABLE,
+						  &&POST_OPS_BIAS_2x32,
+						  &&POST_OPS_RELU_2x32,
+						  &&POST_OPS_RELU_SCALE_2x32,
+						  &&POST_OPS_GELU_TANH_2x32,
+						  &&POST_OPS_GELU_ERF_2x32,
+						  &&POST_OPS_CLIP_2x32,
+						  &&POST_OPS_DOWNSCALE_2x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: b0 feeds the column 0-15 accumulators, b1 the column 16-31 accumulators.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: 4 consecutive bytes of one A row, broadcast to every 32-bit lane.
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// int32 accumulators for C, named c_int32_<row>p<16-column panel>; start at zero.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast the 4 bytes of A row 0 at k-group kr to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[0,0-31] += a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast the 4 bytes of A row 1 at k-group kr to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[1,0-31] += a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+	}
+	// Handle k remainder: the k0 % 4 elements left after the full groups of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Masked-load the k_partial_pieces leftover bytes of A row 0 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte for dpbusd. NOTE(review): zero-filled bytes become 128 too; presumably B is zero-padded along k - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[0,0-31] += a[0,k remainder]*b[k remainder,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Masked-load the k_partial_pieces leftover bytes of A row 1 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte: maps signed int8 to the unsigned operand dpbusd expects.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Each int32 lane accumulates a 4-element dot product along k.
+		// c[1,0-31] += a[1,k remainder]*b[k remainder,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// On the last k iteration, subtract the per-column values read from b_col_sum_vec
+		// to compensate for the 128 added to every A matrix element above.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+	}
+
+	// Broadcast alpha into selector1 and beta into selector2.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale every accumulator by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+	}
+
+	// Scale C by beta (skipped when beta == 0); the *_BETA_OP2 macros are defined outside this view.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled block below handles one post-op kind; the jump macros are defined elsewhere (presumably dispatching via post_ops_labels - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15] += values loaded from op_args1 for columns 0-15
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31] += values loaded from op_args1 for columns 16-31
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] )
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15] (relu_cmp_mask, selector1, selector2 are presumably used inside the macro - confirm)
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15] (the float/int temporaries above are scratch passed to the macro)
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15] (the float temporaries above are scratch passed to the macro)
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15] (min is broadcast from op_args2, max from op_args3)
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15] (selector1 holds the raw bits of 16 float scale factors for columns 0-15)
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31] (selector2 holds the raw bits of 16 float scale factors for columns 16-31)
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed by CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+	}
+	else
+	{
+		// Store the int32 accumulators to c, one 16-lane vector per store.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+	}
+}
+
+// 1x32 int8o32 fringe kernel: accumulates one row x 32 columns of C (two 16-lane int32 vectors) from int8 A and B using VNNI dpbusd, applies alpha/beta scaling and the post-op chain, then stores as int32 or downscaled int8.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x32_DISABLE,
+						  &&POST_OPS_BIAS_1x32,
+						  &&POST_OPS_RELU_1x32,
+						  &&POST_OPS_RELU_SCALE_1x32,
+						  &&POST_OPS_GELU_TANH_1x32,
+						  &&POST_OPS_GELU_ERF_1x32,
+						  &&POST_OPS_CLIP_1x32,
+						  &&POST_OPS_DOWNSCALE_1x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: two 64-byte vectors per k step (offsets cs_b * 0 and cs_b * 1).
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: 4 consecutive int8 values of A broadcast to every 32-bit lane.
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C: c[0,0-15] and c[0,16-31].
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4]: the 4 int8 values are read as one 32-bit word.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI by adding 128 to every byte (compensated after the k loops).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+	}
+	// Handle k remainder (the k0 % 4 trailing elements).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Masked-load only the k_partial_pieces valid bytes of A row 0 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI; the zeroed pad bytes become 128 too - presumably the matching B bytes are zero padding, TODO confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] += a[0,remaining k]*b[remaining k,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		//Subtract B matrix sum column values to compensate
+		//for addition of 128 to A matrix elements (done only when is_last_k is set)
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16);
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 32 bits of each product are kept).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31] - downscale buffer present and first k block: S8 variant of the beta macro (defined elsewhere).
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31] - otherwise the S32 variant of the beta macro (defined elsewhere).
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the POST_OP_LABEL_* macros (defined elsewhere) presumably dispatch through post_ops_labels to the labels below - verify against the macro definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15] += bias[post_op_c_j + 0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31] += bias[post_op_c_j + 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15])
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31] = max(0, c[0,16-31])
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15] - macro defined elsewhere; presumably uses selector1/selector2/relu_cmp_mask from this scope.
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15] - the locals above are scratch registers handed to the macro.
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15] - the locals above are scratch registers handed to the macro.
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15] - bounds come from op_args2 (min) and op_args3 (max).
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15] - selector1 carries 16 float scale factors loaded as raw bits into an integer register.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed by CVT_STORE_S32_S8 - confirm in the macro.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+	}
+}
+
+// 5x48 int8o32 fringe kernel: accumulates a 5 row x 48 column tile of C (15 vectors of 16 int32) from int8 A and B using VNNI dpbusd, applies alpha/beta scaling and the post-op chain, then stores as int32 or downscaled int8.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_5x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5x48_DISABLE,
+						  &&POST_OPS_BIAS_5x48,
+						  &&POST_OPS_RELU_5x48,
+						  &&POST_OPS_RELU_SCALE_5x48,
+						  &&POST_OPS_GELU_TANH_5x48,
+						  &&POST_OPS_GELU_ERF_5x48,
+						  &&POST_OPS_CLIP_5x48,
+						  &&POST_OPS_DOWNSCALE_5x48
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: three 64-byte vectors per k step (offsets cs_b * 0, 1 and 2).
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage: a single register, reloaded for each of the 5 rows in turn.
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C, named c_int32_<row>p<16-column block>.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+	__m512i c_int32_4p1 = _mm512_setzero_epi32();
+	__m512i c_int32_4p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4]: the 4 int8 values are read as one 32-bit word.
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI by adding 128 to every byte (compensated after the k loops).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+	}
+	// Handle k remainder (the k0 % 4 trailing elements).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked-load only the k_partial_pieces valid bytes of A row 0 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI; the zeroed pad bytes become 128 too - presumably the matching B bytes are zero padding, TODO confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] += a[0,remaining k]*b[remaining k,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Same masked load + broadcast for A row 1.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] += a[1,remaining k]*b[remaining k,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Same masked load + broadcast for A row 2.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] += a[2,remaining k]*b[remaining k,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Same masked load + broadcast for A row 3.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-47] += a[3,remaining k]*b[remaining k,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+		// Same masked load + broadcast for A row 4.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		//convert signed int8 to uint8 for VNNI
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-47] += a[4,remaining k]*b[remaining k,0-47]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		//Subtract B matrix sum column values to compensate
+		//for addition of 128 to A matrix elements (done only when is_last_k is set)
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+		c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+		c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+		c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+		c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+		c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 );
+		c_int32_4p2 = _mm512_sub_epi32( c_int32_4p2 , b0 );
+	}
+
+	// Broadcast alpha and beta to all 16 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 32 bits of each product are kept).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+		c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+	}
+
+	// Scale C by beta (skipped entirely when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47] - downscale buffer present and first k block: S8 variant of the beta macro (defined elsewhere).
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,4,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47] - otherwise the S32 variant of the beta macro (defined elsewhere).
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,4,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the POST_OP_LABEL_* macros (defined elsewhere) presumably dispatch through post_ops_labels to the labels below - verify against the macro definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47] - a_int32_0 is reused here as scratch holding the third bias vector.
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		// c[4, 16-31]
+		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15]); the same pattern follows for every accumulator.
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		// c[4,16-31]
+		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15] - macro defined elsewhere; presumably uses selector1/selector2/relu_cmp_mask from this scope.
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+		// c[4, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15] - the locals above are scratch registers handed to the macro.
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15] - the locals above are scratch registers handed to the macro.
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+		// c[4, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15] - bounds come from op_args2 (min) and op_args3 (max).
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+		// c[4, 32-47]
+		CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15] - selector1 carries 16 float scale factors loaded as raw bits into an integer register.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47] - a_int32_0 is reused here as scratch holding the third scale-factor vector.
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[4, 32-47]
+		CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed by CVT_STORE_S32_S8 - confirm in the macro.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+		// c[4,16-31]
+		CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+		// c[4,32-47]
+		CVT_STORE_S32_S8(c_int32_4p2,4,2);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
+
+		// c[4,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 );
+	}
+}
+
+// 4x48 fringe kernel (int8 A, int8 B, int32 accumulation): computes a 4-row by 48-column tile of C, runs the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_4x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x48_DISABLE,
+						  &&POST_OPS_BIAS_4x48,
+						  &&POST_OPS_RELU_4x48,
+						  &&POST_OPS_RELU_SCALE_4x48,
+						  &&POST_OPS_GELU_TANH_4x48,
+						  &&POST_OPS_GELU_ERF_4x48,
+						  &&POST_OPS_CLIP_4x48,
+						  &&POST_OPS_DOWNSCALE_4x48
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: three 64-byte loads per k step, one for each 16-column panel of the tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage: 4 consecutive bytes of one A row, broadcast to every 32-bit lane (also reused as scratch by the BIAS and DOWNSCALE post-ops).
+	__m512i a_int32_0;
+	// 128 is added to every A byte below so that A can be passed as the unsigned operand of dpbusd.
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C: c_int32_<row>p<panel>, 16 int32 lanes each.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+	}
+	// Handle k remainder: the last k0 % 4 values of each A row, which do not fill a whole group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+		// NOTE(review): A bytes zeroed by the mask become 128 after the +128 step; presumably packed B is zero there - confirm.
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 0 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 1 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[1,0-47] += a[1,k remainder]*b[k remainder,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 2 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[2,0-47] += a[2,k remainder]*b[k remainder,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 3 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[3,0-47] += a[3,k remainder]*b[k remainder,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Only on the last k block: subtract the values read from b_col_sum_vec
+		// to compensate for the addition of 128 to the A matrix elements.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+		c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+		c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+		c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+		c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 );
+		}
+
+	// Broadcast alpha and beta to all lanes (selector1/selector2 are reused as scratch registers further down).
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+	}
+
+	// Scale C by beta (macros defined elsewhere; the S8 variant presumably reads int8 data from buf_downscale - confirm).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,3,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,3,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the POST_OP_LABEL_* macros (defined elsewhere) presumably jump through post_ops_labels - keep the table order in sync.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		// a_int32_0 is reused here as a third scratch register (bias for columns 32-47).
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		// The float scale factors are held as raw 512-bit integer vectors; a_int32_0 again serves as scratch.
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+	}
+}
+
+// 3x48 fringe kernel (int8 A, int8 B, int32 accumulation): computes a 3-row by 48-column tile of C, runs the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_3x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x48_DISABLE,
+						  &&POST_OPS_BIAS_3x48,
+						  &&POST_OPS_RELU_3x48,
+						  &&POST_OPS_RELU_SCALE_3x48,
+						  &&POST_OPS_GELU_TANH_3x48,
+						  &&POST_OPS_GELU_ERF_3x48,
+						  &&POST_OPS_CLIP_3x48,
+						  &&POST_OPS_DOWNSCALE_3x48
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: three 64-byte loads per k step, one for each 16-column panel of the tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage: 4 consecutive bytes of one A row, broadcast to every 32-bit lane (also reused as scratch by the BIAS and DOWNSCALE post-ops).
+	__m512i a_int32_0;
+	// 128 is added to every A byte below so that A can be passed as the unsigned operand of dpbusd.
+	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C: c_int32_<row>p<panel>, 16 int32 lanes each.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+	}
+	// Handle k remainder: the last k0 % 4 values of each A row, which do not fill a whole group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+		// NOTE(review): A bytes zeroed by the mask become 128 after the +128 step; presumably packed B is zero there - confirm.
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 0 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 1 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[1,0-47] += a[1,k remainder]*b[k remainder,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Masked load of the k_partial_pieces remaining bytes of row 2 (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Convert signed int8 to uint8 for VNNI (byte-wise +128).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul over the partial k group.
+		// c[2,0-47] += a[2,k remainder]*b[k remainder,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+	}
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Only on the last k block: subtract the values read from b_col_sum_vec
+		// to compensate for the addition of 128 to the A matrix elements.
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+		c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+		c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+		c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+	}
+
+	// Broadcast alpha and beta to all lanes (selector1/selector2 are reused as scratch registers further down).
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+	}
+
+	// Scale C by beta (macros defined elsewhere; the S8 variant presumably reads int8 data from buf_downscale - confirm).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,2,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,2,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the POST_OP_LABEL_* macros (defined elsewhere) presumably jump through post_ops_labels - keep the table order in sync.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		// a_int32_0 is reused here as a third scratch register (bias for columns 32-47).
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		// The float scale factors are held as raw 512-bit integer vectors; a_int32_0 again serves as scratch.
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+	}
+}
+
+// 2x48 int8o32 kernel: 2 rows x 48 columns of C, kept in 2 x 3 ZMM accumulators of 16 int32 lanes each.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_2x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x48_DISABLE,
+						  &&POST_OPS_BIAS_2x48,
+						  &&POST_OPS_RELU_2x48,
+						  &&POST_OPS_RELU_SCALE_2x48,
+						  &&POST_OPS_GELU_TANH_2x48,
+						  &&POST_OPS_GELU_ERF_2x48,
+						  &&POST_OPS_CLIP_2x48,
+						  &&POST_OPS_DOWNSCALE_2x48
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-elements.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements (0-3), handled after the main loop.
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	uint8_t cvt_uint8 = 128; // Offset added to every byte of A before each dpbusd below.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: shifts signed int8 A into uint8 range, as VNNI dpbusd takes unsigned A bytes.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte of row 1 as well (signed int8 -> uint8 for VNNI).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked load of the k_partial_pieces leftover bytes of a[0,:] (other bytes zeroed), then broadcast.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 as above; zero-filled bytes become 128 too. NOTE(review): presumably packed B is zero there - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Masked load of the k_partial_pieces leftover bytes of a[1,:] (other bytes zeroed), then broadcast.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 to each byte of row 1 as well (same caveat as for row 0).
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+	}
+
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract B matrix sum column values to compensate
+		// for addition of 128 to A matrix elements (only when is_last_k is set).
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+		c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+		c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+		c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+	}
+
+	// Broadcast alpha and beta into all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+	}
+
+	// Scale C by beta (skipped when beta is 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+		}
+	}
+
+	// Post Ops: post_ops_labels above holds one label address per post-op section below.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47] (a_int32_0 is reused here as the third bias vector)
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47] (a_int32_0 is reused here as the third scale-factor vector)
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 is true in each of the 16 lanes.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+	}
+	else
+	{
+		// Store the int32 results directly into c (unaligned stores).
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+	}
+}
+
+// 1x48 int8o32 kernel: 1 row x 48 columns of C, kept in 3 ZMM accumulators of 16 int32 lanes each.
+LPGEMM_MN_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_1x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x48_DISABLE,
+						  &&POST_OPS_BIAS_1x48,
+						  &&POST_OPS_RELU_1x48,
+						  &&POST_OPS_RELU_SCALE_1x48,
+						  &&POST_OPS_GELU_TANH_1x48,
+						  &&POST_OPS_GELU_ERF_1x48,
+						  &&POST_OPS_CLIP_1x48,
+						  &&POST_OPS_DOWNSCALE_1x48
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 k-elements.
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements (0-3), handled after the main loop.
+
+	uint8_t cvt_uint8 = 128; // Offset added to every byte of A before each dpbusd below.
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Add 128 to each byte: shifts signed int8 A into uint8 range, as VNNI dpbusd takes unsigned A bytes.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked load of the k_partial_pieces leftover bytes of a[0,:] (other bytes zeroed), then broadcast.
+		a_kfringe_buf = _mm_maskz_loadu_epi8( load_mask, ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ) );
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Add 128 as above; zero-filled bytes become 128 too. NOTE(review): presumably packed B is zero there - confirm.
+		a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+	}
+
+	if ( post_ops_attr.is_last_k == 1 )
+	{
+		// Subtract B matrix sum column values to compensate
+		// for addition of 128 to A matrix elements (only when is_last_k is set).
+
+		int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+		b0 = _mm512_loadu_si512( bsumptr);
+
+		c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 16);
+
+		c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+
+		b0 = _mm512_loadu_si512( bsumptr + 32);
+
+		c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+	}
+
+	// Broadcast alpha and beta into all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+	}
+
+	// Scale C by beta (skipped when beta is 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: post_ops_labels above holds one label address per post-op section below.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47] (a_int32_0 is reused here as the third bias vector)
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47] (a_int32_0 is reused here as the third scale-factor vector)
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 is true in each of the 16 lanes.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+	}
+	else
+	{
+		// Store the int32 results directly into c (unaligned stores).
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c
new file mode 100644
index 0000000000..b88ef512d6
--- /dev/null
+++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_n_fringe_s8_amd512vnni.c
@@ -0,0 +1,3100 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "../u8s8s32/lpgemm_s32_kern_macros.h"
+#include "../u8s8s32/lpgemm_s32_memcpy_macros.h"
+
+// 6xlt16 int8o32 fringe kernel
+LPGEMM_N_LT_NR0_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6xLT16_DISABLE,
+						  &&POST_OPS_BIAS_6xLT16,
+						  &&POST_OPS_RELU_6xLT16,
+						  &&POST_OPS_RELU_SCALE_6xLT16,
+						  &&POST_OPS_GELU_TANH_6xLT16,
+						  &&POST_OPS_GELU_ERF_6xLT16,
+						  &&POST_OPS_CLIP_6xLT16,
+						  &&POST_OPS_DOWNSCALE_6xLT16
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 16 extended elements each from B to 1 ZMM
+			// registers. It is to be noted that the B matrix is packed for use
+			// in vnni instructions and each load to ZMM register will have 4
+			// elements along k direction and 16 elements across n directions,
+			// so 4x16 elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+			c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+			c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_4p0, 4, 0, \
+								selector1, selector2 );
+
+				// c[5,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_5p0, 5, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, ir, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, ir, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, ir, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, ir, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_4p0, ir, 4, 0, \
+								selector1, selector2);
+
+				// c[5,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_5p0, ir, 5, 0, \
+								selector1, selector2);
+			}
+		}
+
+        // Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6xLT16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			// c[4, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1);
+
+			// c[5, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_6xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 0 ) ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 1 ) ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 2 ) ), load_mask, c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 3 ) ), load_mask, c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 4 ) ), load_mask, c_int32_4p0 );
+
+			// c[5,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 5 ) ), load_mask, c_int32_5p0 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_s8s8s32os32_5xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_s8s8s32os32_4xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_s8s8s32os32_3xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_s8s8s32os32_2xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_s8s8s32os32_1xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+
+// 6x16 s8s8s32os32 (int8 x int8 -> int32) fringe kernel: MR = 6 rows x 16 columns per tile; leftover rows go to the 5x16..1x16 kernels.
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x16_DISABLE,
+						  &&POST_OPS_BIAS_6x16,
+						  &&POST_OPS_RELU_6x16,
+						  &&POST_OPS_RELU_SCALE_6x16,
+						  &&POST_OPS_GELU_TANH_6x16,
+						  &&POST_OPS_GELU_ERF_6x16,
+						  &&POST_OPS_CLIP_6x16,
+						  &&POST_OPS_DOWNSCALE_6x16
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage (one ZMM: 4 k-elements x 16 columns).
+	__m512i b0;
+
+	// A matrix storage (4 bytes of one A row broadcast to every lane).
+	__m512i a_int32_0;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Accumulators for the 6x16 int32 tile of C, one ZMM per row.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 16 elements each from B to 1 ZMM register. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+		}
+		// Handle k remainder (k0 % 4 trailing elements, 1 to 3 bytes per A row).
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Masked load of the k remainder of a[0,:] (masked-off bytes are zeroed), then broadcast.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul over the k remainder.
+			// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Masked load of the k remainder of a[1,:] (masked-off bytes are zeroed), then broadcast.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul over the k remainder.
+			// c[1,0-15] += a[1,k remainder]*b[k remainder,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Masked load of the k remainder of a[2,:] (masked-off bytes are zeroed), then broadcast.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul over the k remainder.
+			// c[2,0-15] += a[2,k remainder]*b[k remainder,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Masked load of the k remainder of a[3,:] (masked-off bytes are zeroed), then broadcast.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul over the k remainder.
+			// c[3,0-15] += a[3,k remainder]*b[k remainder,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Masked load of the k remainder of a[4,:] (masked-off bytes are zeroed), then broadcast.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul over the k remainder.
+			// c[4,0-15] += a[4,k remainder]*b[k remainder,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+
+			// Masked load of the k remainder of a[5,:] (masked-off bytes are zeroed), then broadcast.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Add 128 to every int8 of A: dpbusd reads A as unsigned bytes.
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul over the k remainder.
+			// c[5,0-15] += a[5,k remainder]*b[k remainder,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			// Subtract the B column-sum vector from every row to compensate
+			// for the 128 added to each A element (presumably holds 128 * column sums; confirm at the producer).
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+			c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+			c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 );
+		}
+
+		// Broadcast alpha and beta to all 16 int32 lanes.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+		}
+
+		// Scale C by beta. The S8 variant is used when a downscale buffer is set and this is the first k block; otherwise the S32 variant.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15]
+				S8_S32_BETA_OP(c_int32_0p0,ir,0,0,selector1,selector2);
+
+				// c[1:0-15]
+				S8_S32_BETA_OP(c_int32_1p0,ir,1,0,selector1,selector2);
+
+				// c[2:0-15]
+				S8_S32_BETA_OP(c_int32_2p0,ir,2,0,selector1,selector2);
+
+				// c[3:0-15]
+				S8_S32_BETA_OP(c_int32_3p0,ir,3,0,selector1,selector2);
+
+				// c[4:0-15]
+				S8_S32_BETA_OP(c_int32_4p0,ir,4,0,selector1,selector2);
+
+				// c[5:0-15]
+				S8_S32_BETA_OP(c_int32_5p0,ir,5,0,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15]
+				S32_S32_BETA_OP(c_int32_0p0,ir,0,0,selector1,selector2);
+
+				// c[1:0-15]
+				S32_S32_BETA_OP(c_int32_1p0,ir,1,0,selector1,selector2);
+
+				// c[2:0-15]
+				S32_S32_BETA_OP(c_int32_2p0,ir,2,0,selector1,selector2);
+
+				// c[3:0-15]
+				S32_S32_BETA_OP(c_int32_3p0,ir,3,0,selector1,selector2);
+
+				// c[4:0-15]
+				S32_S32_BETA_OP(c_int32_4p0,ir,4,0,selector1,selector2);
+
+				// c[5:0-15]
+				S32_S32_BETA_OP(c_int32_5p0,ir,5,0,selector1,selector2);
+			}
+		}
+
+		// Post Ops. NOTE(review): the jump macros presumably dispatch through post_ops_labels and walk post_ops_list_temp; confirm in their definition.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x16:
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x16_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+		}
+		else
+		{
+			// Store the int32 results directly into C.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_s8s8s32os32_5x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_s8s8s32os32_4x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_s8s8s32os32_3x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_s8s8s32os32_2x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_s8s8s32os32_1x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+
+// 6x32 int8o32 fringe kernel
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x32_DISABLE,
+						  &&POST_OPS_BIAS_6x32,
+						  &&POST_OPS_RELU_6x32,
+						  &&POST_OPS_RELU_SCALE_6x32,
+						  &&POST_OPS_GELU_TANH_6x32,
+						  &&POST_OPS_GELU_ERF_6x32,
+						  &&POST_OPS_CLIP_6x32,
+						  &&POST_OPS_DOWNSCALE_6x32
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+		__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+		__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+		__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+		__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+		__m512i c_int32_4p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+		__m512i c_int32_5p1 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 32 elements each from B to 2 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( int32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+			c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+			c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 );
+
+			b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+			c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+			c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+			c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+			c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+			c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 );
+			c_int32_5p1 = _mm512_sub_epi32( c_int32_5p1 , b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31]
+				S8_S32_BETA_OP2(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31]
+				S8_S32_BETA_OP2(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31]
+				S8_S32_BETA_OP2(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31]
+				S8_S32_BETA_OP2(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31]
+				S8_S32_BETA_OP2(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31]
+				S8_S32_BETA_OP2(ir,5,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31]
+				S32_S32_BETA_OP2(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31]
+				S32_S32_BETA_OP2(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31]
+				S32_S32_BETA_OP2(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31]
+				S32_S32_BETA_OP2(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31]
+				S32_S32_BETA_OP2(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31]
+				S32_S32_BETA_OP2(ir,5,selector1,selector2);
+			}
+		}
+
+        // Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x32:
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[1, 16-31]
+			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[2, 16-31]
+			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[3, 16-31]
+			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[4, 16-31]
+			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[5, 16-31]
+			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x32:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[1,16-31]
+			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[2,16-31]
+			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[3,16-31]
+			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[4,16-31]
+			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[5,16-31]
+			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x32:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x32:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_5p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x32:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_5p1, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x32:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		// c[5, 16-31]
+		CVT_MULRND_CVT32(c_int32_5p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x32_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+		}
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_s8s8s32os32_5x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_s8s8s32os32_4x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_s8s8s32os32_3x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_s8s8s32os32_2x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_s8s8s32os32_1x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x48 s8s8s32os32 n-fringe kernel: int8 A, int8 B, int32 C (MR = 6, 48 columns of C).
+LPGEMM_N_FRINGE_KERN(int8_t,int8_t,int32_t,s8s8s32os32_6x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x48_DISABLE,
+						  &&POST_OPS_BIAS_6x48,
+						  &&POST_OPS_RELU_6x48,
+						  &&POST_OPS_RELU_SCALE_6x48,
+						  &&POST_OPS_GELU_TANH_6x48,
+						  &&POST_OPS_GELU_ERF_6x48,
+						  &&POST_OPS_CLIP_6x48,
+						  &&POST_OPS_DOWNSCALE_6x48
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+    	uint8_t cvt_uint8 = 128;
+	__m512i vec_uint8 = _mm512_set1_epi8 (cvt_uint8);
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+		__m512i c_int32_0p1 = _mm512_setzero_epi32();
+		__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+		__m512i c_int32_1p1 = _mm512_setzero_epi32();
+		__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+		__m512i c_int32_2p1 = _mm512_setzero_epi32();
+		__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+		__m512i c_int32_3p1 = _mm512_setzero_epi32();
+		__m512i c_int32_3p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+		__m512i c_int32_4p1 = _mm512_setzero_epi32();
+		__m512i c_int32_4p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+		__m512i c_int32_5p1 = _mm512_setzero_epi32();
+		__m512i c_int32_5p2 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 48 elements each from B to 3 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-47] = a[5,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
+			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_0, b2 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+            		//convert signed int8 to uint8 for VNNI
+			a_int32_0 = _mm512_add_epi8( a_int32_0, vec_uint8 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-47] = a[5,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
+			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_0, b2 );
+		}
+
+		if ( post_ops_attr.is_last_k == 1 )
+		{
+			//Subtract B matrix sum column values to compensate
+			//for addition of 128 to A matrix elements
+
+			int32_t* bsumptr = post_ops_attr.b_col_sum_vec + post_ops_attr.b_sum_offset;
+
+			b0 = _mm512_loadu_si512( bsumptr );
+
+			c_int32_0p0 = _mm512_sub_epi32( c_int32_0p0 , b0 );
+			c_int32_1p0 = _mm512_sub_epi32( c_int32_1p0 , b0 );
+			c_int32_2p0 = _mm512_sub_epi32( c_int32_2p0 , b0 );
+			c_int32_3p0 = _mm512_sub_epi32( c_int32_3p0 , b0 );
+			c_int32_4p0 = _mm512_sub_epi32( c_int32_4p0 , b0 );
+			c_int32_5p0 = _mm512_sub_epi32( c_int32_5p0 , b0 );
+
+			b0 = _mm512_loadu_si512( bsumptr + 16 );
+
+			c_int32_0p1 = _mm512_sub_epi32( c_int32_0p1 , b0 );
+			c_int32_1p1 = _mm512_sub_epi32( c_int32_1p1 , b0 );
+			c_int32_2p1 = _mm512_sub_epi32( c_int32_2p1 , b0 );
+			c_int32_3p1 = _mm512_sub_epi32( c_int32_3p1 , b0 );
+			c_int32_4p1 = _mm512_sub_epi32( c_int32_4p1 , b0 );
+			c_int32_5p1 = _mm512_sub_epi32( c_int32_5p1 , b0 );
+
+			b0 = _mm512_loadu_si512( bsumptr + 32 );
+
+			c_int32_0p2 = _mm512_sub_epi32( c_int32_0p2 , b0 );
+			c_int32_1p2 = _mm512_sub_epi32( c_int32_1p2 , b0 );
+			c_int32_2p2 = _mm512_sub_epi32( c_int32_2p2 , b0 );
+			c_int32_3p2 = _mm512_sub_epi32( c_int32_3p2 , b0 );
+			c_int32_4p2 = _mm512_sub_epi32( c_int32_4p2 , b0 );
+			c_int32_5p2 = _mm512_sub_epi32( c_int32_5p2 , b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+			c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+			c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+			c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+			c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+			c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+			c_int32_5p2 = _mm512_mullo_epi32( selector1, c_int32_5p2 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,5,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,5,selector1,selector2);
+			}
+		}
+
+        // Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x48:
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			a_int32_0 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+			// c[0,32-47]
+			c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[1, 16-31]
+			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+			// c[1,32-47]
+			c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[2, 16-31]
+			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+			// c[2,32-47]
+			c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[3, 16-31]
+			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+			// c[3,32-47]
+			c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[4, 16-31]
+			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+			// c[4,32-47]
+			c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[5, 16-31]
+			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
+
+			// c[5,32-47]
+			c_int32_5p2 = _mm512_add_epi32( a_int32_0, c_int32_5p2 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x48:
+		{
+			//printf("relu\n");
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+			// c[0,32-47]
+			c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[1,16-31]
+			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+			// c[1,32-47]
+			c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[2,16-31]
+			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+			// c[2,32-47]
+			c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[3,16-31]
+			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+			// c[3,32-47]
+			c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[4,16-31]
+			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+			// c[4,32-47]
+			c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[5,16-31]
+			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
+
+			// c[5,32-47]
+			c_int32_5p2 = _mm512_max_epi32( selector1, c_int32_5p2 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x48:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+			// c[0, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+			// c[1, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+			// c[2, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+			// c[3, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+			// c[4, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
+
+			// c[5, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p2)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x48:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_5p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_5p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x48:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+			// c[0, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+			// c[1, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+			// c[2, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+			// c[3, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+			// c[4, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_5p1, y, r, x, x_erf)
+
+			// c[5, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_5p2, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x48:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[0, 32-47]
+			CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[1, 32-47]
+			CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[2, 32-47]
+			CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[3, 32-47]
+			CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[4, 32-47]
+			CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			// c[5, 32-47]
+			CLIP_S32_AVX512(c_int32_5p2, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[4, 32-47]
+		CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		// c[5, 16-31]
+		CVT_MULRND_CVT32(c_int32_5p1,selector2);
+
+		// c[5, 32-47]
+		CVT_MULRND_CVT32(c_int32_5p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x48_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+			// c[0,32-47]
+			CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+			// c[1,32-47]
+			CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+			// c[2,32-47]
+			CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+			// c[3,32-47]
+			CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+			// c[4,32-47]
+			CVT_STORE_S32_S8(c_int32_4p2,4,2);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+
+			// c[5,32-47]
+			CVT_STORE_S32_S8(c_int32_5p2,5,2);
+		}
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[0,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[1,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+
+			// c[2,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[3,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[4,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+
+			// c[5,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_s8s8s32os32_5x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_s8s8s32os32_4x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_s8s8s32os32_3x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_s8s8s32os32_2x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_s8s8s32os32_1x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+#endif
diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c
new file mode 100644
index 0000000000..cb663f7425
--- /dev/null
+++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packa_s8_amd512vnni.c
@@ -0,0 +1,534 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#define MR 6 // Rows of A packed per full panel; the ic loop in packa_k64_s8s8s32os32 steps by MR.
+#define NR 64 // Used in this file as the k step: every kr loop advances by NR and loads 64 int8 values per row.
+
+void packa_m5_k64_s8s8s32os32 // Forward declaration; called by packa_k64_s8s8s32os32 when MC % MR == 5.
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     );
+
+void packa_m4_k64_s8s8s32os32 // Forward declaration; called by packa_k64_s8s8s32os32 when MC % MR == 4.
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     );
+
+void packa_m3_k64_s8s8s32os32 // Forward declaration; called by packa_k64_s8s8s32os32 when MC % MR == 3.
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     );
+
+void packa_m2_k64_s8s8s32os32 // Forward declaration; called by packa_k64_s8s8s32os32 when MC % MR == 2.
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     );
+
+void packa_m1_k64_s8s8s32os32 // Forward declaration; called by packa_k64_s8s8s32os32 when MC % MR == 1.
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     );
+
+// Packs MC rows of the int8 matrix a into MR-row panels, 64 k elements per step, and reports the packed strides. TODO: add k fringe handling down to k=4 (k%4 == 0) and pad k so that k'%4 == 0 when k%4 != 0 originally.
+void packa_k64_s8s8s32os32
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32, // [out] destination buffer for the packed panels
+       const int8_t* a, // [in] source matrix; row r is read starting at a + lda * r
+       const dim_t    lda, // element distance between consecutive rows of a
+       const dim_t    MC, // number of rows to pack; MC % MR leftover rows go to a packa_mN_k64 fringe routine
+       const dim_t    KC, // k elements per row. NOTE(review): the kr loops always process full 64-element steps, so a KC that is not a multiple of 64 appears unhandled (see TODOs)
+       dim_t*         rs_a, // [out] always set to 4
+       dim_t*         cs_a // [out] always set to 24
+     )
+{
+	// Index vectors used to permute the __m512i elements into the order consumed by the vpdpbusd instruction.
+	// These are indexes of the format a0-a1-b0-b1-a2-a3-b2-b3 and a0-a1-a2-a3-b0-b1-b2-b3.
+	// Adding 4 int32 wise gives format a4-a5-b4-b5-a6-a7-b6-b7 and a4-a5-a6-a7-b4-b5-b6-b7.
+	__m512i selector1 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB );
+	__m512i selector1_1 = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF );
+	__m512i selector2 = _mm512_setr_epi64( 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xA, 0xB );
+	__m512i selector2_1 = _mm512_setr_epi64( 0x4, 0x5, 0x6, 0x7, 0xC, 0xD, 0xE, 0xF );
+
+	// First half: 64-bit indices 0x8-0xB pull the rows 5/6 pairs (second permute source) in after every two 64-bit elements of rows 1-4.
+	__m512i selector3 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x2, 0x3, 0x9, 0x4, 0x5 ); // 64 elems
+	__m512i selector4 = _mm512_setr_epi64( 0x8, 0x6, 0x7, 0x9, 0x0, 0x0, 0x0, 0x0 ); // 32 elems; the upper four indices are filler
+	__m512i selector5 = _mm512_setr_epi64( 0x0, 0x1, 0xA, 0x2, 0x3, 0xB, 0x4, 0x5 ); // 64 elems
+	__m512i selector6 = _mm512_setr_epi64( 0xA, 0x6, 0x7, 0xB, 0x0, 0x0, 0x0, 0x0 ); // 32 elems; the upper four indices are filler
+
+	// Second half: same pattern, taking 64-bit elements 0xC-0xF of the rows 5/6 vectors.
+	__m512i selector7 = _mm512_setr_epi64( 0x0, 0x1, 0xC, 0x2, 0x3, 0xD, 0x4, 0x5 ); // 64 elems
+	__m512i selector8 = _mm512_setr_epi64( 0xC, 0x6, 0x7, 0xD, 0x0, 0x0, 0x0, 0x0 ); // 32 elems; the upper four indices are filler
+	__m512i selector9 = _mm512_setr_epi64( 0x0, 0x1, 0xE, 0x2, 0x3, 0xF, 0x4, 0x5 ); // 64 elems
+	__m512i selector10 = _mm512_setr_epi64( 0xE, 0x6, 0x7, 0xF, 0x0, 0x0, 0x0, 0x0 ); // 32 elems; the upper four indices are filler
+
+	dim_t m_full_pieces = MC / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR; // number of rows covered by full MR-row panels
+	dim_t m_partial_pieces = MC % MR; // leftover rows (0..5) handled after the main loop
+
+	__m512i a0;
+	__m512i b0;
+	__m512i c0;
+	__m512i d0;
+	__m512i e0;
+	__m512i f0;
+	__m512i a01;
+	__m512i c01;
+	__m512i e01;
+	__m256i last_piece;
+
+	for ( dim_t ic = 0; ic < m_full_pieces_loop_limit; ic += MR )
+	{
+		for ( dim_t kr = 0; kr < KC; kr += NR )
+		{
+			// Rearrange for vpdpbusd, read 6 rows from A with 64 elements in each row.
+			a0 = _mm512_loadu_si512( a + ( lda * ( ic + 0 ) ) + kr );
+			b0 = _mm512_loadu_si512( a + ( lda * ( ic + 1 ) ) + kr );
+			c0 = _mm512_loadu_si512( a + ( lda * ( ic + 2 ) ) + kr );
+			d0 = _mm512_loadu_si512( a + ( lda * ( ic + 3 ) ) + kr );
+			e0 = _mm512_loadu_si512( a + ( lda * ( ic + 4 ) ) + kr );
+			f0 = _mm512_loadu_si512( a + ( lda * ( ic + 5 ) ) + kr );
+
+			a01 = _mm512_unpacklo_epi32( a0, b0 ); // interleave int32 elements of rows 1 and 2 (low half of each 128-bit lane)
+			a0 = _mm512_unpackhi_epi32( a0, b0 ); // same for the high half of each 128-bit lane
+
+			c01 = _mm512_unpacklo_epi32( c0, d0 ); // rows 3 and 4, low halves
+			c0 = _mm512_unpackhi_epi32( c0, d0 ); // rows 3 and 4, high halves
+
+			e01 = _mm512_unpacklo_epi32( e0, f0 ); // Elem 4
+			e0 = _mm512_unpackhi_epi32( e0, f0 ); // Elem 5
+
+			b0 = _mm512_unpacklo_epi64( a01, c01 ); // combine the row 1/2 pairs with the row 3/4 pairs
+			a01 = _mm512_unpackhi_epi64( a01, c01 );
+
+			d0 = _mm512_unpacklo_epi64( a0, c0 );
+			c01 = _mm512_unpackhi_epi64( a0, c0 );
+
+			a0 = _mm512_permutex2var_epi64( b0, selector1, a01 );
+			c0 = _mm512_permutex2var_epi64( d0, selector1, c01 );
+			b0 = _mm512_permutex2var_epi64( b0, selector1_1, a01 );
+			d0 = _mm512_permutex2var_epi64( d0, selector1_1, c01 );
+
+			a01 = _mm512_permutex2var_epi64( a0, selector2, c0 ); // a[0]
+			c01 = _mm512_permutex2var_epi64( b0, selector2, d0 ); // a[2]
+			a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // a[1]
+			c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // a[3]
+
+			// First half: splice the rows 5/6 data (e01, e0) into a[0] and a[1].
+			b0 = _mm512_permutex2var_epi64( a01, selector3, e01 ); // 1st 64
+			a01 = _mm512_permutex2var_epi64( a01, selector4, e0 ); // 1st 32
+			d0 = _mm512_permutex2var_epi64( a0, selector5, e01 ); // 2nd 64
+			a0 = _mm512_permutex2var_epi64( a0, selector6, e0 ); // 2nd 32
+
+			_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 );
+			_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 ); // only the low 32 bytes are meaningful; the rest is overwritten by the store at +96
+			_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 );
+			// Last piece: only the low 256 bits (32 bytes) are written, ending the first 192 bytes of this step.
+			last_piece = _mm512_castsi512_si256( a0 );
+			_mm256_mask_storeu_epi64
+			(
+			  pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ),
+			  0xFF,
+			  last_piece
+			);
+
+			// Second half: splice the rows 5/6 data (e01, e0) into a[2] and a[3].
+			b0 = _mm512_permutex2var_epi64( c01, selector7, e01 ); // 3rd 64
+			c01 = _mm512_permutex2var_epi64( c01, selector8, e0 ); // 3rd 32
+			d0 = _mm512_permutex2var_epi64( c0, selector9, e01 ); // 4th 64
+			c0 = _mm512_permutex2var_epi64( c0, selector10, e0 ); // 4th 32
+
+			_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 );
+			_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 ); // only the low 32 bytes are meaningful; the rest is overwritten by the store at +288
+			_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 );
+			// Last piece: low 256 bits only, ending the 384 (= MR * 64) bytes written for this step.
+			last_piece = _mm512_castsi512_si256( c0 );
+			_mm256_mask_storeu_epi64
+			(
+			  pack_a_buffer_s8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ),
+			  0xFF,
+			  last_piece
+			);
+		}
+		//TODO: Handle kc < 64 case, 48,32,16
+	}
+
+	if ( m_partial_pieces > 0 ) // leftover rows: dispatch to the routine specialised for that row count
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			packa_m5_k64_s8s8s32os32
+			(
+			  pack_a_buffer_s8s8s32o32 +  ( m_full_pieces_loop_limit * KC ),
+			  a + ( lda * m_full_pieces_loop_limit ), lda, KC
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			packa_m4_k64_s8s8s32os32
+			(
+			  pack_a_buffer_s8s8s32o32 + ( m_full_pieces_loop_limit * KC ),
+			  a + ( lda * m_full_pieces_loop_limit ), lda, KC
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			packa_m3_k64_s8s8s32os32
+			(
+			  pack_a_buffer_s8s8s32o32 + ( m_full_pieces_loop_limit * KC ),
+			  a + ( lda * m_full_pieces_loop_limit ), lda, KC
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			packa_m2_k64_s8s8s32os32
+			(
+			  pack_a_buffer_s8s8s32o32 + ( m_full_pieces_loop_limit * KC ),
+			  a + ( lda * m_full_pieces_loop_limit ), lda, KC
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			packa_m1_k64_s8s8s32os32
+			(
+			  pack_a_buffer_s8s8s32o32 + ( m_full_pieces_loop_limit * KC ),
+			  a + ( lda * m_full_pieces_loop_limit ), lda, KC
+			);
+		}
+	}
+	*rs_a = 4; // presumably the byte distance between consecutive rows inside a packed 4-byte k group; confirm against the kernel
+	*cs_a = 24; // 24 = MR * 4; presumably the byte distance between consecutive 4-byte k groups of a full panel; confirm against the kernel
+}
+
+void packa_m5_k64_s8s8s32os32 // Packs a 5-row fringe of a, 64 k elements per step (320 bytes written per step).
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32, // [out] destination for the packed 5-row panel
+       const int8_t* a, // [in] first of the 5 source rows; row r is read at a + lda * r
+       const dim_t    lda, // element distance between consecutive rows of a
+       const dim_t    KC // k elements per row. NOTE(review): the loop always processes full 64-element steps
+     )
+{
+	// Index vectors used to permute the __m512i elements into the order consumed by the vpdpbusd instruction.
+	// These are indexes of the format a0-a1-b0-b1-a2-a3-b2-b3 and a0-a1-a2-a3-b0-b1-b2-b3.
+	// Adding 4 int32 wise gives format a4-a5-b4-b5-a6-a7-b6-b7 and a4-a5-a6-a7-b4-b5-b6-b7.
+	__m512i selector1 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB );
+	__m512i selector1_1 = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF );
+	__m512i selector2 = _mm512_setr_epi64( 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xA, 0xB );
+	__m512i selector2_1 = _mm512_setr_epi64( 0x4, 0x5, 0x6, 0x7, 0xC, 0xD, 0xE, 0xF );
+
+	// First half: int32 indices 0x10-0x17 pull one int32 of row 5 (second permute source) in after every four int32 of rows 1-4.
+	__m512i selector3 = _mm512_setr_epi32( 0x0, 0x1, 0x2, 0x3, 0x10, 0x4, 0x5, 0x6, 0x7, 0x11, 0x8, 0x9, 0xA, 0xB, 0x12, 0xC);
+	__m512i selector4 = _mm512_setr_epi32( 0xD, 0xE, 0xF, 0x13, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+	__m512i selector5 = _mm512_setr_epi32( 0x0, 0x1, 0x2, 0x3, 0x14, 0x4, 0x5, 0x6, 0x7, 0x15, 0x8, 0x9, 0xA, 0xB, 0x16, 0xC);
+	__m512i selector6 = _mm512_setr_epi32( 0xD, 0xE, 0xF, 0x17, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+
+	// Second half: same pattern, taking int32 elements 0x18-0x1F of row 5.
+	__m512i selector7 = _mm512_setr_epi32( 0x0, 0x1, 0x2, 0x3, 0x18, 0x4, 0x5, 0x6, 0x7, 0x19, 0x8, 0x9, 0xA, 0xB, 0x1A, 0xC);
+	__m512i selector8 = _mm512_setr_epi32( 0xD, 0xE, 0xF, 0x1B, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+	__m512i selector9 = _mm512_setr_epi32( 0x0, 0x1, 0x2, 0x3, 0x1C, 0x4, 0x5, 0x6, 0x7, 0x1D, 0x8, 0x9, 0xA, 0xB, 0x1E, 0xC);
+	__m512i selector10 = _mm512_setr_epi32( 0xD, 0xE, 0xF, 0x1F, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+	
+	__m512i a0;
+	__m512i b0;
+	__m512i c0;
+	__m512i d0;
+	__m512i e0;
+	__m512i a01;
+	__m512i c01;
+	__m128i last_piece;
+
+	for ( dim_t kr = 0; kr < KC; kr += NR )
+	{
+		// Rearrange for vpdpbusd, read 5 rows from A with 64 elements in each row.
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+		c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr );
+		d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr );
+		e0 = _mm512_loadu_si512( a + ( lda * 4 ) + kr ); // row 5 stays untransposed; it is spliced in by the epi32 permutes below
+
+		a01 = _mm512_unpacklo_epi32( a0, b0 ); // interleave int32 elements of rows 1 and 2 (low half of each 128-bit lane)
+		a0 = _mm512_unpackhi_epi32( a0, b0 ); // same for the high half of each 128-bit lane
+
+		c01 = _mm512_unpacklo_epi32( c0, d0 ); // rows 3 and 4, low halves
+		c0 = _mm512_unpackhi_epi32( c0, d0 ); // rows 3 and 4, high halves
+
+		b0 = _mm512_unpacklo_epi64( a01, c01 ); // combine the row 1/2 pairs with the row 3/4 pairs
+		a01 = _mm512_unpackhi_epi64( a01, c01 );
+
+		d0 = _mm512_unpacklo_epi64( a0, c0 );
+		c01 = _mm512_unpackhi_epi64( a0, c0 );
+
+		a0 = _mm512_permutex2var_epi64( b0, selector1, a01 );
+		c0 = _mm512_permutex2var_epi64( d0, selector1, c01 );
+		b0 = _mm512_permutex2var_epi64( b0, selector1_1, a01 );
+		d0 = _mm512_permutex2var_epi64( d0, selector1_1, c01 );
+
+		a01 = _mm512_permutex2var_epi64( a0, selector2, c0 ); // a[0]
+		c01 = _mm512_permutex2var_epi64( b0, selector2, d0 ); // a[2]
+		a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // a[1]
+		c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // a[3]
+
+		// First half: splice row 5 (e0) into a[0] and a[1].
+		b0 = _mm512_permutex2var_epi32( a01, selector3, e0 );
+		a01 = _mm512_permutex2var_epi32( a01, selector4, e0 );
+		d0 = _mm512_permutex2var_epi32( a0, selector5, e0 );
+		a0 = _mm512_permutex2var_epi32( a0, selector6, e0 );
+
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 );
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 ); // only the low 16 bytes are meaningful; the rest is overwritten by the store at +80
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 );
+		// Last piece: only the low 128 bits (16 bytes) are written, ending the first 160 bytes of this step.
+		last_piece = _mm512_castsi512_si128( a0 );
+		_mm_mask_storeu_epi64
+		(
+		  pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ),
+		  0xFF,
+		  last_piece
+		);
+
+		// Second half: splice row 5 (e0) into a[2] and a[3].
+		b0 = _mm512_permutex2var_epi32( c01, selector7, e0 );
+		c01 = _mm512_permutex2var_epi32( c01, selector8, e0 );
+		d0 = _mm512_permutex2var_epi32( c0, selector9, e0 );
+		c0 = _mm512_permutex2var_epi32( c0, selector10, e0 );
+
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 );
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 ); // only the low 16 bytes are meaningful; the rest is overwritten by the store at +240
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 );
+		// Last piece: low 128 bits only, ending the 320 (= 5 * 64) bytes written for this step.
+		last_piece = _mm512_castsi512_si128( c0 );
+		_mm_mask_storeu_epi64
+		(
+		  pack_a_buffer_s8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ),
+		  0xFF,
+		  last_piece
+		);
+	}
+}
+
+void packa_m4_k64_s8s8s32os32 // Packs a 4-row fringe of a, 64 k elements per step (256 bytes written per step).
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32, // [out] destination for the packed 4-row panel
+       const int8_t* a, // [in] first of the 4 source rows; row r is read at a + lda * r
+       const dim_t    lda, // element distance between consecutive rows of a
+       const dim_t    KC // k elements per row. NOTE(review): the loop always processes full 64-element steps
+     )
+{
+	// Index vectors used to permute the __m512i elements into the order consumed by the vpdpbusd instruction.
+	// These are indexes of the format a0-a1-b0-b1-a2-a3-b2-b3 and a0-a1-a2-a3-b0-b1-b2-b3.
+	// Adding 4 int32 wise gives format a4-a5-b4-b5-a6-a7-b6-b7 and a4-a5-a6-a7-b4-b5-b6-b7.
+	__m512i selector1 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB );
+	__m512i selector1_1 = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF );
+	__m512i selector2 = _mm512_setr_epi64( 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xA, 0xB );
+	__m512i selector2_1 = _mm512_setr_epi64( 0x4, 0x5, 0x6, 0x7, 0xC, 0xD, 0xE, 0xF );
+	
+	__m512i a0;
+	__m512i b0;
+	__m512i c0;
+	__m512i d0;
+	__m512i a01;
+	__m512i c01;
+
+	for ( dim_t kr = 0; kr < KC; kr += NR )
+	{
+		// Rearrange for vpdpbusd, read 4 rows from A with 64 elements in each row.
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+		c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr );
+		d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr );
+
+		a01 = _mm512_unpacklo_epi32( a0, b0 ); // interleave int32 elements of rows 1 and 2 (low half of each 128-bit lane)
+		a0 = _mm512_unpackhi_epi32( a0, b0 ); // same for the high half of each 128-bit lane
+
+		c01 = _mm512_unpacklo_epi32( c0, d0 ); // rows 3 and 4, low halves
+		c0 = _mm512_unpackhi_epi32( c0, d0 ); // rows 3 and 4, high halves
+
+		b0 = _mm512_unpacklo_epi64( a01, c01 ); // combine the row 1/2 pairs with the row 3/4 pairs
+		a01 = _mm512_unpackhi_epi64( a01, c01 );
+
+		d0 = _mm512_unpacklo_epi64( a0, c0 );
+		c01 = _mm512_unpackhi_epi64( a0, c0 );
+
+		a0 = _mm512_permutex2var_epi64( b0, selector1, a01 );
+		c0 = _mm512_permutex2var_epi64( d0, selector1, c01 );
+		b0 = _mm512_permutex2var_epi64( b0, selector1_1, a01 );
+		d0 = _mm512_permutex2var_epi64( d0, selector1_1, c01 );
+
+		a01 = _mm512_permutex2var_epi64( a0, selector2, c0 ); // a[0]
+		c01 = _mm512_permutex2var_epi64( b0, selector2, d0 ); // a[2]
+		a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // a[1]
+		c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // a[3]
+
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 ); // the four vectors are stored back to back in a[0]..a[3] order
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 );
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 );
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 );
+	}
+}
+
+void packa_m3_k64_s8s8s32os32
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     )
+{
+	// Packs 3 rows of A for vpdpbusd. Each iteration reads 64 bytes from every
+	// row and emits 16 groups of 12 bytes (4 bytes of row 0, then row 1, then
+	// row 2), i.e. 192 bytes written at offset kr * 3 of the pack buffer.
+	//
+	// 64-bit lane indexes for _mm512_permutex2var_epi64 (0x0-0x7 pick from the
+	// first source, 0x8-0xF from the second); they merge the two 32-bit unpack
+	// results of rows 0/1 back into ascending k order.
+	const __m512i pairs_lo_sel = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB );
+	const __m512i pairs_hi_sel = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF );
+
+	// 32-bit indexes for _mm512_permutex2var_epi32 that insert one row 2 element
+	// (indexes 0x10 and up) after every row 0/row 1 pair. The *_head selectors
+	// fill a whole register, the *_tail ones only the low 8 elements.
+	const __m512i first_head_sel = _mm512_setr_epi32( 0x0, 0x1, 0x10, 0x2, 0x3, 0x11, 0x4, 0x5, 0x12, 0x6, 0x7, 0x13, 0x8, 0x9, 0x14, 0xA );
+	const __m512i first_tail_sel = _mm512_setr_epi32( 0xB, 0x15, 0xC, 0xD, 0x16, 0xE, 0xF, 0x17, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 );
+	const __m512i second_head_sel = _mm512_setr_epi32( 0x0, 0x1, 0x18, 0x2, 0x3, 0x19, 0x4, 0x5, 0x1A, 0x6, 0x7, 0x1B, 0x8, 0x9, 0x1C, 0xA );
+	const __m512i second_tail_sel = _mm512_setr_epi32( 0xB, 0x1D, 0xC, 0xD, 0x1E, 0xE, 0xF, 0x1F, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 );
+
+	for ( dim_t kr = 0; kr < KC; kr += NR )
+	{
+		int8_t* dst = pack_a_buffer_s8s8s32o32 + ( kr * 3 );
+
+		// 64 int8 values from each of the 3 rows.
+		const __m512i row0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		const __m512i row1 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+		const __m512i row2 = _mm512_loadu_si512( a + ( lda * 2 ) + kr );
+
+		// Interleave rows 0 and 1 in 4 byte units, then restore k order.
+		const __m512i mix_lo = _mm512_unpacklo_epi32( row0, row1 );
+		const __m512i mix_hi = _mm512_unpackhi_epi32( row0, row1 );
+		const __m512i pairs_first = _mm512_permutex2var_epi64( mix_lo, pairs_lo_sel, mix_hi ); // groups 0-7
+		const __m512i pairs_second = _mm512_permutex2var_epi64( mix_lo, pairs_hi_sel, mix_hi ); // groups 8-15
+
+		// Groups 0-7 occupy bytes 0..95. The second store also writes 32 filler
+		// bytes at 96..127; the store at offset 96 below overwrites them.
+		const __m512i out_first_head = _mm512_permutex2var_epi32( pairs_first, first_head_sel, row2 );
+		const __m512i out_first_tail = _mm512_permutex2var_epi32( pairs_first, first_tail_sel, row2 );
+		_mm512_storeu_si512( dst + 0, out_first_head );
+		_mm512_storeu_si512( dst + 64, out_first_tail );
+
+		// Groups 8-15 occupy bytes 96..191: one full register plus a 32 byte tail.
+		const __m512i out_second_head = _mm512_permutex2var_epi32( pairs_second, second_head_sel, row2 );
+		const __m512i out_second_tail = _mm512_permutex2var_epi32( pairs_second, second_tail_sel, row2 );
+		_mm512_storeu_si512( dst + 96, out_second_head );
+		// Only the low 256 bits of the tail register carry packed data.
+		_mm256_mask_storeu_epi64
+		(
+		  dst + 160,
+		  0xFF,
+		  _mm512_castsi512_si256( out_second_tail )
+		);
+	}
+}
+
+void packa_m2_k64_s8s8s32os32
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     )
+{
+	// Packs 2 rows of A for vpdpbusd: 4 byte units of row 0 and row 1 are
+	// interleaved (row 0 unit first), producing 128 bytes per iteration that
+	// are written at offset kr * 2 of the pack buffer.
+	//
+	// 64-bit lane indexes for _mm512_permutex2var_epi64 (0x0-0x7 pick from the
+	// first source, 0x8-0xF from the second).
+	const __m512i pairs_lo_sel = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB );
+	const __m512i pairs_hi_sel = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF );
+
+	for ( dim_t kr = 0; kr < KC; kr += NR )
+	{
+		int8_t* dst = pack_a_buffer_s8s8s32o32 + ( kr * 2 );
+
+		// 64 int8 values from each of the 2 rows.
+		const __m512i row0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		const __m512i row1 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+
+		// Per 128-bit lane interleave of the two rows in 4 byte units.
+		const __m512i mix_lo = _mm512_unpacklo_epi32( row0, row1 );
+		const __m512i mix_hi = _mm512_unpackhi_epi32( row0, row1 );
+
+		// Merge the halves so the unit pairs appear in ascending k order.
+		_mm512_storeu_si512( dst + 0, _mm512_permutex2var_epi64( mix_lo, pairs_lo_sel, mix_hi ) ); // pairs 0-7
+		_mm512_storeu_si512( dst + 64, _mm512_permutex2var_epi64( mix_lo, pairs_hi_sel, mix_hi ) ); // pairs 8-15
+	}
+}
+
+void packa_m1_k64_s8s8s32os32
+     (
+       int8_t*       pack_a_buffer_s8s8s32o32,
+       const int8_t* a,
+       const dim_t    lda,
+       const dim_t    KC
+     )
+{
+	// A single row needs no interleaving for vpdpbusd, so every 64 byte piece
+	// of row 0 is copied to the same offset of the pack buffer.
+	dim_t k_off = 0;
+	while ( k_off < KC )
+	{
+		const __m512i row0 = _mm512_loadu_si512( a + ( lda * 0 ) + k_off );
+		_mm512_storeu_si512( pack_a_buffer_s8s8s32o32 + k_off, row0 );
+		k_off += NR;
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c
new file mode 100644
index 0000000000..532f2c264b
--- /dev/null
+++ b/kernels/zen4/lpgemm/s8s8s32/lpgemm_packb_s8_amd512vnni.c
@@ -0,0 +1,1049 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#define NR 64 // Columns of B in one full packed panel; also the bytes in one zmm store.
+
+void packb_nrlt16_s8s8s32os32 // Fringe packer for the last n0_partial_rem (< 16) columns; called from packb_nr64_s8s8s32os32.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32,
+       int32_t*      pack_b_column_sum,
+       const int8_t* b,
+       const dim_t   ldb,
+       const dim_t   KC,
+       const dim_t   n0_partial_rem
+     );
+
+void packb_nr16_s8s8s32os32 // Fringe packer used by packb_nr64_s8s8s32os32 when 16 <= NC % NR < 32.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32,
+       int32_t*      pack_b_column_sum,
+       const int8_t* b,
+       const dim_t   ldb,
+       const dim_t   KC
+     );
+
+void packb_nr32_s8s8s32os32 // Fringe packer used by packb_nr64_s8s8s32os32 when 32 <= NC % NR < 48.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32,
+       int32_t*      pack_b_column_sum,
+       const int8_t* b,
+       const dim_t   ldb,
+       const dim_t   KC
+     );
+
+void packb_nr48_s8s8s32os32 // Fringe packer used by packb_nr64_s8s8s32os32 when 48 <= NC % NR < 64.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32,
+       int32_t*      pack_b_column_sum,
+       const int8_t* b,
+       const dim_t   ldb,
+       const dim_t   KC
+     );
+
+void packb_nr64_s8s8s32os32 // Packs B into NR (64) column panels for vpdpbusd and accumulates 128 * per column sums.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32, // [out] packed B; the panel starting at column jc begins at offset jc * KC_updated
+       int32_t*      pack_b_column_sum, // [in,out] one int32 per column; 128 * (sum over k) is added to the stored value
+       const int8_t* b, // [in] source; element ( k, n ) is read from b[ ( ldb * k ) + n ]
+       const dim_t   ldb, // element distance between consecutive k rows of b
+       const dim_t   NC, // number of columns to pack
+       const dim_t   KC, // number of k rows to pack
+       dim_t*        rs_b, // [out] always set to NR * 4
+       dim_t*        cs_b // [out] always set to NR
+     )
+{
+	// 64-bit lane indexes used with _mm512_permutex2var_epi64 to order data for the vpdpbusd instruction.
+	// With a/b the two sources, selector1 picks a0-a1-b0-b1-a2-a3-b2-b3 and selector2 picks a0-a1-a2-a3-b0-b1-b2-b3.
+	// Adding 4 to every index gives selector1_1 (a4-a5-b4-b5-a6-a7-b6-b7) and selector2_1 (a4-a5-a6-a7-b4-b5-b6-b7).
+	__m512i selector1 = _mm512_setr_epi64( 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xA, 0xB );
+	__m512i selector1_1 = _mm512_setr_epi64( 0x4, 0x5, 0xC, 0xD, 0x6, 0x7, 0xE, 0xF );
+
+	__m512i selector2 = _mm512_setr_epi64( 0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xA, 0xB );
+	__m512i selector2_1 = _mm512_setr_epi64( 0x4, 0x5, 0x6, 0x7, 0xC, 0xD, 0xE, 0xF );
+
+	dim_t n_full_pieces = NC / NR; // number of complete NR column panels
+	dim_t n_full_pieces_loop_limit = n_full_pieces * NR; // first column of the fringe panel
+	dim_t n_partial_pieces = NC % NR; // fringe columns (< NR)
+
+	dim_t k_full_pieces_blks = KC / 4; // complete groups of 4 k rows
+	dim_t k_full_pieces = k_full_pieces_blks * 4; // k rows covered by complete groups
+	dim_t k_partial_pieces = KC % 4; // leftover k rows (0-3), zero padded to 4 below
+
+	// KC when not multiple of 4 will have padding to make it multiple of 4 in packed buffer.
+	dim_t KC_updated = KC;
+	if ( k_partial_pieces > 0 )
+	{
+		KC_updated += ( 4 - k_partial_pieces );
+	}
+
+	// Column sum accumulators for one panel, 16 int32 columns per register (sum1: columns 0-15 ... sum4: 48-63).
+    	__m512i sum1, sum2, sum3, sum4;
+	__m512i mul_128 = _mm512_set1_epi32 (7); // shift count for _mm512_sllv_epi32: x << 7 == x * 128
+
+	__m512i a0;
+	__m512i b0;
+	__m512i c0;
+	__m512i d0;
+	__m512i a01;
+	__m512i c01;
+
+	for ( dim_t jc = 0; jc < n_full_pieces_loop_limit; jc += NR )
+	{
+		// Load the running column sums of this panel so the new contribution is added to them.
+        	sum1 = _mm512_loadu_si512( pack_b_column_sum + jc );
+		sum2 = _mm512_loadu_si512( pack_b_column_sum + 16 + jc );  //offset 16- as 16 int32 elements fit in 1 zmm register
+		sum3 = _mm512_loadu_si512( pack_b_column_sum + 32 + jc );
+		sum4 = _mm512_loadu_si512( pack_b_column_sum + 48 + jc );
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
+		{
+			// Rearrange for vpdpbusd, read 4 rows from B with 64 elements in each row.
+			a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc );
+			b0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc );
+			c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 2 ) ) + jc );
+			d0 = _mm512_loadu_si512( b + ( ldb * ( kr + 3 ) ) + jc );
+
+			// Per column: sum += ( a0 + b0 + c0 + d0 ) << 7, on int8 values sign extended to int32.
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+            		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 0)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 0)),
+			_mm512_add_epi32 (_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 0)),
+			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( d0, 0))))) , mul_128));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+            		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 1)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 1)),
+			_mm512_add_epi32 (_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 1)),
+			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( d0, 1))))) , mul_128));
+
+			sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (
+            		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 2)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 2)),
+			_mm512_add_epi32 (_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 2)),
+			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( d0, 2))))) , mul_128));
+
+			sum4 = _mm512_add_epi32 ( sum4, _mm512_sllv_epi32 (
+            		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 3)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 3)),
+			_mm512_add_epi32 (_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 3)),
+			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( d0, 3))))), mul_128));
+
+			a01 = _mm512_unpacklo_epi8( a0, b0 ); // byte interleave of rows kr+0 / kr+1
+			a0 = _mm512_unpackhi_epi8( a0, b0 );
+
+			c01 = _mm512_unpacklo_epi8( c0, d0 ); // byte interleave of rows kr+2 / kr+3
+			c0 = _mm512_unpackhi_epi8( c0, d0 );
+
+			b0 = _mm512_unpacklo_epi16( a01, c01 ); // each int32 now holds the 4 k values of one column
+			a01 = _mm512_unpackhi_epi16( a01, c01 );
+
+			d0 = _mm512_unpacklo_epi16( a0, c0 );
+			c01 = _mm512_unpackhi_epi16( a0, c0 );
+
+			a0 = _mm512_permutex2var_epi64( b0, selector1, a01 ); // the permutes below restore ascending column order
+			c0 = _mm512_permutex2var_epi64( d0, selector1, c01 );
+			b0 = _mm512_permutex2var_epi64( b0, selector1_1, a01 );
+			d0 = _mm512_permutex2var_epi64( d0, selector1_1, c01 );
+
+			a01 = _mm512_permutex2var_epi64( a0, selector2, c0 ); // b[0]: columns 0-15
+			c01 = _mm512_permutex2var_epi64( b0, selector2, d0 ); // b[2]: columns 32-47
+			a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1]: columns 16-31
+			c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3]: columns 48-63
+
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 );
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 );
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 );
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 );
+		}
+		// Handle k remainder: the missing rows are replaced by zero registers.
+		if ( k_partial_pieces > 0 )
+		{
+			if ( k_partial_pieces == 3 )
+			{
+				a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
+				b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc );
+				c0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 2 ) ) + jc );
+				d0 = _mm512_setzero_si512();
+
+				// Per column: sum += ( a0 + b0 + c0 ) << 7.
+                		sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 0)),
+				_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 0)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 0)))), mul_128));
+
+				sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 1)),
+				_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 1)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 1)))), mul_128));
+
+				sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 2)),
+				_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 2)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 2)))), mul_128));
+
+				sum4 = _mm512_add_epi32 ( sum4, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 3)),
+				_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 3)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( c0, 3)))), mul_128));
+
+			}
+			else if( k_partial_pieces == 2 )
+			{
+				a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
+				b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc );
+				c0 = _mm512_setzero_si512();
+				d0 = _mm512_setzero_si512();
+
+				// Per column: sum += ( a0 + b0 ) << 7.
+				sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 0)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 0))), mul_128));
+
+				sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 1)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 1))), mul_128));
+
+				sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 2)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 2))), mul_128));
+
+				sum4 = _mm512_add_epi32 ( sum4, _mm512_sllv_epi32 (
+            			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 3)),
+				_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( b0, 3))), mul_128));
+			}
+			else //k_partial_pieces == 1
+			{
+				a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
+				b0 = _mm512_setzero_si512();
+				c0 = _mm512_setzero_si512();
+				d0 = _mm512_setzero_si512();
+
+				// Per column: sum += a0 << 7.
+				sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+            			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 0)), mul_128));
+
+				sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+            			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 1)), mul_128));
+
+				sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (
+            			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 2)), mul_128));
+
+				sum4 = _mm512_add_epi32 ( sum4, _mm512_sllv_epi32 (
+            			_mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32 ( a0, 3)), mul_128));
+			}
+
+			a01 = _mm512_unpacklo_epi8( a0, b0 ); // same interleave sequence as in the full k loop above
+			a0 = _mm512_unpackhi_epi8( a0, b0 );
+
+			c01 = _mm512_unpacklo_epi8( c0, d0 );
+			c0 = _mm512_unpackhi_epi8( c0, d0 );
+
+			b0 = _mm512_unpacklo_epi16( a01, c01 );
+			a01 = _mm512_unpackhi_epi16( a01, c01 );
+
+			d0 = _mm512_unpacklo_epi16( a0, c0 );
+			c01 = _mm512_unpackhi_epi16( a0, c0 );
+
+			a0 = _mm512_permutex2var_epi64( b0, selector1, a01 );
+			c0 = _mm512_permutex2var_epi64( d0, selector1, c01 );
+			b0 = _mm512_permutex2var_epi64( b0, selector1_1, a01 );
+			d0 = _mm512_permutex2var_epi64( d0, selector1_1, c01 );
+
+			a01 = _mm512_permutex2var_epi64( a0, selector2, c0 ); // b[0]: columns 0-15
+			c01 = _mm512_permutex2var_epi64( b0, selector2, d0 ); // b[2]: columns 32-47
+			a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1]: columns 16-31
+			c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3]: columns 48-63
+
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 );
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) ) , a0 );
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 );
+			_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 );
+		}
+		// Write the updated column sums of this panel back.
+		_mm512_storeu_si512( pack_b_column_sum + jc, sum1 );
+		_mm512_storeu_si512( pack_b_column_sum + 16 + jc, sum2 );
+		_mm512_storeu_si512( pack_b_column_sum + 32 + jc, sum3 );
+		_mm512_storeu_si512( pack_b_column_sum + 48 + jc, sum4 );
+	}
+
+	// Contiguous packing of fringe panel (n` < NR).
+	if ( n_partial_pieces > 0 )
+	{
+		dim_t n0_partial_rem = n_partial_pieces % 16; // columns left after the 48/32/16 kernel
+		dim_t n0_partial_pack = 0; // columns handled by the 48/32/16 kernel
+
+		// Split into multiple smaller fringe kernels, so as to maximize
+		// vectorization after packing. Any n0 < NR(64) can be expressed
+		// as n0 = 48 + n` / n0 = 32 + n` / n0 = 16 + n`, where n` < 16.
+		dim_t n0_48 = n_partial_pieces / 48; // 1 when n_partial_pieces >= 48
+		dim_t n0_32 = n_partial_pieces / 32; // 1 when n_partial_pieces >= 32
+		dim_t n0_16 = n_partial_pieces / 16; // equals 1 only for 16 <= n_partial_pieces < 32
+
+		if ( n0_48 == 1 )
+		{
+			packb_nr48_s8s8s32os32
+			(
+			  ( pack_b_buffer_s8s8s32o32 + ( n_full_pieces_loop_limit * KC_updated ) ),
+			  ( pack_b_column_sum + n_full_pieces_loop_limit ),
+			  ( b + n_full_pieces_loop_limit ), ldb, KC
+			);
+
+			n0_partial_pack = 48;
+		}
+		else if ( n0_32 == 1 )
+		{
+			packb_nr32_s8s8s32os32
+			(
+			  ( pack_b_buffer_s8s8s32o32 + ( n_full_pieces_loop_limit * KC_updated ) ),
+			  ( pack_b_column_sum + n_full_pieces_loop_limit ),
+			  ( b + n_full_pieces_loop_limit ), ldb, KC
+			);
+
+			n0_partial_pack = 32;
+		}
+		else if ( n0_16 == 1 )
+		{
+			packb_nr16_s8s8s32os32
+			(
+			  ( pack_b_buffer_s8s8s32o32 + ( n_full_pieces_loop_limit * KC_updated ) ),
+			  ( pack_b_column_sum + n_full_pieces_loop_limit ),
+			  ( b + n_full_pieces_loop_limit ), ldb, KC
+			);
+
+			n0_partial_pack = 16;
+		}
+
+		if ( n0_partial_rem > 0 ) // the final < 16 columns are packed right after the part handled above
+		{
+			packb_nrlt16_s8s8s32os32
+			(
+			  ( pack_b_buffer_s8s8s32o32 + ( n_full_pieces_loop_limit * KC_updated ) +
+				( n0_partial_pack * KC_updated ) ),
+			  ( pack_b_column_sum + n_full_pieces_loop_limit + n0_partial_pack ),
+			  ( b + n_full_pieces_loop_limit + n0_partial_pack ), ldb, KC,
+			  n0_partial_rem
+			);
+		}
+	}
+	*rs_b = NR * 4; // matches the distance between consecutive 4 row k blocks in a full panel
+	*cs_b = NR; // NOTE(review): presumably the column stride expected by the consuming kernel - confirm
+}
+
+void packb_nr48_s8s8s32os32 // Packs a 48 column fringe panel of B for vpdpbusd and accumulates 128 * per column sums.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32, // [out] packed panel; each 4 row k block takes 3 * NR bytes
+       int32_t*      pack_b_column_sum, // [in,out] 48 int32 accumulators; 128 * (sum over k) is added to the stored value
+       const int8_t* b, // [in] first of the 48 columns; element ( k, n ) is read from b[ ( ldb * k ) + n ]
+       const dim_t   ldb, // element distance between consecutive k rows of b
+       const dim_t   KC // number of k rows to pack
+     )
+{
+	dim_t kr_new = 0; // index, in units of NR bytes, of the next 64 byte row of the packed panel
+
+	dim_t k_full_pieces_blks = KC / 4; // complete groups of 4 k rows
+	dim_t k_full_pieces = k_full_pieces_blks * 4; // k rows covered by complete groups
+	dim_t k_partial_pieces = KC % 4; // leftover k rows (0-3), zero padded to 4 below
+
+	__m256i a0_32; // *_32 registers process columns 0-31
+	__m256i b0_32;
+	__m256i c0_32;
+	__m256i d0_32;
+	__m256i a01_32;
+	__m256i c01_32;
+	__m512i a0_zmm; // *_zmm registers hold the 64 byte rows written to the packed panel
+	__m512i b0_zmm;
+	__m128i a0_16; // *_16 registers process columns 32-47
+	__m128i b0_16;
+	__m128i c0_16;
+	__m128i d0_16;
+	__m128i a01_16;
+	__m128i c01_16;
+
+	// Column sum accumulators, 16 int32 columns each (sum1: columns 0-15, sum2: 16-31, sum3: 32-47).
+    	__m512i sum1, sum2, sum3;
+	__m512i mul_128 = _mm512_set1_epi32 (7); // shift count for _mm512_sllv_epi32: x << 7 == x * 128
+
+	// Load the running column sums so the new contribution is added to them.
+    	sum1 = _mm512_loadu_si512( pack_b_column_sum );
+	sum2 = _mm512_loadu_si512( pack_b_column_sum + 16 );  //offset 16- as 16 int32 elements fit in 1 zmm register
+	sum3 = _mm512_loadu_si512( pack_b_column_sum + 32 );
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
+	{
+		// Rearrange for vpdpbusd, read 4 rows from B with the first 32 elements of each row.
+		a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) );
+		b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) );
+		c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) );
+		d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) );
+
+		// Per column: sum += ( a0 + b0 + c0 + d0 ) << 7, on int8 values sign extended to int32.
+		sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        	_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 0)),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 0)),
+		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( d0_32, 0))))) , mul_128));
+
+		sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        	_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 1)),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 1)),
+		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( d0_32, 1))))) , mul_128));
+
+		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 ); // byte interleave of rows kr+0 / kr+1
+		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
+
+		c01_32 = _mm256_unpacklo_epi8( c0_32, d0_32 ); // byte interleave of rows kr+2 / kr+3
+		c0_32 = _mm256_unpackhi_epi8( c0_32, d0_32 );
+
+		b0_32 = _mm256_unpacklo_epi16( a01_32, c01_32 ); // each int32 now holds the 4 k values of one column
+		a01_32 = _mm256_unpackhi_epi16( a01_32, c01_32 );
+
+		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
+		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
+
+		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem: columns 0-7
+		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem: columns 16-23
+		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem: columns 8-15
+		d0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x3 ); // 3 elem: columns 24-31
+
+		a0_zmm = _mm512_castsi256_si512( a0_32 );
+		a0_zmm = _mm512_inserti32x8( a0_zmm, b0_32, 0x1 );
+		b0_zmm = _mm512_castsi256_si512( c0_32 );
+		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
+
+		// First 4x32 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+
+		// Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row.
+		a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + ( 32 ) );
+		b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + ( 32 ) );
+		c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) + ( 32 ) );
+		d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) + ( 32 ) );
+
+		// Per column: sum += ( a0_16 + b0_16 + c0_16 + d0_16 ) << 7.
+		sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ), 
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32( b0_16 ), _mm512_add_epi32 ( _mm512_cvtepi8_epi32( c0_16 ), 
+		_mm512_cvtepi8_epi32( d0_16 )))) , mul_128 ));
+
+		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
+		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
+
+		c01_16 = _mm_unpacklo_epi8( c0_16, d0_16 );
+		c0_16 = _mm_unpackhi_epi8( c0_16, d0_16 );
+
+		b0_16 = _mm_unpacklo_epi16( a01_16, c01_16 ); // 0 elem: columns 32-35
+		a01_16 = _mm_unpackhi_epi16( a01_16, c01_16 ); // 1 elem: columns 36-39
+		d0_16 = _mm_unpacklo_epi16( a0_16, c0_16 ); // 2 elem: columns 40-43
+		c01_16 = _mm_unpackhi_epi16( a0_16, c0_16 ); // 3 elem: columns 44-47
+
+		a0_zmm = _mm512_castsi128_si512( b0_16 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
+
+		// Last 4x16 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm );
+
+		// 4 k rows of 48 columns fill only 3 rows of NR bytes (2 for columns 0-31, 1 for
+		// columns 32-47), so the packed row index advances by 3 rather than 4.
+		kr_new += 3;
+	}
+	// Handle k remainder: the missing rows are replaced by zero registers.
+	if ( k_partial_pieces > 0 )
+	{
+		if ( k_partial_pieces == 3 )
+		{
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) );
+			d0_32 = _mm256_setzero_si256();
+
+			// Per column: sum += ( a0 + b0 + c0 ) << 7.
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 0)),
+			_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 0)))) , mul_128));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 1)),
+			_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 1)))) , mul_128));
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) );
+			c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) );
+			d0_16 = _mm_setzero_si128();
+
+			sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (_mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32( b0_16 ), _mm512_cvtepi8_epi32( c0_16 ))) , mul_128));
+
+		}
+		else if( k_partial_pieces == 2 )
+		{
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_32 = _mm256_setzero_si256();
+			d0_32 = _mm256_setzero_si256();
+
+			// Per column: sum += ( a0 + b0 ) << 7.
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)),
+			_mm512_cvtepi8_epi32( _mm256_extracti32x4_epi32 ( b0_32, 0) )) , mul_128 ));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)),
+			_mm512_cvtepi8_epi32( _mm256_extracti32x4_epi32 ( b0_32, 1) )) , mul_128 ));
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) );
+			c0_16 = _mm_setzero_si128();
+			d0_16 = _mm_setzero_si128();
+
+			sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+			_mm512_cvtepi8_epi32( b0_16 )) , mul_128));
+		}
+		else //k_partial_pieces == 1
+		{
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_setzero_si256();
+			c0_32 = _mm256_setzero_si256();
+			d0_32 = _mm256_setzero_si256();
+
+			// Per column: sum += a0 << 7.
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)) , mul_128));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)) , mul_128));
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
+			b0_16 = _mm_setzero_si128();
+			c0_16 = _mm_setzero_si128();
+			d0_16 = _mm_setzero_si128();
+
+			sum3 = _mm512_add_epi32 ( sum3, _mm512_sllv_epi32 (
+        		_mm512_cvtepi8_epi32( a0_16 ) , mul_128));
+		}
+
+		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 ); // same interleave sequence as in the full k loop above
+		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
+
+		c01_32 = _mm256_unpacklo_epi8( c0_32, d0_32 );
+		c0_32 = _mm256_unpackhi_epi8( c0_32, d0_32 );
+
+		b0_32 = _mm256_unpacklo_epi16( a01_32, c01_32 );
+		a01_32 = _mm256_unpackhi_epi16( a01_32, c01_32 );
+
+		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
+		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
+
+		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem: columns 0-7
+		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem: columns 16-23
+		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem: columns 8-15
+		d0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x3 ); // 3 elem: columns 24-31
+
+		a0_zmm = _mm512_castsi256_si512( a0_32 );
+		a0_zmm = _mm512_inserti32x8( a0_zmm, b0_32, 0x1 );
+		b0_zmm = _mm512_castsi256_si512( c0_32 );
+		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
+
+		// First 4x32 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+
+		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
+		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
+
+		c01_16 = _mm_unpacklo_epi8( c0_16, d0_16 );
+		c0_16 = _mm_unpackhi_epi8( c0_16, d0_16 );
+
+		b0_16 = _mm_unpacklo_epi16( a01_16, c01_16 ); // 0 elem: columns 32-35
+		a01_16 = _mm_unpackhi_epi16( a01_16, c01_16 ); // 1 elem: columns 36-39
+		d0_16 = _mm_unpacklo_epi16( a0_16, c0_16 ); // 2 elem: columns 40-43
+		c01_16 = _mm_unpackhi_epi16( a0_16, c0_16 ); // 3 elem: columns 44-47
+
+		a0_zmm = _mm512_castsi128_si512( b0_16 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
+
+		// Last 4x16 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm );
+	}
+	// Write the updated column sums back.
+	_mm512_storeu_si512( pack_b_column_sum, sum1 );
+	_mm512_storeu_si512( pack_b_column_sum + 16, sum2 );
+	_mm512_storeu_si512( pack_b_column_sum + 32, sum3 );
+}
+
+void packb_nr32_s8s8s32os32 // Packs a KC x 32 int8 panel of B into 4-row interleaved form and updates its column sums.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32, // Out: packed panel, written 64 bytes per store.
+       int32_t*      pack_b_column_sum, // In/out: 32 int32 accumulators, loaded on entry and stored on exit.
+       const int8_t* b, // In: source panel; row kr is read from b + ldb * kr.
+       const dim_t   ldb, // Element stride between consecutive rows of b.
+       const dim_t   KC // Number of rows (k dimension) to pack.
+     )
+{
+	dim_t kr_new = 0; // Output slot index; the destination offset is kr_new * NR bytes.
+
+	dim_t k_full_pieces_blks = KC / 4;
+	dim_t k_full_pieces = k_full_pieces_blks * 4; // KC rounded down to a multiple of 4.
+	dim_t k_partial_pieces = KC % 4; // 0-3 leftover rows, packed after the main loop.
+
+	__m256i a0_32;
+	__m256i b0_32;
+	__m256i c0_32;
+	__m256i d0_32;
+	__m256i a01_32;
+	__m256i c01_32;
+	__m512i a0_zmm;
+	__m512i b0_zmm;
+
+	// Accumulators for the column sums of B (16 int32 lanes each).
+    	__m512i sum1, sum2;
+	__m512i mul_128 = _mm512_set1_epi32 (7); // Shift count for sllv: << 7 scales each lane by 128.
+
+	// Resume from the running column sums already held in pack_b_column_sum.
+    	sum1 = _mm512_loadu_si512( pack_b_column_sum );
+	sum2 = _mm512_loadu_si512( pack_b_column_sum + 16 );  //offset 16- as 16 int32 elements fit in 1 zmm register
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
+	{
+		// Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row.
+		a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) );
+		b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) );
+		c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) );
+		d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) );
+
+		// sum += ( a0 + b0 + c0 + d0 ) << 7 per column, on sign-extended int32 values (columns 0-15, then 16-31).
+		sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        	_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 0)),
+		_mm512_add_epi32 (_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 0)),
+		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( d0_32, 0))))) , mul_128));
+
+		sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        	_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 1)),
+		_mm512_add_epi32 (_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 1)),
+		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( d0_32, 1))))) , mul_128));
+
+		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 ); // Byte-interleave rows 0/1, low half of each 128-bit lane.
+		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 ); // High half; overwrites a0_32, so it must follow the line above.
+
+		c01_32 = _mm256_unpacklo_epi8( c0_32, d0_32 ); // Same for rows 2/3.
+		c0_32 = _mm256_unpackhi_epi8( c0_32, d0_32 );
+
+		b0_32 = _mm256_unpacklo_epi16( a01_32, c01_32 ); // Pair rows 0/1 with rows 2/3: 4 k-bytes per column.
+		a01_32 = _mm256_unpackhi_epi16( a01_32, c01_32 );
+
+		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
+		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
+
+		// Regroup the 128-bit lanes; b0_32 and d0_32 are read before they are overwritten.
+		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem
+		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem
+		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem
+		d0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x3 ); // 3 elem
+
+		a0_zmm = _mm512_castsi256_si512( a0_32 );
+		a0_zmm = _mm512_inserti32x8( a0_zmm, b0_32, 0x1 ); // a0_zmm = { elem 0, elem 1 }.
+		b0_zmm = _mm512_castsi256_si512( c0_32 );
+		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 ); // b0_zmm = { elem 2, elem 3 }.
+
+		// First 4x32 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+
+		// The 3rd and 4th 16byte chunks will be ignored, since they are not part of the original data,
+		// but are here due to the packing in 4 16byte chunks format.
+		kr_new += 2;
+	}
+	// Handle k remainder: rows past KC are replaced by zero vectors so the same interleave can be reused.
+	if ( k_partial_pieces > 0 )
+	{
+		if ( k_partial_pieces == 3 )
+		{
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) );
+			d0_32 = _mm256_setzero_si256();
+
+			//add all the columns : sum = add (sum, a0, b0, c0)
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 0)),
+			_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 0)))) , mul_128));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 1)),
+			_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( c0_32, 1)))) , mul_128));
+
+		}
+		else if( k_partial_pieces == 2 )
+		{
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_32 = _mm256_setzero_si256();
+			d0_32 = _mm256_setzero_si256();
+
+			//add all the columns : sum = add (sum, a0, b0)
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)),
+			_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 0))) , mul_128));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        		_mm512_add_epi32 ( _mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)),
+			_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( b0_32, 1))) , mul_128));
+		}
+		else //k_partial_pieces == 1
+		{
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_setzero_si256();
+			c0_32 = _mm256_setzero_si256();
+			d0_32 = _mm256_setzero_si256();
+
+			//only one row is left : sum = add (sum, a0)
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (
+        		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 0)) , mul_128));
+
+			sum2 = _mm512_add_epi32 ( sum2, _mm512_sllv_epi32 (
+        		_mm512_cvtepi8_epi32(_mm256_extracti32x4_epi32 ( a0_32, 1)) , mul_128));
+		}
+
+		// Same interleave as in the main loop, applied to the zero-padded rows.
+		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 );
+		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
+
+		c01_32 = _mm256_unpacklo_epi8( c0_32, d0_32 );
+		c0_32 = _mm256_unpackhi_epi8( c0_32, d0_32 );
+
+		b0_32 = _mm256_unpacklo_epi16( a01_32, c01_32 );
+		a01_32 = _mm256_unpackhi_epi16( a01_32, c01_32 );
+
+		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
+		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
+
+		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem
+		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem
+		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem
+		d0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x3 ); // 3 elem
+
+		a0_zmm = _mm512_castsi256_si512( a0_32 );
+		a0_zmm = _mm512_inserti32x8( a0_zmm, b0_32, 0x1 );
+		b0_zmm = _mm512_castsi256_si512( c0_32 );
+		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
+
+		// First 4x32 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+	}
+	//store the sum column
+	_mm512_storeu_si512( pack_b_column_sum, sum1 );
+	_mm512_storeu_si512( pack_b_column_sum + 16, sum2 );
+}
+
+void packb_nr16_s8s8s32os32 // Packs a KC x 16 int8 panel of B into 4-row interleaved form and updates its column sums.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32, // Out: packed panel, written 64 bytes per store.
+       int32_t*      pack_b_column_sum, // In/out: 16 int32 accumulators, loaded on entry and stored on exit.
+       const int8_t* b, // In: source panel; row kr is read from b + ldb * kr.
+       const dim_t   ldb, // Element stride between consecutive rows of b.
+       const dim_t   KC // Number of rows (k dimension) to pack.
+     )
+{
+	dim_t kr_new = 0; // Output slot index; the destination offset is kr_new * NR bytes.
+
+	dim_t k_full_pieces_blks = KC / 4;
+	dim_t k_full_pieces = k_full_pieces_blks * 4; // KC rounded down to a multiple of 4.
+	dim_t k_partial_pieces = KC % 4; // 0-3 leftover rows, packed after the main loop.
+
+	__m128i a0_16;
+	__m128i b0_16;
+	__m128i c0_16;
+	__m128i d0_16;
+	__m128i a01_16;
+	__m128i c01_16;
+	__m512i a0_zmm; // Shared by the main loop and the k-remainder path.
+
+	// Accumulator for the column sums of B (16 int32 lanes).
+    	__m512i sum1;
+	__m512i mul_128 = _mm512_set1_epi32 (7); // Shift count for sllv: << 7 scales each lane by 128.
+
+	// Resume from the running column sums already held in pack_b_column_sum.
+    	sum1 = _mm512_loadu_si512( pack_b_column_sum );
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
+	{
+		// Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row.
+		a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) );
+		b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) );
+		c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) );
+		d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) );
+
+		// sum += ( a0 + b0 + c0 + d0 ) << 7 per column, on sign-extended int32 values.
+		sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32( b0_16 ), _mm512_add_epi32 ( _mm512_cvtepi8_epi32( c0_16 ),
+		_mm512_cvtepi8_epi32( d0_16 )))) , mul_128 ));
+
+		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); // Byte-interleave rows 0/1, columns 0-7.
+		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); // Columns 8-15; overwrites a0_16, so it must follow the line above.
+
+		c01_16 = _mm_unpacklo_epi8( c0_16, d0_16 ); // Same for rows 2/3.
+		c0_16 = _mm_unpackhi_epi8( c0_16, d0_16 );
+
+		b0_16 = _mm_unpacklo_epi16( a01_16, c01_16 ); // 0 elem
+		a01_16 = _mm_unpackhi_epi16( a01_16, c01_16 ); // 1 elem
+		d0_16 = _mm_unpacklo_epi16( a0_16, c0_16 ); // 2 elem
+		c01_16 = _mm_unpackhi_epi16( a0_16, c0_16 ); // 3 elem
+
+		a0_zmm = _mm512_castsi128_si512( b0_16 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
+
+		// Last 4x16 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+
+		// The 2nd, 3rd, and 4th 16byte chunks will be ignored, since they are not part of the original data,
+		// but are here due to the packing in 4 16byte chunks format.
+		kr_new += 1;
+	}
+	// Handle k remainder: rows past KC are replaced by zero vectors so the same interleave can be reused.
+	if ( k_partial_pieces > 0 )
+	{
+		if ( k_partial_pieces == 3 )
+		{
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) );
+			d0_16 = _mm_setzero_si128();
+
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (_mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32( b0_16 ), _mm512_cvtepi8_epi32( c0_16 ))) , mul_128));
+
+		}
+		else if( k_partial_pieces == 2 )
+		{
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_16 = _mm_setzero_si128();
+			d0_16 = _mm_setzero_si128();
+
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+			_mm512_cvtepi8_epi32( b0_16 )) , mul_128));
+		}
+		else //k_partial_pieces == 1
+		{
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_16 = _mm_setzero_si128();
+			c0_16 = _mm_setzero_si128();
+			d0_16 = _mm_setzero_si128();
+
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_cvtepi8_epi32( a0_16 ) , mul_128 ));
+		}
+
+		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
+		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
+
+		c01_16 = _mm_unpacklo_epi8( c0_16, d0_16 );
+		c0_16 = _mm_unpackhi_epi8( c0_16, d0_16 );
+
+		b0_16 = _mm_unpacklo_epi16( a01_16, c01_16 ); // 0 elem
+		a01_16 = _mm_unpackhi_epi16( a01_16, c01_16 ); // 1 elem
+		d0_16 = _mm_unpacklo_epi16( a0_16, c0_16 ); // 2 elem
+		c01_16 = _mm_unpackhi_epi16( a0_16, c0_16 ); // 3 elem
+
+		a0_zmm = _mm512_castsi128_si512( b0_16 ); // Reuses the function-scope a0_zmm instead of shadowing it.
+		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
+
+		// Last 4x16 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+	}
+	//store the sum column
+	_mm512_storeu_si512( pack_b_column_sum, sum1 );
+}
+
+void packb_nrlt16_s8s8s32os32 // Packs a KC x n0_partial_rem int8 panel of B, zero-padded to 16 columns, and updates its column sums.
+     (
+       int8_t*       pack_b_buffer_s8s8s32o32, // Out: packed panel, written 64 bytes per store.
+       int32_t*      pack_b_column_sum, // In/out: 16 int32 accumulators, loaded on entry and stored on exit.
+       const int8_t* b, // In: source panel; row kr is read from b + ldb * kr.
+       const dim_t   ldb, // Element stride between consecutive rows of b.
+       const dim_t   KC, // Number of rows (k dimension) to pack.
+       const dim_t   n0_partial_rem // Valid columns per row; must not exceed 16, the size of the staging buffers.
+     )
+{
+	int8_t buf0[16] = { 0 }; // Staging rows, zero-initialised: all 16 bytes are loaded below,
+	int8_t buf1[16] = { 0 }; // but memcpy only fills the first n0_partial_rem of them. Without
+	int8_t buf2[16] = { 0 }; // the initialiser the tail bytes would be indeterminate and would
+	int8_t buf3[16] = { 0 }; // leak into the packed buffer and the column sums.
+
+	dim_t kr_new = 0; // Output slot index; the destination offset is kr_new * NR bytes.
+
+	dim_t k_full_pieces_blks = KC / 4;
+	dim_t k_full_pieces = k_full_pieces_blks * 4; // KC rounded down to a multiple of 4.
+	dim_t k_partial_pieces = KC % 4; // 0-3 leftover rows, packed after the main loop.
+
+	__m128i a0_16;
+	__m128i b0_16;
+	__m128i c0_16;
+	__m128i d0_16;
+	__m128i a01_16;
+	__m128i c01_16;
+	__m512i a0_zmm; // Shared by the main loop and the k-remainder path.
+
+	// Accumulator for the column sums of B (16 int32 lanes).
+    	__m512i sum1;
+	__m512i mul_128 = _mm512_set1_epi32 (7); // Shift count for sllv: << 7 scales each lane by 128.
+
+	// Resume from the running column sums already held in pack_b_column_sum.
+    	sum1 = _mm512_loadu_si512( pack_b_column_sum );
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
+	{
+		memcpy( buf0, ( b + ( ldb * ( kr + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+		memcpy( buf1, ( b + ( ldb * ( kr + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+		memcpy( buf2, ( b + ( ldb * ( kr + 2 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+		memcpy( buf3, ( b + ( ldb * ( kr + 3 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+
+		// Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row.
+		a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+		b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 );
+		c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 );
+		d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf3 );
+
+		// sum += ( a0 + b0 + c0 + d0 ) << 7 per column; padded columns contribute zero.
+		sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+		_mm512_add_epi32 ( _mm512_cvtepi8_epi32( b0_16 ), _mm512_add_epi32 ( _mm512_cvtepi8_epi32( c0_16 ),
+		_mm512_cvtepi8_epi32( d0_16 )))) , mul_128 ));
+
+		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 ); // Byte-interleave rows 0/1, columns 0-7.
+		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 ); // Columns 8-15; overwrites a0_16, so it must follow the line above.
+
+		c01_16 = _mm_unpacklo_epi8( c0_16, d0_16 ); // Same for rows 2/3.
+		c0_16 = _mm_unpackhi_epi8( c0_16, d0_16 );
+
+		b0_16 = _mm_unpacklo_epi16( a01_16, c01_16 ); // 0 elem
+		a01_16 = _mm_unpackhi_epi16( a01_16, c01_16 ); // 1 elem
+		d0_16 = _mm_unpacklo_epi16( a0_16, c0_16 ); // 2 elem
+		c01_16 = _mm_unpackhi_epi16( a0_16, c0_16 ); // 3 elem
+
+		a0_zmm = _mm512_castsi128_si512( b0_16 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
+
+		// Last 4x16 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+
+		// The 2nd, 3rd, and 4th 16byte chunks will be ignored, since they are not part of the original data,
+		// but are here due to the packing in 4 16byte chunks format.
+		kr_new += 1;
+	}
+	// Handle k remainder: rows past KC are replaced by zero vectors so the same interleave can be reused.
+	if ( k_partial_pieces > 0 )
+	{
+		if ( k_partial_pieces == 3 )
+		{
+			memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+			memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+			memcpy( buf2, ( b + ( ldb * ( k_full_pieces + 2 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 );
+			c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 );
+			d0_16 = _mm_setzero_si128();
+
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 (_mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+			_mm512_add_epi32 ( _mm512_cvtepi8_epi32( b0_16 ), _mm512_cvtepi8_epi32( c0_16 ))) , mul_128));
+
+		}
+		else if( k_partial_pieces == 2 )
+		{
+			memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+			memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 );
+			c0_16 = _mm_setzero_si128();
+			d0_16 = _mm_setzero_si128();
+
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_add_epi32 ( _mm512_cvtepi8_epi32( a0_16 ),
+			_mm512_cvtepi8_epi32( b0_16 )) , mul_128));
+		}
+		else //k_partial_pieces == 1
+		{
+			memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+			b0_16 = _mm_setzero_si128();
+			c0_16 = _mm_setzero_si128();
+			d0_16 = _mm_setzero_si128();
+
+			sum1 = _mm512_add_epi32 ( sum1, _mm512_sllv_epi32 ( _mm512_cvtepi8_epi32( a0_16 ) , mul_128 ));
+		}
+
+		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
+		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
+
+		c01_16 = _mm_unpacklo_epi8( c0_16, d0_16 );
+		c0_16 = _mm_unpackhi_epi8( c0_16, d0_16 );
+
+		b0_16 = _mm_unpacklo_epi16( a01_16, c01_16 ); // 0 elem
+		a01_16 = _mm_unpackhi_epi16( a01_16, c01_16 ); // 1 elem
+		d0_16 = _mm_unpacklo_epi16( a0_16, c0_16 ); // 2 elem
+		c01_16 = _mm_unpackhi_epi16( a0_16, c0_16 ); // 3 elem
+
+		a0_zmm = _mm512_castsi128_si512( b0_16 ); // Reuses the function-scope a0_zmm instead of shadowing it.
+		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
+		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
+
+		// Last 4x16 elements.
+		_mm512_storeu_si512( pack_b_buffer_s8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+	}
+	//store the sum column
+	_mm512_storeu_si512( pack_b_column_sum, sum1 );
+}
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c
similarity index 51%
rename from addon/aocl_gemm/kernels/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c
rename to kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c
index f249106a3c..f79cd8775a 100644
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_6x64rowmajor_amd512vnni.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,26 +33,31 @@
 */
 
 #include <immintrin.h>
-
 #include "blis.h"
-#include "lpgemm_kernels.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
 #include "lpgemm_s32_kern_macros.h"
+#include "lpgemm_s32_memcpy_macros.h"
 
-#ifdef BLIS_KERNELS_ZEN4
 // 6x64 int8o32 kernel
+__attribute__((aligned(64)))
 LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 {
 	static void* post_ops_labels[] =
-						{
-						  &&POST_OPS_6x64_DISABLE,
-						  &&POST_OPS_BIAS_6x64,
-						  &&POST_OPS_RELU_6x64,
-						  &&POST_OPS_RELU_SCALE_6x64,
-						  &&POST_OPS_DOWNSCALE_6x64
-						};
-
-	dim_t MR = 6;
-	dim_t NR = 64;
+				{
+				  &&POST_OPS_6x64_DISABLE,
+				  &&POST_OPS_BIAS_6x64,
+				  &&POST_OPS_RELU_6x64,
+				  &&POST_OPS_RELU_SCALE_6x64,
+				  &&POST_OPS_GELU_TANH_6x64,
+				  &&POST_OPS_GELU_ERF_6x64,
+				  &&POST_OPS_CLIP_6x64,
+				  &&POST_OPS_DOWNSCALE_6x64
+				};
+
+	const dim_t MR = 6;
+	const dim_t NR = 64;
 
 	dim_t m_full_pieces = m0 / MR;
 	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
@@ -61,8 +66,6 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 	dim_t k_full_pieces = k0 / 4;
 	dim_t k_partial_pieces = k0 % 4;
 
-	uint32_t a_kfringe_buf = 0;
-
 	if ( n0 < NR )
 	{
 		dim_t n0_rem = n0 % 16;
@@ -73,7 +76,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 		dim_t n0_48 = n0 / 48;
 		dim_t n0_32 = n0 / 32;
 		dim_t n0_16 = n0 / 16;
-		
+
 		// KC when not multiple of 4 will have padding to make it multiple of
 		// 4 in packed buffer. Also the k0 cannot be passed as the updated
 		// value since A matrix is not packed and requires original k0.
@@ -92,64 +95,56 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			  b, ( ( rs_b / 4 ) * 3 ), cs_b,
 			  c, rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 
 			b = b + ( 48 * k0_updated ); // k0x48 packed contiguosly.
 			c = c + 48;
-			post_op_c_j += 48;
+			post_ops_attr.post_op_c_j += 48;
 		}
 		else if ( n0_32 == 1 )
 		{
-			lpgemm_rowvar_u8s8s32o32_6x32
+			lpgemm_rowvar_u8s8s32o32_9x32
 			(
 			  m0, k0,
 			  a, rs_a, cs_a, ps_a,
 			  b, ( ( rs_b / 4 ) * 2 ), cs_b,
 			  c, rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 
 			b = b + ( 32 * k0_updated ); // k0x32 packed contiguosly.
 			c = c + 32;
-			post_op_c_j += 32;
+			post_ops_attr.post_op_c_j += 32;
 		}
 		else if ( n0_16 == 1 )
 		{
-			lpgemm_rowvar_u8s8s32o32_6x16
+			lpgemm_rowvar_u8s8s32o32_12x16
 			(
 			  m0, k0,
 			  a, rs_a, cs_a, ps_a,
 			  b, ( ( rs_b / 4 ) * 1 ), cs_b,
 			  c, rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 
 			b = b + ( 16 * k0_updated ); // k0x16 packed contiguosly.
 			c = c + 16;
-			post_op_c_j += 16;
+			post_ops_attr.post_op_c_j += 16;
 		}
 
 		if ( n0_rem > 0 )
 		{
-			lpgemm_rowvar_u8s8s32o32_6xlt16
+			lpgemm_rowvar_u8s8s32o32_12xlt16
 			(
 			  m0, k0,
 			  a, rs_a, cs_a, ps_a,
 			  b, ( ( rs_b / 4 ) * 1 ), cs_b,
 			  c, rs_c,
 			  alpha, beta, n0_rem,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 
 			// No leftover fringe after this point.
@@ -167,9 +162,15 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 	// A matrix storage.
 	__m512i a_int32_0;
 	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
 
+	_mm_prefetch( a, _MM_HINT_T0 );
 	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
 	{
+		_mm_prefetch( b, _MM_HINT_T0 );
+		_mm_prefetch( a + ( MR * ps_a ) + ( 0 * 16 ), _MM_HINT_T1 );
+
 		// Registers to use for accumulating C.
 		__m512i c_int32_0p0 = _mm512_setzero_epi32();
 		__m512i c_int32_0p1 = _mm512_setzero_epi32();
@@ -185,7 +186,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 		__m512i c_int32_2p1 = _mm512_setzero_epi32();
 		__m512i c_int32_2p2 = _mm512_setzero_epi32();
 		__m512i c_int32_2p3 = _mm512_setzero_epi32();
-		
+
 		__m512i c_int32_3p0 = _mm512_setzero_epi32();
 		__m512i c_int32_3p1 = _mm512_setzero_epi32();
 		__m512i c_int32_3p2 = _mm512_setzero_epi32();
@@ -211,66 +212,79 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			// instructions and each load to ZMM register will have 4 elements
 			// along k direction and 16 elements across n directions, so 4x16
 			// elements to a ZMM register.
-			b0 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 0 ) );
-			
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
 			// Broadcast a[0,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
-			
-			b1 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 1 ) );
-			b2 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 2 ) );
-			b3 = _mm512_loadu_epi8( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+			a_int32_0 = _mm512_set1_epi32
+					(
+					  *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) )
+					);
+
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+			b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32
+					(
+					  *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) )
+					);
 
 			// Perform column direction mat-mul with k = 4.
 			// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
 			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
-
-			// Broadcast a[1,kr:kr+4].
-			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
-			
 			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
 			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
 			c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
 
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32
+					(
+					  *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) )
+					);
+
 			// Perform column direction mat-mul with k = 4.
 			// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
 			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-			
-			// Broadcast a[2,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
-			
 			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
 			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
 			c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
 
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32
+					(
+					  *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) )
+					);
+
 			// Perform column direction mat-mul with k = 4.
 			// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
-			// Broadcast a[3,kr:kr+4].
-			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
-			
-			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-			c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_2, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_2, b2 );
+			c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_2, b3 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32
+					(
+					  *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) )
+					);
 
 			// Perform column direction mat-mul with k = 4.
 			// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
-			
-			// Broadcast a[4,kr:kr+4].
-			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
-			
-			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
-			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
-			c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_3, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_3, b2 );
+			c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_3, b3 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32
+					(
+					  *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) )
+					);
 
 			// Perform column direction mat-mul with k = 4.
 			// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
 			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			
-			// Broadcast a[5,kr:kr+4].
-			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
-			
 			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
 			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
 			c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
@@ -285,34 +299,35 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 		// Handle k remainder.
 		if ( k_partial_pieces > 0 )
 		{
-			b0 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
-			
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
 			// Broadcast a[0,kr:kr+4].
-			memcpy
+			a_kfringe_buf = _mm_maskz_loadu_epi8
 			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
 			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-			
-			b1 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
-			b2 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
-			b3 = _mm512_loadu_epi8( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+			b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
 
 			// Perform column direction mat-mul with k = 4.
 			// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
 			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
 
 			// Broadcast a[1,kr:kr+4].
-			memcpy
+			a_kfringe_buf = _mm_maskz_loadu_epi8
 			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
 			);
-			a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-			
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
 			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
 			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
 			c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
@@ -320,67 +335,63 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			// Perform column direction mat-mul with k = 4.
 			// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
 			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
-			
+
 			// Broadcast a[2,kr:kr+4].
-			memcpy
+			a_kfringe_buf = _mm_maskz_loadu_epi8
 			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
 			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-			
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
 			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
 			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
 			c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
 
 			// Perform column direction mat-mul with k = 4.
 			// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
-			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
-			
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
 			// Broadcast a[3,kr:kr+4].
-			memcpy
+			a_kfringe_buf = _mm_maskz_loadu_epi8
 			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
 			);
-			a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-			
-			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
-			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
-			c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_2, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_2, b2 );
+			c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_2, b3 );
 
 			// Perform column direction mat-mul with k = 4.
 			// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
-			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
-			
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
 			// Broadcast a[4,kr:kr+4].
-			memcpy
+			a_kfringe_buf = _mm_maskz_loadu_epi8
 			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
 			);
-			a_int32_0 = _mm512_set1_epi32( a_kfringe_buf );
-			
-			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
-			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
-			c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_3, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_3, b2 );
+			c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_3, b3 );
 
 			// Perform column direction mat-mul with k = 4.
 			// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
 			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
-			
+
 			// Broadcast a[5,kr:kr+4].
-			memcpy
+			a_kfringe_buf = _mm_maskz_loadu_epi8
 			(
-			  &a_kfringe_buf,
-			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) ),
-			  ( k_partial_pieces * sizeof( uint8_t ) )
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
 			);
-			a_int32_1 = _mm512_set1_epi32( a_kfringe_buf );
-			
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
 			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
 			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
 			c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
@@ -397,159 +408,86 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 		__m512i selector1 = _mm512_set1_epi32( alpha );
 		__m512i selector2 = _mm512_set1_epi32( beta );
 
-		// Scale by alpha
-		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
-		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
-		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
-		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
-
-		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
-		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
-		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
-		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
-		
-		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
-		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
-		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
-		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
-		
-		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
-		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
-		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
-		c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
-		
-		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
-		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
-		c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
-		c_int32_4p3 = _mm512_mullo_epi32( selector1, c_int32_4p3 );
-		
-		c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
-		c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
-		c_int32_5p2 = _mm512_mullo_epi32( selector1, c_int32_5p2 );
-		c_int32_5p3 = _mm512_mullo_epi32( selector1, c_int32_5p3 );
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+			c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+			c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+			c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+			c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+			c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+			c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+			c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+			c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+			c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+			c_int32_4p3 = _mm512_mullo_epi32( selector1, c_int32_4p3 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+			c_int32_5p2 = _mm512_mullo_epi32( selector1, c_int32_5p2 );
+			c_int32_5p3 = _mm512_mullo_epi32( selector1, c_int32_5p3 );
+		}
 
 		// Scale C by beta.
 		if ( beta != 0 )
 		{
-			// c[0,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
-
-			// c[0, 16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p1 = _mm512_add_epi32( selector1, c_int32_0p1 );
-
-			// c[0,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p2 = _mm512_add_epi32( selector1, c_int32_0p2 );
-
-			// c[0,48-63]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_0p3 = _mm512_add_epi32( selector1, c_int32_0p3 );
-
-			// c[1,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
-
-			// c[1,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p1 = _mm512_add_epi32( selector1, c_int32_1p1 );
-
-			// c[1,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p2 = _mm512_add_epi32( selector1, c_int32_1p2 );
-
-			// c[1,48-63]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_1p3 = _mm512_add_epi32( selector1, c_int32_1p3 );
+			// For the downscaled api (C-s8), the output C matrix values need
+			// to be upscaled to s32 to be used for beta scale.
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,0,selector1,selector2);
 
-			// c[2,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+				// c[1:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,1,selector1,selector2);
 
-			// c[2,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p1 = _mm512_add_epi32( selector1, c_int32_2p1 );
+				// c[2:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,2,selector1,selector2);
 
-			// c[2,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p2 = _mm512_add_epi32( selector1, c_int32_2p2 );
+				// c[3:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,3,selector1,selector2);
 
-			// c[2,48-63]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_2p3 = _mm512_add_epi32( selector1, c_int32_2p3 );
+				// c[4:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,4,selector1,selector2);
 
-			// c[3,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+				// c[5:0-15,16-31,32-47,48-63]
+				S8_S32_BETA_OP4(ir,5,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,0,selector1,selector2);
 
-			// c[3,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p1 = _mm512_add_epi32( selector1, c_int32_3p1 );
+				// c[1:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,1,selector1,selector2);
 
-			// c[3,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p2 = _mm512_add_epi32( selector1, c_int32_3p2 );
+				// c[2:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,2,selector1,selector2);
 
-			// c[3,48-63]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_3p3 = _mm512_add_epi32( selector1, c_int32_3p3 );
+				// c[3:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,3,selector1,selector2);
 
-			// c[4,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+				// c[4:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,4,selector1,selector2);
 
-			// c[4,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p1 = _mm512_add_epi32( selector1, c_int32_4p1 );
-
-			// c[4,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p2 = _mm512_add_epi32( selector1, c_int32_4p2 );
-
-			// c[4,48-63]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_4p3 = _mm512_add_epi32( selector1, c_int32_4p3 );
-
-			// c[5,0-15]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
-
-			// c[5,16-31]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p1 = _mm512_add_epi32( selector1, c_int32_5p1 );
-
-			// c[5,32-47]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p2 = _mm512_add_epi32( selector1, c_int32_5p2 );
-
-			// c[5,48-63]
-			selector1 = _mm512_loadu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ) );
-			selector1 = _mm512_mullo_epi32( selector2, selector1 );
-			c_int32_5p3 = _mm512_add_epi32( selector1, c_int32_5p3 );
+				// c[5:0-15,16-31,32-47,48-63]
+				S32_S32_BETA_OP4(ir,5,selector1,selector2);
+			}
 		}
 
 		// Post Ops
@@ -558,17 +496,17 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 POST_OPS_BIAS_6x64:
 		{
 			selector1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 0 * 16 ) );
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 0 * 16 ) );
 			selector2 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 1 * 16 ) );
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 1 * 16 ) );
 			a_int32_0 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 2 * 16 ) );
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 2 * 16 ) );
 			a_int32_1 =
-				_mm512_loadu_epi32( ( int32_t* )post_ops_list_temp->op_args1 +
-								post_op_c_j + ( 3 * 16 ) );
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+						post_ops_attr.post_op_c_j + ( 3 * 16 ) );
 
 			// c[0,0-15]
 			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
@@ -804,173 +742,410 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
+POST_OPS_GELU_TANH_6x64:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+
+			// Pack the output registers into an array and apply gelu
+			// on the array in a loop. Helps avoid a lot of instruction
+			// duplication and thus potentially bad code gen.
+			int32_t temp_buf[384] __attribute__((aligned(64)));
+			dim_t temp_buf_4elem_len = 384 / 16;
+
+			S32_GELU_LOAD1R_4C(temp_buf,0,16,c_int32_0)
+			S32_GELU_LOAD1R_4C(temp_buf,4,16,c_int32_1)
+			S32_GELU_LOAD1R_4C(temp_buf,8,16,c_int32_2)
+			S32_GELU_LOAD1R_4C(temp_buf,12,16,c_int32_3)
+			S32_GELU_LOAD1R_4C(temp_buf,16,16,c_int32_4)
+			S32_GELU_LOAD1R_4C(temp_buf,20,16,c_int32_5)
+
+			for ( dim_t gelu_id = 0; gelu_id < temp_buf_4elem_len; ++gelu_id )
+			{
+				c_int32_0p0 = _mm512_loadu_si512( temp_buf + ( gelu_id * 16 ) );
+
+				GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, selector1)
+
+				_mm512_storeu_si512( temp_buf + ( gelu_id * 16 ), c_int32_0p0 );
+			}
+
+			S32_GELU_STORE1R_4C(temp_buf,0,16,c_int32_0)
+			S32_GELU_STORE1R_4C(temp_buf,4,16,c_int32_1)
+			S32_GELU_STORE1R_4C(temp_buf,8,16,c_int32_2)
+			S32_GELU_STORE1R_4C(temp_buf,12,16,c_int32_3)
+			S32_GELU_STORE1R_4C(temp_buf,16,16,c_int32_4)
+			S32_GELU_STORE1R_4C(temp_buf,20,16,c_int32_5)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x64:
+		{
+			__m512 x, r, y, x_erf;
+
+			// Pack the output registers into an array and apply gelu
+			// on the array in a loop. Helps avoid a lot of instruction
+			// duplication and thus potentially bad code gen.
+			int32_t temp_buf[384] __attribute__((aligned(64)));
+			dim_t temp_buf_4elem_len = 384 / 16;
+			S32_GELU_LOAD1R_4C(temp_buf,0,16,c_int32_0)
+			S32_GELU_LOAD1R_4C(temp_buf,4,16,c_int32_1)
+			S32_GELU_LOAD1R_4C(temp_buf,8,16,c_int32_2)
+			S32_GELU_LOAD1R_4C(temp_buf,12,16,c_int32_3)
+			S32_GELU_LOAD1R_4C(temp_buf,16,16,c_int32_4)
+			S32_GELU_LOAD1R_4C(temp_buf,20,16,c_int32_5)
+
+			for ( dim_t gelu_id = 0; gelu_id < temp_buf_4elem_len; ++gelu_id )
+			{
+				c_int32_0p0 = _mm512_loadu_si512( temp_buf + ( gelu_id * 16 ) );
+
+				GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+				_mm512_storeu_si512( temp_buf + ( gelu_id * 16 ), c_int32_0p0 );
+			}
+
+			S32_GELU_STORE1R_4C(temp_buf,0,16,c_int32_0)
+			S32_GELU_STORE1R_4C(temp_buf,4,16,c_int32_1)
+			S32_GELU_STORE1R_4C(temp_buf,8,16,c_int32_2)
+			S32_GELU_STORE1R_4C(temp_buf,12,16,c_int32_3)
+			S32_GELU_STORE1R_4C(temp_buf,16,16,c_int32_4)
+			S32_GELU_STORE1R_4C(temp_buf,20,16,c_int32_5)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x64:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[0, 32-47]
+			CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+			// c[0, 48-63]
+			CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[1, 32-47]
+			CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+			// c[1, 48-63]
+			CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[2, 32-47]
+			CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+			// c[2, 48-63]
+			CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[3, 32-47]
+			CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+			// c[3, 48-63]
+			CLIP_S32_AVX512(c_int32_3p3, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[4, 32-47]
+			CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+			// c[4, 48-63]
+			CLIP_S32_AVX512(c_int32_4p3, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			// c[5, 32-47]
+			CLIP_S32_AVX512(c_int32_5p2, min, max)
+
+			// c[5, 48-63]
+			CLIP_S32_AVX512(c_int32_5p3, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
 POST_OPS_DOWNSCALE_6x64:
 		{
 			selector1 =
-				_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-								post_op_c_j + ( 0 * 16 ) );
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 0 * 16 ) );
 			selector2 =
-				_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-								post_op_c_j + ( 1 * 16 ) );
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 1 * 16 ) );
 			a_int32_0 =
-				_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-								post_op_c_j + ( 2 * 16 ) );
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 2 * 16 ) );
 			a_int32_1 =
-				_mm512_loadu_epi32( ( float* )post_ops_list_temp->scale_factor +
-								post_op_c_j + ( 3 * 16 ) );
+				_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+						post_ops_attr.post_op_c_j + ( 3 * 16 ) );
 
 			// c[0, 0-15]
-			CVT_MULRND_CVT32_CVT8(c_int32_0p0,selector1,0,0);
+			CVT_MULRND_CVT32(c_int32_0p0,selector1);
 
 			// c[0, 16-31]
-			CVT_MULRND_CVT32_CVT8(c_int32_0p1,selector2,0,1);
+			CVT_MULRND_CVT32(c_int32_0p1,selector2);
 
 			// c[0, 32-47]
-			CVT_MULRND_CVT32_CVT8(c_int32_0p2,a_int32_0,0,2);
+			CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
 
 			// c[0, 48-63]
-			CVT_MULRND_CVT32_CVT8(c_int32_0p3,a_int32_1,0,3);
+			CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
 
 			// c[1, 0-15]
-			CVT_MULRND_CVT32_CVT8(c_int32_1p0,selector1,1,0);
+			CVT_MULRND_CVT32(c_int32_1p0,selector1);
 
 			// c[1, 16-31]
-			CVT_MULRND_CVT32_CVT8(c_int32_1p1,selector2,1,1);
+			CVT_MULRND_CVT32(c_int32_1p1,selector2);
 
 			// c[1, 32-47]
-			CVT_MULRND_CVT32_CVT8(c_int32_1p2,a_int32_0,1,2);
+			CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
 
 			// c[1, 48-63]
-			CVT_MULRND_CVT32_CVT8(c_int32_1p3,a_int32_1,1,3);
+			CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
 
 			// c[2, 0-15]
-			CVT_MULRND_CVT32_CVT8(c_int32_2p0,selector1,2,0);
+			CVT_MULRND_CVT32(c_int32_2p0,selector1);
 
 			// c[2, 16-31]
-			CVT_MULRND_CVT32_CVT8(c_int32_2p1,selector2,2,1);
+			CVT_MULRND_CVT32(c_int32_2p1,selector2);
 
 			// c[2, 32-47]
-			CVT_MULRND_CVT32_CVT8(c_int32_2p2,a_int32_0,2,2);
+			CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
 
 			// c[2, 48-63]
-			CVT_MULRND_CVT32_CVT8(c_int32_2p3,a_int32_1,2,3);
+			CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
 
 			// c[3, 0-15]
-			CVT_MULRND_CVT32_CVT8(c_int32_3p0,selector1,3,0);
+			CVT_MULRND_CVT32(c_int32_3p0,selector1);
 
 			// c[3, 16-31]
-			CVT_MULRND_CVT32_CVT8(c_int32_3p1,selector2,3,1);
+			CVT_MULRND_CVT32(c_int32_3p1,selector2);
 
 			// c[3, 32-47]
-			CVT_MULRND_CVT32_CVT8(c_int32_3p2,a_int32_0,3,2);
+			CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
 
 			// c[3, 48-63]
-			CVT_MULRND_CVT32_CVT8(c_int32_3p3,a_int32_1,3,3);
+			CVT_MULRND_CVT32(c_int32_3p3,a_int32_1);
 
 			// c[4, 0-15]
-			CVT_MULRND_CVT32_CVT8(c_int32_4p0,selector1,4,0);
+			CVT_MULRND_CVT32(c_int32_4p0,selector1);
 
 			// c[4, 16-31]
-			CVT_MULRND_CVT32_CVT8(c_int32_4p1,selector2,4,1);
+			CVT_MULRND_CVT32(c_int32_4p1,selector2);
 
 			// c[4, 32-47]
-			CVT_MULRND_CVT32_CVT8(c_int32_4p2,a_int32_0,4,2);
+			CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
 
 			// c[4, 48-63]
-			CVT_MULRND_CVT32_CVT8(c_int32_4p3,a_int32_1,4,3);
+			CVT_MULRND_CVT32(c_int32_4p3,a_int32_1);
 
 			// c[5, 0-15]
-			CVT_MULRND_CVT32_CVT8(c_int32_5p0,selector1,5,0);
+			CVT_MULRND_CVT32(c_int32_5p0,selector1);
 
 			// c[5, 16-31]
-			CVT_MULRND_CVT32_CVT8(c_int32_5p1,selector2,5,1);
+			CVT_MULRND_CVT32(c_int32_5p1,selector2);
 
 			// c[5, 32-47]
-			CVT_MULRND_CVT32_CVT8(c_int32_5p2,a_int32_0,5,2);
+			CVT_MULRND_CVT32(c_int32_5p2,a_int32_0);
 
 			// c[5, 48-63]
-			CVT_MULRND_CVT32_CVT8(c_int32_5p3,a_int32_1,5,3);
+			CVT_MULRND_CVT32(c_int32_5p3,a_int32_1);
 
 			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
 		}
 POST_OPS_6x64_DISABLE:
 		;
 
-		// Store the results.
-		// c[0,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+		// Case where the output C matrix is s8 (downscaled) and this is the
+		// final write for a given block within C.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
 
-		// c[0, 16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
 
-		// c[0,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 );
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
 
-		// c[0,48-63]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 );
+			// c[0,32-47]
+			CVT_STORE_S32_S8(c_int32_0p2,0,2);
 
-		// c[1,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+			// c[0,48-63]
+			CVT_STORE_S32_S8(c_int32_0p3,0,3);
 
-		// c[1,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
 
-		// c[1,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 );
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
 
-		// c[1,48-63]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 );
+			// c[1,32-47]
+			CVT_STORE_S32_S8(c_int32_1p2,1,2);
 
-		// c[2,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+			// c[1,48-63]
+			CVT_STORE_S32_S8(c_int32_1p3,1,3);
 
-		// c[2,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
 
-		// c[2,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 );
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
 
-		// c[2,48-63]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_int32_2p3 );
+			// c[2,32-47]
+			CVT_STORE_S32_S8(c_int32_2p2,2,2);
 
-		// c[3,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+			// c[2,48-63]
+			CVT_STORE_S32_S8(c_int32_2p3,2,3);
 
-		// c[3,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
 
-		// c[3,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 );
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
 
-		// c[3,48-63]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 );
+			// c[3,32-47]
+			CVT_STORE_S32_S8(c_int32_3p2,3,2);
 
-		// c[4,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+			// c[3,48-63]
+			CVT_STORE_S32_S8(c_int32_3p3,3,3);
 
-		// c[4,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
 
-		// c[4,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 );
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
 
-		// c[4,48-63]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 );
+			// c[4,32-47]
+			CVT_STORE_S32_S8(c_int32_4p2,4,2);
 
-		// c[5,0-15]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+			// c[4,48-63]
+			CVT_STORE_S32_S8(c_int32_4p3,4,3);
 
-		// c[5,16-31]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+
+			// c[5,32-47]
+			CVT_STORE_S32_S8(c_int32_5p2,5,2);
+
+			// c[5,48-63]
+			CVT_STORE_S32_S8(c_int32_5p3,5,3);
+		}
+		// Case where the output C matrix is s32 or is the temp buffer used to
+		// store intermediate s32 accumulated values for downscaled (C-s8) api.
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[0,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 );
+
+			// c[0,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 3*16 ), c_int32_0p3 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[1,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 );
+
+			// c[1,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 3*16 ), c_int32_1p3 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
 
-		// c[5,32-47]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 );
+			// c[2,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 );
+
+			// c[2,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 3*16 ), c_int32_2p3 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[3,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 );
+
+			// c[3,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 3*16 ), c_int32_3p3 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[4,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 );
+
+			// c[4,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 3*16 ), c_int32_4p3 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+
+			// c[5,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 );
+
+			// c[5,48-63]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_int32_5p3 );
+		}
 
-		// c[5,48-63]
-		_mm512_storeu_epi32( c + ( rs_c * ( ir + 5 ) ) + ( 3*16 ), c_int32_5p3 );
-		
 		a = a + ( MR * ps_a );
-		post_op_c_i += MR;
+		post_ops_attr.post_op_c_i += MR;
 	}
 
 	if ( m_partial_pieces > 0 )
@@ -991,9 +1166,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			  b, rs_b, cs_b,
 			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 		}
 		else if ( m_partial_pieces == 4 )
@@ -1006,9 +1179,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			  b, rs_b, cs_b,
 			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 		}
 		else if ( m_partial_pieces == 3 )
@@ -1021,9 +1192,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			  b, rs_b, cs_b,
 			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 		}
 		else if ( m_partial_pieces == 2 )
@@ -1036,9 +1205,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			  b, rs_b, cs_b,
 			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 		}
 		else if ( m_partial_pieces == 1 )
@@ -1051,9 +1218,7 @@ LPGEMM_MAIN_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x64)
 			  b, rs_b, cs_b,
 			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
 			  alpha, beta,
-			  is_last_k,
-			  post_op_c_i, post_op_c_j,
-			  post_ops_list, rs_c_downscale
+			  post_ops_list, post_ops_attr
 			);
 		}
 	}
diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c
new file mode 100644
index 0000000000..bcaa2d81c3
--- /dev/null
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_m_fringe_amd512vnni.c
@@ -0,0 +1,3084 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_s32_kern_macros.h"
+#include "lpgemm_s32_memcpy_macros.h"
+
+// 5x64 int8o32 kernel: 5 rows x 64 columns of int32 accumulators (uint8 a times int8 b via VNNI dpbusd).
+LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x64)
+{
+	static void* post_ops_labels[] = // Label addresses; presumably consumed by the POST_OP_LABEL_* macros (defined elsewhere) - confirm.
+						{
+						  &&POST_OPS_5x64_DISABLE,
+						  &&POST_OPS_BIAS_5x64,
+						  &&POST_OPS_RELU_5x64,
+						  &&POST_OPS_RELU_SCALE_5x64,
+						  &&POST_OPS_GELU_TANH_5x64,
+						  &&POST_OPS_GELU_ERF_5x64,
+						  &&POST_OPS_CLIP_5x64,
+						  &&POST_OPS_DOWNSCALE_5x64
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k-values (one dpbusd step each).
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-values (0-3), handled after the main loop.
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage (two registers, alternated between consecutive rows).
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	
+	// Registers to use for accumulating C: c_int32_<row>p<n> holds columns 16*n to 16*n+15 of that row.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+	__m512i c_int32_2p3 = _mm512_setzero_epi32();
+	
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+	__m512i c_int32_3p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+	__m512i c_int32_4p1 = _mm512_setzero_epi32();
+	__m512i c_int32_4p2 = _mm512_setzero_epi32();
+	__m512i c_int32_4p3 = _mm512_setzero_epi32();
+	// Main k loop: each iteration consumes 4 k-values for all 5 rows.
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-63] = a[4,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+		c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
+	}
+	// Handle k remainder (the k0 % 4 leftover values; a is zero-filled so the padding adds nothing).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+		// load_mask has only the low k_partial_pieces bits set, so only that many bytes of a are read.
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked-load the leftover bytes of a row 0 (rest zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with the zero-padded k remainder.
+		// c[0,0-63] += a[0,k remainder]*b[k remainder,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Masked-load the leftover bytes of a row 1 (rest zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with the zero-padded k remainder.
+		// c[1,0-63] += a[1,k remainder]*b[k remainder,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Masked-load the leftover bytes of a row 2 (rest zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with the zero-padded k remainder.
+		// c[2,0-63] += a[2,k remainder]*b[k remainder,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Masked-load the leftover bytes of a row 3 (rest zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with the zero-padded k remainder.
+		// c[3,0-63] += a[3,k remainder]*b[k remainder,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+
+		// Masked-load the leftover bytes of a row 4 (rest zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with the zero-padded k remainder.
+		// c[4,0-63] += a[4,k remainder]*b[k remainder,0-63]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+		c_int32_4p3 = _mm512_dpbusd_epi32( c_int32_4p3, a_int32_0, b3 );
+	}
+
+	// Load alpha and beta (each broadcast to all 16 int32 lanes).
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+		c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+		c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+		c_int32_4p3 = _mm512_mullo_epi32( selector1, c_int32_4p3 );
+	}
+
+	// Scale C by beta (skipped entirely when beta is 0; the *_BETA_OP4 macros are defined elsewhere).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{ // Downscale buffer set and first k block: S8 variant (presumably reads int8 C - confirm against macro).
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,4,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,4,selector1,selector2);
+		}
+	}
+
+	// Post Ops: dispatch is done by the POST_OP_LABEL_* macros (defined elsewhere), presumably via post_ops_labels.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x64:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+		// The four registers above (reused as scratch) hold 64 consecutive int32 bias values from post_op_c_j on.
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		// c[4, 16-31]
+		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+		// c[4,48-63]
+		c_int32_4p3 = _mm512_add_epi32( a_int32_1, c_int32_4p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+		// ReLU: take the lane-wise max with zero for every accumulator.
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		// c[4,16-31]
+		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+		// c[4,48-63]
+		c_int32_4p3 = _mm512_max_epi32( selector1, c_int32_4p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+		// selector1/selector2/relu_cmp_mask are not passed below; presumably the macro uses them by name - confirm.
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		// c[3, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+		// c[4, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+		// c[4, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x64:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+		// The temporaries above are scratch handed to the macro for each 16-lane accumulator.
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_3p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_4p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x64:
+	{
+		__m512 x, r, y, x_erf;
+		// The temporaries above are scratch handed to the macro for each 16-lane accumulator.
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		// c[3, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_3p3, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+		// c[4, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+		// c[4, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_4p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x64:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+		// min is read from op_args2 and max from op_args3, each broadcast to all lanes.
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		// c[3, 48-63]
+		CLIP_S32_AVX512(c_int32_3p3, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+		// c[4, 32-47]
+		CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+		// c[4, 48-63]
+		CLIP_S32_AVX512(c_int32_4p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x64:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+		// Scale factors are addressed through float* but held in __m512i registers: 64 values from post_op_c_j on.
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[2, 48-63]
+		CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[3, 48-63]
+		CVT_MULRND_CVT32(c_int32_3p3,a_int32_1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[4, 32-47]
+		CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+		// c[4, 48-63]
+		CVT_MULRND_CVT32(c_int32_4p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x64_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+		// mask_all1 is not referenced by name below; presumably CVT_STORE_S32_S8 uses it - confirm.
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[2,48-63]
+		CVT_STORE_S32_S8(c_int32_2p3,2,3);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+		// c[3,48-63]
+		CVT_STORE_S32_S8(c_int32_3p3,3,3);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+		// c[4,16-31]
+		CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+		// c[4,32-47]
+		CVT_STORE_S32_S8(c_int32_4p2,4,2);
+
+		// c[4,48-63]
+		CVT_STORE_S32_S8(c_int32_4p3,4,3);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+
+		// c[3,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
+
+		// c[4,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 );
+
+		// c[4,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 3*16 ), c_int32_4p3 );
+	}
+}
+
+// 4x64 int8o32 kernel
+LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x64_DISABLE,
+						  &&POST_OPS_BIAS_4x64,
+						  &&POST_OPS_RELU_4x64,
+						  &&POST_OPS_RELU_SCALE_4x64,
+						  &&POST_OPS_GELU_TANH_4x64,
+						  &&POST_OPS_GELU_ERF_4x64,
+						  &&POST_OPS_CLIP_4x64,
+						  &&POST_OPS_DOWNSCALE_4x64
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+	__m512i c_int32_2p3 = _mm512_setzero_epi32();
+	
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+	__m512i c_int32_3p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-63] = a[3,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_1, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_1, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_1, b2 );
+		c_int32_3p3 = _mm512_dpbusd_epi32( c_int32_3p3, a_int32_1, b3 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+		c_int32_3p3 = _mm512_mullo_epi32( selector1, c_int32_3p3 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,3,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,3,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x64:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_add_epi32( a_int32_1, c_int32_3p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		// c[3,48-63]
+		c_int32_3p3 = _mm512_max_epi32( selector1, c_int32_3p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		// c[3, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x64:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_3p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x64:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		// c[3, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_3p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x64:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		// c[3, 48-63]
+		CLIP_S32_AVX512(c_int32_3p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x64:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[2, 48-63]
+		CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[3, 48-63]
+		CVT_MULRND_CVT32(c_int32_3p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x64_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[2,48-63]
+		CVT_STORE_S32_S8(c_int32_2p3,2,3);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+		// c[3,48-63]
+		CVT_STORE_S32_S8(c_int32_3p3,3,3);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+
+		// c[3,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 3*16 ), c_int32_3p3 );
+	}
+}
+
+// 3x64 u8s8s32o32 m-fringe kernel: uint8 A x int8 B accumulated in int32 (3 rows x 64 columns of C).
+LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x64)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x64_DISABLE,
+						  &&POST_OPS_BIAS_3x64,
+						  &&POST_OPS_RELU_3x64,
+						  &&POST_OPS_RELU_SCALE_3x64,
+						  &&POST_OPS_GELU_TANH_3x64,
+						  &&POST_OPS_GELU_ERF_3x64,
+						  &&POST_OPS_CLIP_3x64,
+						  &&POST_OPS_DOWNSCALE_3x64
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+	__m512i c_int32_2p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a *  0 ) + ( cs_a * kr ) ) );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] = a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] = a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-63] = a[2,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+		c_int32_2p3 = _mm512_dpbusd_epi32( c_int32_2p3, a_int32_0, b3 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+		c_int32_2p3 = _mm512_mullo_epi32( selector1, c_int32_2p3 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,2,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,2,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x64:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_add_epi32( a_int32_1, c_int32_2p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[2,48-63]
+		c_int32_2p3 = _mm512_max_epi32( selector1, c_int32_2p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x64:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[2, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x64:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_2p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x64:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[2, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_2p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x64:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[2, 48-63]
+		CLIP_S32_AVX512(c_int32_2p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x64:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[2, 48-63]
+		CVT_MULRND_CVT32(c_int32_2p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x64_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[2,48-63]
+		CVT_STORE_S32_S8(c_int32_2p3,2,3);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[2,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 3*16 ), c_int32_2p3 );
+	}
+}
+
+// 2x64 u8s8s32o32 m-fringe kernel: accumulates a 2-row x 64-column int32 tile of C from uint8 A and int8 B with _mm512_dpbusd_epi32, then applies alpha/beta scaling and the post-op chain before storing.
+LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x64)
+{
+	static void* post_ops_labels[] = // Label addresses; NOTE(review): presumably indexed by the POST_OP_LABEL_* jump macros defined elsewhere - confirm.
+						{
+						  &&POST_OPS_2x64_DISABLE,
+						  &&POST_OPS_BIAS_2x64,
+						  &&POST_OPS_RELU_2x64,
+						  &&POST_OPS_RELU_SCALE_2x64,
+						  &&POST_OPS_GELU_TANH_2x64,
+						  &&POST_OPS_GELU_ERF_2x64,
+						  &&POST_OPS_CLIP_2x64,
+						  &&POST_OPS_DOWNSCALE_2x64
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k elements (one dpbusd step each).
+	dim_t k_partial_pieces = k0 % 4; // Leftover k elements (0-3), handled after the main loop.
+
+	// B matrix storage: one register per 16-column group of the 64-column tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage: one broadcast register per row; both are reused as scratch by the bias and downscale post-ops.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+	// int32 accumulators for C, named c_int32_<row>p<16-column group>.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+	__m512i c_int32_1p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast the 4 bytes a[0,kr:kr+4] to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] += a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast the 4 bytes a[1,kr:kr+4] to every 32-bit lane.
+		a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-63] += a[1,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+	}
+	// Handle k remainder (the k0 % 4 leftover elements).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked-load the leftover bytes of row 0 of A (other bytes zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul over the k remainder; the zeroed A bytes contribute nothing.
+		// c[0,0-63] += a[0,k remainder]*b[k remainder,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Masked-load the leftover bytes of row 1 of A (other bytes zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+
+		// Perform column direction mat-mul over the k remainder; the zeroed A bytes contribute nothing.
+		// c[1,0-63] += a[1,k remainder]*b[k remainder,0-63]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_1, b2 );
+		c_int32_1p3 = _mm512_dpbusd_epi32( c_int32_1p3, a_int32_1, b3 );
+	}
+
+	// Broadcast alpha and beta into every int32 lane.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+		c_int32_1p3 = _mm512_mullo_epi32( selector1, c_int32_1p3 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). NOTE(review): the *_BETA_OP4 macros are defined elsewhere; presumably the S8 variant reads C from the int8 downscale buffer and the S32 variant reads int32 C - confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,1,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,1,selector1,selector2);
+		}
+	}
+
+	// Post Ops: start at the head of post_ops_list; each labelled section below ends with POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR (presumably a jump to the next op's label - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x64: // Add the int32 bias values (op_args1, offset by post_op_c_j) to both rows.
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_add_epi32( a_int32_1, c_int32_1p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x64: // ReLU: replace every negative element with zero (max against a zero vector).
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[1,48-63]
+		c_int32_1p3 = _mm512_max_epi32( selector1, c_int32_1p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x64: // Scaled ReLU: zero goes in selector1 and the int32 scale (op_args2) in selector2; NOTE(review): these and relu_cmp_mask are presumably used by name inside RELU_SCALE_OP_S32_AVX512 - confirm.
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[1, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x64: // GELU_TANH post-op, applied per accumulator through GELU_TANH_S32_AVX512; the locals below are scratch handed to the macro.
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_1p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x64: // GELU_ERF post-op, applied per accumulator through GELU_ERF_S32_AVX512; the locals below are scratch handed to the macro.
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[1, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_1p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x64: // Clip post-op: min (op_args2) and max (op_args3) are broadcast and passed to CLIP_S32_AVX512 (presumably clamps each element to [min, max] - confirm).
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[1, 48-63]
+		CLIP_S32_AVX512(c_int32_1p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x64: // Downscale post-op: float scale factors (offset by post_op_c_j) are loaded as raw bits into integer registers and handed to CVT_MULRND_CVT32.
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[1, 48-63]
+		CVT_MULRND_CVT32(c_int32_1p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x64_DISABLE: // First entry of post_ops_labels; control also falls through to here from the downscale section.
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane). NOTE(review): mask_all1 is presumably used by name inside CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[1,48-63]
+		CVT_STORE_S32_S8(c_int32_1p3,1,3);
+	}
+	else
+	{
+		// Store the int32 results directly into C.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[1,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 3*16 ), c_int32_1p3 );
+	}
+}
+
+// 1x64 u8s8s32o32 m-fringe kernel: accumulates a 1-row x 64-column int32 tile of C from uint8 A and int8 B with _mm512_dpbusd_epi32, then applies alpha/beta scaling and the post-op chain before storing.
+LPGEMM_M_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x64)
+{
+	static void* post_ops_labels[] = // Label addresses; NOTE(review): presumably indexed by the POST_OP_LABEL_* jump macros defined elsewhere - confirm.
+						{
+						  &&POST_OPS_1x64_DISABLE,
+						  &&POST_OPS_BIAS_1x64,
+						  &&POST_OPS_RELU_1x64,
+						  &&POST_OPS_RELU_SCALE_1x64,
+						  &&POST_OPS_GELU_TANH_1x64,
+						  &&POST_OPS_GELU_ERF_1x64,
+						  &&POST_OPS_CLIP_1x64,
+						  &&POST_OPS_DOWNSCALE_1x64
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k elements (one dpbusd step each).
+	dim_t k_partial_pieces = k0 % 4; // Leftover k elements (0-3), handled after the main loop.
+
+	// B matrix storage: one register per 16-column group of the 64-column tile.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+	__m512i b3;
+
+	// A matrix storage: a_int32_0 holds the broadcast row of A; a_int32_1 is only used as scratch by the bias and downscale post-ops.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+
+	// int32 accumulators for C, named c_int32_<row>p<16-column group>.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+	__m512i c_int32_0p3 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast the 4 bytes a[0,kr:kr+4] to every 32-bit lane.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-63] += a[0,kr:kr+4]*b[kr:kr+4,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+	}
+	// Handle k remainder (the k0 % 4 leftover elements).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked-load the leftover bytes of row 0 of A (other bytes zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+		b3 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 3 ) );
+
+		// Perform column direction mat-mul over the k remainder; the zeroed A bytes contribute nothing.
+		// c[0,0-63] += a[0,k remainder]*b[k remainder,0-63]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+		c_int32_0p3 = _mm512_dpbusd_epi32( c_int32_0p3, a_int32_0, b3 );
+	}
+	
+	// Broadcast alpha and beta into every int32 lane.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+		c_int32_0p3 = _mm512_mullo_epi32( selector1, c_int32_0p3 );
+	}
+	
+	// Scale C by beta (skipped when beta == 0). NOTE(review): the *_BETA_OP4 macros are defined elsewhere; presumably the S8 variant reads C from the int8 downscale buffer and the S32 variant reads int32 C - confirm.
+	if ( beta != 0)
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S8_S32_BETA_OP4(0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47,48-63]
+			S32_S32_BETA_OP4(0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: start at the head of post_ops_list; each labelled section below ends with POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR (presumably a jump to the next op's label - confirm).
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x64: // Add the int32 bias values (op_args1, offset by post_op_c_j) to the row.
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_add_epi32( a_int32_1, c_int32_0p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x64: // ReLU: replace every negative element with zero (max against a zero vector).
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[0,48-63]
+		c_int32_0p3 = _mm512_max_epi32( selector1, c_int32_0p3 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x64: // Scaled ReLU: zero goes in selector1 and the int32 scale (op_args2) in selector2; NOTE(review): these and relu_cmp_mask are presumably used by name inside RELU_SCALE_OP_S32_AVX512 - confirm.
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[0, 48-63]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p3)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x64: // GELU_TANH post-op, applied per accumulator through GELU_TANH_S32_AVX512; the locals below are scratch handed to the macro.
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 48-63]
+		GELU_TANH_S32_AVX512(c_int32_0p3, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x64: // GELU_ERF post-op, applied per accumulator through GELU_ERF_S32_AVX512; the locals below are scratch handed to the macro.
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[0, 48-63]
+		GELU_ERF_S32_AVX512(c_int32_0p3, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x64: // Clip post-op: min (op_args2) and max (op_args3) are broadcast and passed to CLIP_S32_AVX512 (presumably clamps each element to [min, max] - confirm).
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[0, 48-63]
+		CLIP_S32_AVX512(c_int32_0p3, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x64: // Downscale post-op: float scale factors (offset by post_op_c_j) are loaded as raw bits into integer registers and handed to CVT_MULRND_CVT32.
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+		a_int32_1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 3 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[0, 48-63]
+		CVT_MULRND_CVT32(c_int32_0p3,a_int32_1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x64_DISABLE: // First entry of post_ops_labels; control also falls through to here from the downscale section.
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane). NOTE(review): mask_all1 is presumably used by name inside CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[0,48-63]
+		CVT_STORE_S32_S8(c_int32_0p3,0,3);
+	}
+	else
+	{
+		// Store the int32 results directly into C.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[0,48-63]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 3*16 ), c_int32_0p3 );
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c
new file mode 100644
index 0000000000..940d9e92fa
--- /dev/null
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_mn_fringe_amd512vnni.c
@@ -0,0 +1,7518 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_s32_kern_macros.h"
+#include "lpgemm_s32_memcpy_macros.h"
+
+// 5xlt16 int8o32 fringe kernel
+LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5xLT16_DISABLE,
+						  &&POST_OPS_BIAS_5xLT16,
+						  &&POST_OPS_RELU_5xLT16,
+						  &&POST_OPS_RELU_SCALE_5xLT16,
+						  &&POST_OPS_GELU_TANH_5xLT16,
+						  &&POST_OPS_GELU_ERF_5xLT16,
+						  &&POST_OPS_CLIP_5xLT16,
+						  &&POST_OPS_DOWNSCALE_5xLT16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_4p0, 4, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, 0, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, 0, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_4p0, 0, 4, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_5xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_5xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_5xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_5xLT16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_5xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_5xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			// c[4, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_5xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 2 ), load_mask, c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 3 ), load_mask, c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 4 ), load_mask, c_int32_4p0 );
+		}
+	}
+}
+
+// 4xlt16 int8o32 fringe kernel: 4 rows of C (uint8 a times int8 b, int32 accumulation), columns limited to the low n0_rem lanes by 16-bit masks.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4xlt16)
+{
+	static void* post_ops_labels[] = // label addresses (GNU labels-as-values); presumably indexed by the POST_OP_LABEL_* jump macros - confirm
+						{
+						  &&POST_OPS_4xLT16_DISABLE,
+						  &&POST_OPS_BIAS_4xLT16,
+						  &&POST_OPS_RELU_4xLT16,
+						  &&POST_OPS_RELU_SCALE_4xLT16,
+						  &&POST_OPS_GELU_TANH_4xLT16,
+						  &&POST_OPS_GELU_ERF_4xLT16,
+						  &&POST_OPS_CLIP_4xLT16,
+						  &&POST_OPS_DOWNSCALE_4xLT16
+						};
+	dim_t k_full_pieces = k0 / 4; // number of complete 4-deep k groups
+	dim_t k_partial_pieces = k0 % 4; // k elements left over after the complete groups
+
+	{
+		// Registers to use for accumulating C, one 16-lane int32 vector per row.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // full 64-byte load of b for k group kr
+
+			// Broadcast a[0,kr:kr+4]: four consecutive bytes read as one 32-bit word and replicated to every lane.
+			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] += a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] += a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] += a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] += a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		}
+		// Handle k remainder: one extra step using only the k_partial_pieces leftover bytes of each a row.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set (byte mask)
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // unmasked 64-byte load of b for the remainder group
+
+			// Broadcast the k remainder of a row 0; bytes beyond k_partial_pieces are zeroed by the masked load.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast the k remainder of a row 1 (zero-padded as above).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[1,0-15] += a[1,k remainder]*b[k remainder,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast the k remainder of a row 2 (zero-padded as above).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[2,0-15] += a[2,k remainder]*b[k remainder,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+			// Broadcast the k remainder of a row 3 (zero-padded as above).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[3,0-15] += a[3,k remainder]*b[k remainder,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		}
+
+		// Broadcast alpha and beta to all 16 int32 lanes; both registers are reused as scratch by the post-ops below.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha (low 32 bits of each product are kept).
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		}
+
+		// Scale C by beta; the whole step is skipped when beta == 0. The *_BETA_OP_* macros are defined elsewhere.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, 0, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, 0, 3, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops: each labelled section below handles one post-op; the POST_OP_LABEL_* jump macros (defined elsewhere) move between sections.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+			selector1 = _mm512_maskz_loadu_epi32 // int32 bias values for the valid columns, starting at post_op_c_j; other lanes zero
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_4xLT16:
+		{
+			selector1 = _mm512_setzero_epi32(); // zero vector, so the max below computes max(0, c)
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_4xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) ); // single int32 scale read from op_args2, broadcast to all lanes
+
+			__mmask16 relu_cmp_mask; // not referenced by name here; presumably written by RELU_SCALE_OP_S32_AVX512 - confirm
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_4xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh; // scratch registers handed to GELU_TANH_S32_AVX512
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_4xLT16:
+		{
+			__m512 x, r, y, x_erf; // scratch registers handed to GELU_ERF_S32_AVX512
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_4xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // int32 read from op_args2, broadcast
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // int32 read from op_args3, broadcast
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_4xLT16:
+		{
+			// Load the float scale factors as raw 32-bit lanes (bit pattern unchanged); only the n0_rem valid entries are read.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_4xLT16_DISABLE:
+		;
+
+		// Store the results: downscaled path when a downscale buffer is set and this is the last k iteration, otherwise masked int32 stores to c.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // despite the name, only the low n0_rem bits are set; presumably read inside CVT_STORE_S32_S8 - confirm
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // used as the store mask below
+
+			// Masked int32 stores: only the low n0_rem lanes of each row are written.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 2 ), load_mask, c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 3 ), load_mask, c_int32_3p0 );
+		}
+	}
+}
+
+// 3xlt16 int8o32 fringe kernel: 3 rows of C (uint8 a times int8 b, int32 accumulation), columns limited to the low n0_rem lanes by 16-bit masks.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3xlt16)
+{
+	static void* post_ops_labels[] = // label addresses (GNU labels-as-values); presumably indexed by the POST_OP_LABEL_* jump macros - confirm
+						{
+						  &&POST_OPS_3xLT16_DISABLE,
+						  &&POST_OPS_BIAS_3xLT16,
+						  &&POST_OPS_RELU_3xLT16,
+						  &&POST_OPS_RELU_SCALE_3xLT16,
+						  &&POST_OPS_GELU_TANH_3xLT16,
+						  &&POST_OPS_GELU_ERF_3xLT16,
+						  &&POST_OPS_CLIP_3xLT16,
+						  &&POST_OPS_DOWNSCALE_3xLT16
+						};
+	dim_t k_full_pieces = k0 / 4; // number of complete 4-deep k groups
+	dim_t k_partial_pieces = k0 % 4; // k elements left over after the complete groups
+
+	{
+		// Registers to use for accumulating C, one 16-lane int32 vector per row.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // full 64-byte load of b for k group kr
+
+			// Broadcast a[0,kr:kr+4]: four consecutive bytes read as one 32-bit word and replicated to every lane.
+			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] += a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] += a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] += a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		}
+		// Handle k remainder: one extra step using only the k_partial_pieces leftover bytes of each a row.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set (byte mask)
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // unmasked 64-byte load of b for the remainder group
+
+			// Broadcast the k remainder of a row 0; bytes beyond k_partial_pieces are zeroed by the masked load.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast the k remainder of a row 1 (zero-padded as above).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[1,0-15] += a[1,k remainder]*b[k remainder,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+			// Broadcast the k remainder of a row 2 (zero-padded as above).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[2,0-15] += a[2,k remainder]*b[k remainder,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		}
+
+		// Broadcast alpha and beta to all 16 int32 lanes; both registers are reused as scratch by the post-ops below.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha (low 32 bits of each product are kept).
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		}
+
+		// Scale C by beta; the whole step is skipped when beta == 0. The *_BETA_OP_* macros are defined elsewhere.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, 0, 2, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops: each labelled section below handles one post-op; the POST_OP_LABEL_* jump macros (defined elsewhere) move between sections.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+			selector1 = _mm512_maskz_loadu_epi32 // int32 bias values for the valid columns, starting at post_op_c_j; other lanes zero
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_3xLT16:
+		{
+			selector1 = _mm512_setzero_epi32(); // zero vector, so the max below computes max(0, c)
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_3xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) ); // single int32 scale read from op_args2, broadcast to all lanes
+
+			__mmask16 relu_cmp_mask; // not referenced by name here; presumably written by RELU_SCALE_OP_S32_AVX512 - confirm
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_3xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh; // scratch registers handed to GELU_TANH_S32_AVX512
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_3xLT16:
+		{
+			__m512 x, r, y, x_erf; // scratch registers handed to GELU_ERF_S32_AVX512
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_3xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // int32 read from op_args2, broadcast
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // int32 read from op_args3, broadcast
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_3xLT16:
+		{
+			// Load the float scale factors as raw 32-bit lanes (bit pattern unchanged); only the n0_rem valid entries are read.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_3xLT16_DISABLE:
+		;
+
+		// Store the results: downscaled path when a downscale buffer is set and this is the last k iteration, otherwise masked int32 stores to c.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // despite the name, only the low n0_rem bits are set; presumably read inside CVT_STORE_S32_S8 - confirm
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // used as the store mask below
+
+			// Masked int32 stores: only the low n0_rem lanes of each row are written.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 2 ), load_mask, c_int32_2p0 );
+		}
+	}
+}
+
+// 2xlt16 int8o32 fringe kernel: 2 rows of C (uint8 a times int8 b, int32 accumulation), columns limited to the low n0_rem lanes by 16-bit masks.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2xlt16)
+{
+	static void* post_ops_labels[] = // label addresses (GNU labels-as-values); presumably indexed by the POST_OP_LABEL_* jump macros - confirm
+						{
+						  &&POST_OPS_2xLT16_DISABLE,
+						  &&POST_OPS_BIAS_2xLT16,
+						  &&POST_OPS_RELU_2xLT16,
+						  &&POST_OPS_RELU_SCALE_2xLT16,
+						  &&POST_OPS_GELU_TANH_2xLT16,
+						  &&POST_OPS_GELU_ERF_2xLT16,
+						  &&POST_OPS_CLIP_2xLT16,
+						  &&POST_OPS_DOWNSCALE_2xLT16
+						};
+	dim_t k_full_pieces = k0 / 4; // number of complete 4-deep k groups
+	dim_t k_partial_pieces = k0 % 4; // k elements left over after the complete groups
+
+	{
+		// Registers to use for accumulating C, one 16-lane int32 vector per row.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) ); // full 64-byte load of b for k group kr
+
+			// Broadcast a[0,kr:kr+4]: four consecutive bytes read as one 32-bit word and replicated to every lane.
+			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] += a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] += a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		}
+		// Handle k remainder: one extra step using only the k_partial_pieces leftover bytes of each a row.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set (byte mask)
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // unmasked 64-byte load of b for the remainder group
+
+			// Broadcast the k remainder of a row 0; bytes beyond k_partial_pieces are zeroed by the masked load.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast the k remainder of a row 1 (zero-padded as above).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Same 4-deep dot-product step; the zeroed a bytes add nothing to the sum.
+			// c[1,0-15] += a[1,k remainder]*b[k remainder,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		}
+
+		// Broadcast alpha and beta to all 16 int32 lanes; both registers are reused as scratch by the post-ops below.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha (low 32 bits of each product are kept).
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		}
+
+		// Scale C by beta; the whole step is skipped when beta == 0. The *_BETA_OP_* macros are defined elsewhere.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, 0, 1, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops: each labelled section below handles one post-op; the POST_OP_LABEL_* jump macros (defined elsewhere) move between sections.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // low n0_rem bits set
+			selector1 = _mm512_maskz_loadu_epi32 // int32 bias values for the valid columns, starting at post_op_c_j; other lanes zero
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_2xLT16:
+		{
+			selector1 = _mm512_setzero_epi32(); // zero vector, so the max below computes max(0, c)
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_2xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) ); // single int32 scale read from op_args2, broadcast to all lanes
+
+			__mmask16 relu_cmp_mask; // not referenced by name here; presumably written by RELU_SCALE_OP_S32_AVX512 - confirm
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_2xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh; // scratch registers handed to GELU_TANH_S32_AVX512
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_2xLT16:
+		{
+			__m512 x, r, y, x_erf; // scratch registers handed to GELU_ERF_S32_AVX512
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_2xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // int32 read from op_args2, broadcast
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // int32 read from op_args3, broadcast
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_2xLT16:
+		{
+			// Load the float scale factors as raw 32-bit lanes (bit pattern unchanged); only the n0_rem valid entries are read.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_2xLT16_DISABLE:
+		;
+
+		// Store the results: downscaled path when a downscale buffer is set and this is the last k iteration, otherwise masked int32 stores to c.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // despite the name, only the low n0_rem bits are set; presumably read inside CVT_STORE_S32_S8 - confirm
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // used as the store mask below
+
+			// Masked int32 stores: only the low n0_rem lanes of each row are written.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 1 ), load_mask, c_int32_1p0 );
+		}
+	}
+}
+
+// 1xlt16 int8o32 fringe kernel: one row of C over the n0_rem columns selected by a 16-lane mask; u8 A x s8 B accumulated in s32.
+LPGEMM_MN_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1xlt16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1xLT16_DISABLE,
+						  &&POST_OPS_BIAS_1xLT16,
+						  &&POST_OPS_RELU_1xLT16,
+						  &&POST_OPS_RELU_SCALE_1xLT16,
+						  &&POST_OPS_GELU_TANH_1xLT16,
+						  &&POST_OPS_GELU_ERF_1xLT16,
+						  &&POST_OPS_CLIP_1xLT16,
+						  &&POST_OPS_DOWNSCALE_1xLT16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	{
+		// Accumulator for row 0 of C: 16 int32 lanes, zero-initialized.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4]: 4 bytes read as one 32-bit word and replicated to all 16 lanes.
+			__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		}
+		// Handle k remainder: the k0 % 4 leftover values, with a zero-padded by a masked load.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Masked load of the k_partial_pieces leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul on the zero-padded group of 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		}
+
+		// Broadcast alpha into selector1 and beta into selector2.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha (skipped when alpha == 1).
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		}
+
+		// Scale C by beta (skipped when beta == 0). NOTE(review): the BETA_OP macros are defined elsewhere; presumably they load C, scale it by selector2 and add it to the accumulator - confirm.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15] - S8 variant, taken when buf_downscale is set and is_first_k is TRUE.
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, 0, 0, 0, \
+								selector1, selector2);
+			}
+		}
+
+		// Post Ops, starting from the head of post_ops_list. NOTE(review): the jump macros presumably dispatch through post_ops_labels - confirm against their definitions.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]: add the int32 values loaded (masked to the low n0_rem lanes) from op_args1 at offset post_op_c_j.
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_1xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15] = max( 0, c[0,0-15] )
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_1xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_1xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_1xLT16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_1xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_1xLT16:
+		{
+			// The float scale factors are loaded bitwise into an integer register (no value conversion); only lanes set in load_mask are read.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_1xLT16_DISABLE:
+		;
+
+		// Store the results: the int8 path is taken when buf_downscale is set and is_last_k is TRUE, otherwise int32 is written to c.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Masked int32 store: only the low n0_rem lanes are written.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * 0 ), load_mask, c_int32_0p0 );
+		}
+	}
+}
+
+// 5x16 int8o32 kernel: 5 rows x 16 columns of C; u8 A x s8 B accumulated in s32, followed by alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_5x16_DISABLE,
+						  &&POST_OPS_BIAS_5x16,
+						  &&POST_OPS_RELU_5x16,
+						  &&POST_OPS_RELU_SCALE_5x16,
+						  &&POST_OPS_GELU_TANH_5x16,
+						  &&POST_OPS_GELU_ERF_5x16,
+						  &&POST_OPS_CLIP_5x16,
+						  &&POST_OPS_DOWNSCALE_5x16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// Registers to use for accumulating C: one 16-lane int32 register per row, zero-initialized.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4]: 4 bytes read as one 32-bit word and replicated to all 16 lanes.
+		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the k0 % 4 leftover values, with each row of a zero-padded by a masked load.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked load of the k_partial_pieces leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded group of 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+	}
+
+	// Broadcast alpha into selector1 and beta into selector2.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). NOTE(review): the BETA_OP macros are defined elsewhere; presumably they load C, scale it by selector2 and add it to the accumulator - confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] - S8 variant, taken when buf_downscale is set and is_first_k is TRUE.
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S8_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S8_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+
+			// c[4:0-15]
+			S8_S32_BETA_OP(c_int32_4p0,0,4,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S32_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S32_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+
+			// c[4:0-15]
+			S32_S32_BETA_OP(c_int32_4p0,0,4,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops, starting from the head of post_ops_list. NOTE(review): the jump macros presumably dispatch through post_ops_labels - confirm against their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x16:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]: add the 16 int32 values loaded from op_args1 at offset post_op_c_j (the same vector is added to every row).
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] ); likewise for the rows below.
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x16:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]: selector1 holds 16 float scale factors loaded bitwise from scale_factor at offset post_op_c_j.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 holds in every lane (selector1/selector2 are overwritten as scratch here).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+	}
+	else
+	{
+		// Store the results: full 16-lane int32 rows written to c, one row every rs_c elements.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+	}
+}
+
+// 4x16 int8o32 kernel: 4 rows x 16 columns of C; u8 A x s8 B accumulated in s32, followed by alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x16_DISABLE,
+						  &&POST_OPS_BIAS_4x16,
+						  &&POST_OPS_RELU_4x16,
+						  &&POST_OPS_RELU_SCALE_4x16,
+						  &&POST_OPS_GELU_TANH_4x16,
+						  &&POST_OPS_GELU_ERF_4x16,
+						  &&POST_OPS_CLIP_4x16,
+						  &&POST_OPS_DOWNSCALE_4x16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// Registers to use for accumulating C: one 16-lane int32 register per row, zero-initialized.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4]: 4 bytes read as one 32-bit word and replicated to all 16 lanes.
+		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the k0 % 4 leftover values, with each row of a zero-padded by a masked load.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked load of the k_partial_pieces leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded group of 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+	}
+
+	// Broadcast alpha into selector1 and beta into selector2.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). NOTE(review): the BETA_OP macros are defined elsewhere; presumably they load C, scale it by selector2 and add it to the accumulator - confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] - S8 variant, taken when buf_downscale is set and is_first_k is TRUE.
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S8_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S8_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S32_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+
+			// c[3:0-15]
+			S32_S32_BETA_OP(c_int32_3p0,0,3,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops, starting from the head of post_ops_list. NOTE(review): the jump macros presumably dispatch through post_ops_labels - confirm against their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x16:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]: add the 16 int32 values loaded from op_args1 at offset post_op_c_j (the same vector is added to every row).
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] ); likewise for the rows below.
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x16:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]: selector1 holds 16 float scale factors loaded bitwise from scale_factor at offset post_op_c_j.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 holds in every lane (selector1/selector2 are overwritten as scratch here).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+	}
+	else
+	{
+		// Store the results: full 16-lane int32 rows written to c, one row every rs_c elements.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+	}
+}
+
+// 3x16 int8o32 kernel: 3 rows x 16 columns of C; u8 A x s8 B accumulated in s32, followed by alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x16)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x16_DISABLE,
+						  &&POST_OPS_BIAS_3x16,
+						  &&POST_OPS_RELU_3x16,
+						  &&POST_OPS_RELU_SCALE_3x16,
+						  &&POST_OPS_GELU_TANH_3x16,
+						  &&POST_OPS_GELU_ERF_3x16,
+						  &&POST_OPS_CLIP_3x16,
+						  &&POST_OPS_DOWNSCALE_3x16
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// Registers to use for accumulating C: one 16-lane int32 register per row, zero-initialized.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4]: 4 bytes read as one 32-bit word and replicated to all 16 lanes.
+		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the k0 % 4 leftover values, with each row of a zero-padded by a masked load.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Masked load of the k_partial_pieces leftover bytes of row 0 of a (other bytes zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded group of 4.
+		// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+	}
+
+	// Broadcast alpha into selector1 and beta into selector2.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+	}
+
+	// Scale C by beta (skipped when beta == 0). NOTE(review): the BETA_OP macros are defined elsewhere; presumably they load C, scale it by selector2 and add it to the accumulator - confirm.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15] - S8 variant, taken when buf_downscale is set and is_first_k is TRUE.
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S8_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+
+			// c[2:0-15]
+			S32_S32_BETA_OP(c_int32_2p0,0,2,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops, starting from the head of post_ops_list. NOTE(review): the jump macros presumably dispatch through post_ops_labels - confirm against their definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x16:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]: add the 16 int32 values loaded from op_args1 at offset post_op_c_j (the same vector is added to every row).
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] ); likewise for the rows below.
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x16:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]: selector1 holds 16 float scale factors loaded bitwise from scale_factor at offset post_op_c_j.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 holds in every lane (selector1/selector2 are overwritten as scratch here).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+	}
+	else
+	{
+		// Store the results: full 16-lane int32 rows written to c, one row every rs_c elements.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+	}
+}
+
+// 2x16 int8o32 kernel: builds a 2-row x 16-column int32 tile of C from uint8 A and int8 B, applies alpha/beta and the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x16)
+{
+	static void* post_ops_labels[] = // label addresses; presumably indexed by the POST_OP_LABEL_* jump macros (defined elsewhere) - confirm
+						{
+						  &&POST_OPS_2x16_DISABLE,
+						  &&POST_OPS_BIAS_2x16,
+						  &&POST_OPS_RELU_2x16,
+						  &&POST_OPS_RELU_SCALE_2x16,
+						  &&POST_OPS_GELU_TANH_2x16,
+						  &&POST_OPS_GELU_ERF_2x16,
+						  &&POST_OPS_CLIP_2x16,
+						  &&POST_OPS_DOWNSCALE_2x16
+						};
+	dim_t k_full_pieces = k0 / 4; // number of complete groups of 4 k-elements
+	dim_t k_partial_pieces = k0 % 4; // leftover k-elements (0-3) handled after the main loop
+
+	// Registers to use for accumulating C (one 16 x int32 vector per row).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] += a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Broadcast a[1,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-15] += a[1,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the trailing k0 % 4 bytes of each a row that do not fill a group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Mask-load the leftover bytes of row 0 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+		// Mask-load the leftover bytes of row 1 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[1,0-15] += a[1,k remainder]*b[k remainder,0-15]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+	}
+
+	// Broadcast alpha and beta across all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+	}
+
+	// Scale C by beta (whole step skipped when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) ) // downscale buffer set and first k pass: S8 variant of the beta macro
+		{
+			// c[0:0-15]
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S8_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+
+			// c[1:0-15]
+			S32_S32_BETA_OP(c_int32_1p0,0,1,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled section below ends with a jump macro (defined elsewhere) that selects the next section.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x16:
+	{
+		selector1 = // 16 int32 values from op_args1, starting at column offset post_op_c_j
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15])
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[1,0-15] = max(0, c[1,0-15])
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // int32 scalar read from op_args2, broadcast to every lane
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x16:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x16:
+	{
+		selector1 = // float scale factors loaded bit-for-bit into an integer-typed register
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+	}
+}
+
+// 1x16 int8o32 kernel: builds a 1-row x 16-column int32 tile of C from uint8 A and int8 B, applies alpha/beta and the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x16)
+{
+	static void* post_ops_labels[] = // label addresses; presumably indexed by the POST_OP_LABEL_* jump macros (defined elsewhere) - confirm
+						{
+						  &&POST_OPS_1x16_DISABLE,
+						  &&POST_OPS_BIAS_1x16,
+						  &&POST_OPS_RELU_1x16,
+						  &&POST_OPS_RELU_SCALE_1x16,
+						  &&POST_OPS_GELU_TANH_1x16,
+						  &&POST_OPS_GELU_ERF_1x16,
+						  &&POST_OPS_CLIP_1x16,
+						  &&POST_OPS_DOWNSCALE_1x16
+						};
+	dim_t k_full_pieces = k0 / 4; // number of complete groups of 4 k-elements
+	dim_t k_partial_pieces = k0 % 4; // leftover k-elements (0-3) handled after the main loop
+
+	// Register to use for accumulating C (a single 16 x int32 vector).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+		// Broadcast a[0,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		__m512i a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-15] += a[0,kr:kr+4]*b[kr:kr+4,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+	}
+	// Handle k remainder: the trailing k0 % 4 bytes of the a row that do not fill a group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set
+
+		__m512i b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+		// Mask-load the leftover bytes of row 0 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		__m512i a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[0,0-15] += a[0,k remainder]*b[k remainder,0-15]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+	}
+
+	// Broadcast alpha and beta across all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+	}
+
+	// Scale C by beta (whole step skipped when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) ) // downscale buffer set and first k pass: S8 variant of the beta macro
+		{
+			// c[0:0-15]
+			S8_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15]
+			S32_S32_BETA_OP(c_int32_0p0,0,0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled section below ends with a jump macro (defined elsewhere) that selects the next section.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x16:
+	{
+		selector1 = // 16 int32 values from op_args1, starting at column offset post_op_c_j
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max(0, c[0,0-15])
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x16:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // int32 scalar read from op_args2, broadcast to every lane
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x16:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x16:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x16:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x16:
+	{
+		selector1 = // float scale factors loaded bit-for-bit into an integer-typed register
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x16_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+	}
+}
+
+// 5x32 int8o32 kernel: builds a 5-row x 32-column int32 tile of C from uint8 A and int8 B, applies alpha/beta and the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x32)
+{
+	static void* post_ops_labels[] = // label addresses; presumably indexed by the POST_OP_LABEL_* jump macros (defined elsewhere) - confirm
+						{
+						  &&POST_OPS_5x32_DISABLE,
+						  &&POST_OPS_BIAS_5x32,
+						  &&POST_OPS_RELU_5x32,
+						  &&POST_OPS_RELU_SCALE_5x32,
+						  &&POST_OPS_GELU_TANH_5x32,
+						  &&POST_OPS_GELU_ERF_5x32,
+						  &&POST_OPS_CLIP_5x32,
+						  &&POST_OPS_DOWNSCALE_5x32
+						};
+	dim_t k_full_pieces = k0 / 4; // number of complete groups of 4 k-elements
+	dim_t k_partial_pieces = k0 % 4; // leftover k-elements (0-3) handled after the main loop
+
+	// B matrix storage (b0: columns 0-15, b1: columns 16-31).
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage (one broadcast 32-bit word of a row, reused for every row).
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C (two 16 x int32 vectors per row: pN0 = columns 0-15, pN1 = columns 16-31).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+	__m512i c_int32_4p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] += a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast a[1,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] += a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast a[2,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] += a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Broadcast a[3,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-31] += a[3,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+
+		// Broadcast a[4,kr:kr+4] (4 uint8 values read as one 32-bit word).
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-31] += a[4,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+	}
+	// Handle k remainder: the trailing k0 % 4 bytes of each a row that do not fill a group of 4.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Mask-load the leftover bytes of row 0 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[0,0-31] += a[0,k remainder]*b[k remainder,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Mask-load the leftover bytes of row 1 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[1,0-31] += a[1,k remainder]*b[k remainder,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Mask-load the leftover bytes of row 2 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[2,0-31] += a[2,k remainder]*b[k remainder,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Mask-load the leftover bytes of row 3 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[3,0-31] += a[3,k remainder]*b[k remainder,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+
+		// Mask-load the leftover bytes of row 4 of a (unselected bytes are zeroed), then broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[4,0-31] += a[4,k remainder]*b[k remainder,0-31]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+	}
+
+	// Broadcast alpha and beta across all 16 int32 lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+	}
+
+	// Scale C by beta (whole step skipped when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) ) // downscale buffer set and first k pass: S8 variant of the beta macro
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S8_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S8_S32_BETA_OP2(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31]
+			S8_S32_BETA_OP2(0,4,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S32_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S32_S32_BETA_OP2(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31]
+			S32_S32_BETA_OP2(0,4,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled section below ends with a jump macro (defined elsewhere) that selects the next section.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x32:
+	{
+		selector1 = // 16 int32 values from op_args1 for columns 0-15, offset by post_op_c_j
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 = // 16 int32 values from op_args1 for columns 16-31, offset by post_op_c_j
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		// c[4, 16-31]
+		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x32:
+	{
+		selector1 = _mm512_setzero_epi32(); // zero vector: each max below clamps negative lanes to 0
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		// c[4,16-31]
+		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // int32 scalar read from op_args2, broadcast to every lane
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x32:
+	{
+		selector1 = // float scale factors for columns 0-15, loaded bit-for-bit into an integer-typed register
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 = // float scale factors for columns 16-31, loaded bit-for-bit into an integer-typed register
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+		// c[4,16-31]
+		CVT_STORE_S32_S8(c_int32_4p1,4,1);
+	}
+	else
+	{
+		// Store the results as int32 directly into c.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
+	}
+}
+
+// 4x32 int8o32 kernel
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x32_DISABLE,
+						  &&POST_OPS_BIAS_4x32,
+						  &&POST_OPS_RELU_4x32,
+						  &&POST_OPS_RELU_SCALE_4x32,
+						  &&POST_OPS_GELU_TANH_4x32,
+						  &&POST_OPS_GELU_ERF_4x32,
+						  &&POST_OPS_CLIP_4x32,
+						  &&POST_OPS_DOWNSCALE_4x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S8_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S8_S32_BETA_OP2(0,3,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S32_S32_BETA_OP2(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31]
+			S32_S32_BETA_OP2(0,3,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+	}
+}
+
+// 3x32 u8s8s32o32 fringe kernel: builds a 3-row x 32-column int32 tile of C with VNNI dot products, then applies alpha/beta and the post-op chain before storing.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_3x32_DISABLE,
+						  &&POST_OPS_BIAS_3x32,
+						  &&POST_OPS_RELU_3x32,
+						  &&POST_OPS_RELU_SCALE_3x32,
+						  &&POST_OPS_GELU_TANH_3x32,
+						  &&POST_OPS_GELU_ERF_3x32,
+						  &&POST_OPS_CLIP_3x32,
+						  &&POST_OPS_DOWNSCALE_3x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: b0/b1 each hold one 64-byte load, feeding columns 0-15 and 16-31 of C.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: 4 bytes of one A row broadcast to every 32-bit lane.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C, named c_int32_<row>p<half> (16 int32 lanes each).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast the 4 bytes of A row 0 belonging to k group kr.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4 (u8 x s8 products summed into int32 lanes).
+		// c[0,0-31] += a[0, k group kr]*b[k group kr, 0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast the 4 bytes of A row 1 belonging to k group kr.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] += a[1, k group kr]*b[k group kr, 0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast the 4 bytes of A row 2 belonging to k group kr.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-31] += a[2, k group kr]*b[k group kr, 0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+	}
+	// Handle k remainder (k0 % 4 leftover bytes per A row) using a masked byte load.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast the k_partial_pieces leftover bytes of A row 0 (masked-off bytes load as zero).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Same 4-wide dot product as the main loop; the zeroed A bytes contribute nothing.
+		// c[0,0-31] += a[0, k tail]*b[k tail, 0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast the k_partial_pieces leftover bytes of A row 1 (masked-off bytes load as zero).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Same 4-wide dot product as the main loop; the zeroed A bytes contribute nothing.
+		// c[1,0-31] += a[1, k tail]*b[k tail, 0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+
+		// Broadcast the k_partial_pieces leftover bytes of A row 2 (masked-off bytes load as zero).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Same 4-wide dot product as the main loop; the zeroed A bytes contribute nothing.
+		// c[2,0-31] += a[2, k tail]*b[k tail, 0-31]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+	}
+
+	// Broadcast alpha and beta; selector1/selector2 are reused further down as scratch registers.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 32 bits of each product are kept).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+	}
+
+	// Scale C by beta; the S8 macro is chosen when a downscale buffer is set on the first k pass (macros defined elsewhere - presumably they read the prior C values; confirm).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S8_S32_BETA_OP2(0,2,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31]
+			S32_S32_BETA_OP2(0,2,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels to the labelled sections below - confirm against the macro definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15] += int32 values read from op_args1 at column offset post_op_c_j
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] )
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]; selector1 (zero), selector2 (scale from op_args2) and relu_cmp_mask are presumably used inside the macro - confirm.
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]; the float/int temporaries above are scratch passed to the macro (defined elsewhere).
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]; the float temporaries above are scratch passed to the macro (defined elsewhere).
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]; bounds come from op_args2 (min) and op_args3 (max), each read as a single int32.
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]; scale factors are loaded as raw bits from a float array - the macro (defined elsewhere) presumably reinterprets them as floats; confirm.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); mask_all1 is presumably consumed inside CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+	}
+	else
+	{
+		// Store the int32 results straight to c, one unaligned 512-bit store per 16 columns.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+	}
+}
+
+// 2x32 u8s8s32o32 fringe kernel: builds a 2-row x 32-column int32 tile of C with VNNI dot products, then applies alpha/beta and the post-op chain before storing.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x32_DISABLE,
+						  &&POST_OPS_BIAS_2x32,
+						  &&POST_OPS_RELU_2x32,
+						  &&POST_OPS_RELU_SCALE_2x32,
+						  &&POST_OPS_GELU_TANH_2x32,
+						  &&POST_OPS_GELU_ERF_2x32,
+						  &&POST_OPS_CLIP_2x32,
+						  &&POST_OPS_DOWNSCALE_2x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: b0/b1 each hold one 64-byte load, feeding columns 0-15 and 16-31 of C.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: 4 bytes of one A row broadcast to every 32-bit lane.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C, named c_int32_<row>p<half> (16 int32 lanes each).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast the 4 bytes of A row 0 belonging to k group kr.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4 (u8 x s8 products summed into int32 lanes).
+		// c[0,0-31] += a[0, k group kr]*b[k group kr, 0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast the 4 bytes of A row 1 belonging to k group kr.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-31] += a[1, k group kr]*b[k group kr, 0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+	}
+	// Handle k remainder (k0 % 4 leftover bytes per A row) using a masked byte load.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast the k_partial_pieces leftover bytes of A row 0 (masked-off bytes load as zero).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Same 4-wide dot product as the main loop; the zeroed A bytes contribute nothing.
+		// c[0,0-31] += a[0, k tail]*b[k tail, 0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+		// Broadcast the k_partial_pieces leftover bytes of A row 1 (masked-off bytes load as zero).
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Same 4-wide dot product as the main loop; the zeroed A bytes contribute nothing.
+		// c[1,0-31] += a[1, k tail]*b[k tail, 0-31]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+	}
+
+	// Broadcast alpha and beta; selector1/selector2 are reused further down as scratch registers.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 32 bits of each product are kept).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+	}
+
+	// Scale C by beta; the S8 macro is chosen when a downscale buffer is set on the first k pass (macros defined elsewhere - presumably they read the prior C values; confirm).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S8_S32_BETA_OP2(0,1,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31]
+			S32_S32_BETA_OP2(0,1,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels to the labelled sections below - confirm against the macro definitions.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15] += int32 values read from op_args1 at column offset post_op_c_j
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15] = max( 0, c[0,0-15] )
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]; selector1 (zero), selector2 (scale from op_args2) and relu_cmp_mask are presumably used inside the macro - confirm.
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]; the float/int temporaries above are scratch passed to the macro (defined elsewhere).
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]; the float temporaries above are scratch passed to the macro (defined elsewhere).
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]; bounds come from op_args2 (min) and op_args3 (max), each read as a single int32.
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]; scale factors are loaded as raw bits from a float array - the macro (defined elsewhere) presumably reinterprets them as floats; confirm.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); mask_all1 is presumably consumed inside CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+	}
+	else
+	{
+		// Store the int32 results straight to c, one unaligned 512-bit store per 16 columns.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+	}
+}
+
+// 1x32 int8o32 kernel
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_1x32_DISABLE,
+						  &&POST_OPS_BIAS_1x32,
+						  &&POST_OPS_RELU_1x32,
+						  &&POST_OPS_RELU_SCALE_1x32,
+						  &&POST_OPS_GELU_TANH_1x32,
+						  &&POST_OPS_GELU_ERF_1x32,
+						  &&POST_OPS_CLIP_1x32,
+						  &&POST_OPS_DOWNSCALE_1x32
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31]
+			S8_S32_BETA_OP2(0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31]
+			S32_S32_BETA_OP2(0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x32:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x32:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x32:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x32:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x32:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_1x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x32_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+	}
+}
+
+// 5x48 int8o32 fringe kernel: u8 A x s8 B accumulated into a 5-row x 48-column s32 tile of C (3 zmm per row), then alpha/beta scaling, post-ops and store.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_5x48)
+{
+	static void* post_ops_labels[] = // Label-address jump table (GNU computed goto); entry order presumably matches the post-op type codes - confirm.
+						{
+						  &&POST_OPS_5x48_DISABLE,
+						  &&POST_OPS_BIAS_5x48,
+						  &&POST_OPS_RELU_5x48,
+						  &&POST_OPS_RELU_SCALE_5x48,
+						  &&POST_OPS_GELU_TANH_5x48,
+						  &&POST_OPS_GELU_ERF_5x48,
+						  &&POST_OPS_CLIP_5x48,
+						  &&POST_OPS_DOWNSCALE_5x48
+						};
+	dim_t k_full_pieces = k0 / 4; // Complete groups of 4 k-elements (one dpbusd step each).
+	dim_t k_partial_pieces = k0 % 4; // Leftover k-elements (0-3), handled after the main loop.
+
+	// B matrix storage: one zmm per 16-column slice of the 48 columns.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage: a single register, re-broadcast for each of the 5 rows.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C (c_int32_<row>p<16-column slice>), zero-initialised.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_4p0 = _mm512_setzero_epi32();
+	__m512i c_int32_4p1 = _mm512_setzero_epi32();
+	__m512i c_int32_4p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4]: 4 bytes of A read as one dword and copied to all 16 lanes.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4 (dpbusd: 4 u8*s8 products summed into each s32 lane).
+		// c[0,0-47] += a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] += a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] += a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-47] += a[3,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+		// Broadcast a[4,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[4,0-47] += a[4,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+	}
+	// Handle k remainder (k0 % 4 != 0). NOTE(review): B is still loaded full-width below, so the B panel is presumably padded in k - confirm against the packing routine.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one bit per valid leftover byte of A.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked-load the k_partial_pieces leftover bytes of A row 0 (other bytes zeroed), then broadcast the low dword.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k group (zeroed A bytes add nothing).
+		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Masked-load and broadcast the leftover bytes of A row 1.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k group.
+		// c[1,0-47] += a[1,k remainder]*b[k remainder,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Masked-load and broadcast the leftover bytes of A row 2.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k group.
+		// c[2,0-47] += a[2,k remainder]*b[k remainder,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Masked-load and broadcast the leftover bytes of A row 3.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k group.
+		// c[3,0-47] += a[3,k remainder]*b[k remainder,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+		// Masked-load and broadcast the leftover bytes of A row 4.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k group.
+		// c[4,0-47] += a[4,k remainder]*b[k remainder,0-47]
+		c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+		c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+		c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+	}
+
+	// Broadcast alpha and beta to all 16 dword lanes; selector1/selector2 are reused as scratch further down.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (low 32 bits of each lane product are kept).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+
+		c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+		c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+		c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+	}
+
+	// Scale C by beta (skipped when beta == 0); the *_BETA_OP3 macros are defined outside this function.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]; S8 variant presumably reads the existing C values as int8 from buf_downscale - confirm against the macro.
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,4,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]; second macro argument is the row index, first is always 0 in this kernel.
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,3,selector1,selector2);
+
+			// c[4:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,4,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels and walk the list via post_ops_list_temp - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_5x48:
+	{
+		selector1 = // Bias for columns 0-15, offset by post_op_c_j; the same vector is added to every row.
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 = // A broadcast register reused as scratch for columns 32-47.
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+		// c[4, 16-31]
+		c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_5x48:
+	{
+		selector1 = _mm512_setzero_epi32(); // Zero vector: each accumulator becomes max(0, x) per lane.
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		// c[4,0-15]
+		c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+		// c[4,16-31]
+		c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+		// c[4,32-47]
+		c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_5x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 = // Single int32 read from op_args2, broadcast to all lanes.
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // Not referenced directly below; presumably scratch for RELU_SCALE_OP_S32_AVX512 - confirm.
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		// c[4, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+		// c[4, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+		// c[4, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_5x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // Float scratch registers handed to the GELU_TANH macro.
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[4, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_5x48:
+	{
+		__m512 x, r, y, x_erf; // Float scratch registers handed to the GELU_ERF macro.
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		// c[4, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+		// c[4, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+		// c[4, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_5x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // Lower bound, broadcast from op_args2.
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // Upper bound, broadcast from op_args3.
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		// c[4, 0-15]
+		CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+		// c[4, 16-31]
+		CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+		// c[4, 32-47]
+		CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_5x48:
+	{
+		selector1 = // Float scale factors for columns 0-15 (offset by post_op_c_j), carried bit-for-bit in an integer register.
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 = // A broadcast register reused as scratch for columns 32-47.
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[4, 32-47]
+		CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_5x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 is true in every one of the 16 lanes).
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32); mask_all1 is presumably consumed inside CVT_STORE_S32_S8 - confirm.
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+		// c[4,0-15]
+		CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+		// c[4,16-31]
+		CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+		// c[4,32-47]
+		CVT_STORE_S32_S8(c_int32_4p2,4,2);
+	}
+	else
+	{
+		// Store the results as int32: row i at c + rs_c * i, 16 elements per register.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+
+		// c[4,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 0*16 ), c_int32_4p0 );
+
+		// c[4,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 1*16 ), c_int32_4p1 );
+
+		// c[4,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 4 ) + ( 2*16 ), c_int32_4p2 );
+	}
+}
+
+// 4x48 int8o32 kernel
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_4x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_4x48_DISABLE,
+						  &&POST_OPS_BIAS_4x48,
+						  &&POST_OPS_RELU_4x48,
+						  &&POST_OPS_RELU_SCALE_4x48,
+						  &&POST_OPS_GELU_TANH_4x48,
+						  &&POST_OPS_GELU_ERF_4x48,
+						  &&POST_OPS_CLIP_4x48,
+						  &&POST_OPS_DOWNSCALE_4x48
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_3p0 = _mm512_setzero_epi32();
+	__m512i c_int32_3p1 = _mm512_setzero_epi32();
+	__m512i c_int32_3p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+		// Broadcast a[3,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+		c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+		c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+
+		c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+		c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+		c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,3,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,2,selector1,selector2);
+
+			// c[3:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,3,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_4x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+		// c[3, 16-31]
+		c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_4x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		// c[3,0-15]
+		c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+		// c[3,16-31]
+		c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+		// c[3,32-47]
+		c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_4x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		// c[3, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+		// c[3, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+		// c[3, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_4x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[3, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_4x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		// c[3, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+		// c[3, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+		// c[3, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_4x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		// c[3, 0-15]
+		CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+		// c[3, 16-31]
+		CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+		// c[3, 32-47]
+		CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_4x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_4x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+		// c[3,0-15]
+		CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+		// c[3,16-31]
+		CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+		// c[3,32-47]
+		CVT_STORE_S32_S8(c_int32_3p2,3,2);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+
+		// c[3,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 0*16 ), c_int32_3p0 );
+
+		// c[3,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 1*16 ), c_int32_3p1 );
+
+		// c[3,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 3 ) + ( 2*16 ), c_int32_3p2 );
+	}
+}
+
+// 3x48 u8s8s32o32 fringe kernel: accumulates a 3-row x 48-column int32 tile of C (three 16-lane vectors per row) from uint8 A and int8 B, applies alpha/beta and the post-op chain, then stores the tile.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_3x48)
+{
+	static void* post_ops_labels[] = // Label addresses (GCC labels-as-values); presumably indexed by the POST_OP_LABEL_* jump macros - confirm.
+						{
+						  &&POST_OPS_3x48_DISABLE,
+						  &&POST_OPS_BIAS_3x48,
+						  &&POST_OPS_RELU_3x48,
+						  &&POST_OPS_RELU_SCALE_3x48,
+						  &&POST_OPS_GELU_TANH_3x48,
+						  &&POST_OPS_GELU_ERF_3x48,
+						  &&POST_OPS_CLIP_3x48,
+						  &&POST_OPS_DOWNSCALE_3x48
+						};
+	dim_t k_full_pieces = k0 / 4; // Number of complete 4-element k groups (one dpbusd step each).
+	dim_t k_partial_pieces = k0 % 4; // Leftover k elements (0-3), handled after the main loop.
+
+	// B matrix storage: three 64-byte vectors loaded at cs_b offsets 0, 1 and 2 for the current k group.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage: 4 bytes of one A row broadcast to all 16 lanes (also reused as scratch in the bias/downscale post-ops).
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C: c_int32_<row>p<column block>, zero-initialised.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_2p0 = _mm512_setzero_epi32();
+	__m512i c_int32_2p1 = _mm512_setzero_epi32();
+	__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Broadcast a[2,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+	}
+	// Handle k remainder (the k0 % 4 leftover elements).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one mask bit per valid A byte.
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // NOTE(review): full 64-byte loads here; assumes B is padded to a multiple of 4 in k - confirm.
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked-load the remaining bytes of A row 0 (masked-off bytes are zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Masked-load the remaining bytes of A row 1 (masked-off bytes are zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[1,0-47] += a[1,k remainder]*b[k remainder,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+		// Masked-load the remaining bytes of A row 2 (masked-off bytes are zeroed) and broadcast the low 32 bits.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the zero-padded k remainder.
+		// c[2,0-47] += a[2,k remainder]*b[k remainder,0-47]
+		c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+		c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+		c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+	}
+
+	// Load alpha and beta (each broadcast to all 16 lanes).
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (the low 32 bits of each product are kept).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+		c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+		c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+		c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+	}
+
+	// Scale C by beta; skipped entirely when beta is 0.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47] - S8 variant; presumably reads the existing C values as int8 from buf_downscale - confirm.
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,2,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47] - S32 variant; presumably reads the existing C values as int32 from c - confirm.
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+
+			// c[2:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,2,selector1,selector2);
+		}
+	}
+
+	// Post Ops: the jump macros (defined elsewhere) presumably dispatch through post_ops_labels to each labelled stage in list order - confirm.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_3x48: // Adds int32 values loaded from op_args1 (offset by post_op_c_j) to every row.
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 = // Reused here as a third scratch register.
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+		// c[2, 16-31]
+		c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_3x48: // Replaces every accumulator lane with max(0, value).
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		// c[2,0-15]
+		c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+		// c[2,16-31]
+		c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+		// c[2,32-47]
+		c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_3x48: // RELU_SCALE_OP_S32_AVX512 is defined elsewhere; presumably uses selector1 (zero), selector2 (scale from op_args2) and relu_cmp_mask by name - confirm.
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		// c[2, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+		// c[2, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+		// c[2, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_3x48: // Applies GELU_TANH_S32_AVX512 (defined elsewhere) to each accumulator; the float/int scratch registers are passed in explicitly.
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[2, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_3x48: // Applies GELU_ERF_S32_AVX512 (defined elsewhere) to each accumulator; the float scratch registers are passed in explicitly.
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		// c[2, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+		// c[2, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+		// c[2, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_3x48: // Bounds are read from op_args2 (min) and op_args3 (max); CLIP_S32_AVX512 (defined elsewhere) presumably clamps each accumulator to them - confirm.
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		// c[2, 0-15]
+		CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+		// c[2, 16-31]
+		CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+		// c[2, 32-47]
+		CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_3x48: // Scale factors are float data loaded bit-for-bit into integer registers; CVT_MULRND_CVT32 (defined elsewhere) presumably reinterprets them as floats - confirm.
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 = // Reused here as a third scratch register.
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_3x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's: 0 < 10 holds in every lane. mask_all1 is presumably consumed by CVT_STORE_S32_S8 - confirm.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+		// c[2,0-15]
+		CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+		// c[2,16-31]
+		CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+		// c[2,32-47]
+		CVT_STORE_S32_S8(c_int32_2p2,2,2);
+	}
+	else
+	{
+		// Store the results as int32: row r at c + rs_c * r, column block j at element offset j * 16.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+
+		// c[2,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 0*16 ), c_int32_2p0 );
+
+		// c[2,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 1*16 ), c_int32_2p1 );
+
+		// c[2,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 2 ) + ( 2*16 ), c_int32_2p2 );
+	}
+}
+
+// 2x48 int8o32 kernel
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_2x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_2x48_DISABLE,
+						  &&POST_OPS_BIAS_2x48,
+						  &&POST_OPS_RELU_2x48,
+						  &&POST_OPS_RELU_SCALE_2x48,
+						  &&POST_OPS_GELU_TANH_2x48,
+						  &&POST_OPS_GELU_ERF_2x48,
+						  &&POST_OPS_CLIP_2x48,
+						  &&POST_OPS_DOWNSCALE_2x48
+						};
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C.
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	__m512i c_int32_1p0 = _mm512_setzero_epi32();
+	__m512i c_int32_1p1 = _mm512_setzero_epi32();
+	__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+	}
+	// Handle k remainder.
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+		// Broadcast a[1,kr:kr+4].
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul with k = 4.
+		// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+		c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+		c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+	}
+
+	// Load alpha and beta
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+		c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+		c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+		c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+	}
+
+	// Scale C by beta.
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S8_S32_BETA_OP3(0,1,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+
+			// c[1:0-15,16-31,32-47]
+			S32_S32_BETA_OP3(0,1,selector1,selector2);
+		}
+	}
+
+	// Post Ops
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_2x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+		// c[1, 16-31]
+		c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_2x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		// c[1,0-15]
+		c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+		// c[1,16-31]
+		c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+		// c[1,32-47]
+		c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_2x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask;
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		// c[1, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+		// c[1, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+		// c[1, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_2x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh;
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[1, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_2x48:
+	{
+		__m512 x, r, y, x_erf;
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		// c[1, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+		// c[1, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+		// c[1, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_2x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		// c[1, 0-15]
+		CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+		// c[1, 16-31]
+		CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+		// c[1, 32-47]
+		CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+
+POST_OPS_DOWNSCALE_2x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_2x48_DISABLE:
+	;
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+		// c[1,0-15]
+		CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+		// c[1,16-31]
+		CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+		// c[1,32-47]
+		CVT_STORE_S32_S8(c_int32_1p2,1,2);
+	}
+	else
+	{
+		// Store the results.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+
+		// c[1,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 0*16 ), c_int32_1p0 );
+
+		// c[1,16-31]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 1*16 ), c_int32_1p1 );
+
+		// c[1,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 1 ) + ( 2*16 ), c_int32_1p2 );
+	}
+}
+
+// 1x48 int8o32 kernel: one row of C, 48 columns (three 16-lane int32 accumulators), unsigned 8-bit A times signed 8-bit B.
+LPGEMM_MN_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_1x48)
+{
+	static void* post_ops_labels[] = // label addresses; presumably indexed by the POST_OP_LABEL_* jump macros (defined elsewhere)
+						{
+						  &&POST_OPS_1x48_DISABLE,
+						  &&POST_OPS_BIAS_1x48,
+						  &&POST_OPS_RELU_1x48,
+						  &&POST_OPS_RELU_SCALE_1x48,
+						  &&POST_OPS_GELU_TANH_1x48,
+						  &&POST_OPS_GELU_ERF_1x48,
+						  &&POST_OPS_CLIP_1x48,
+						  &&POST_OPS_DOWNSCALE_1x48
+						};
+	dim_t k_full_pieces = k0 / 4; // complete groups of 4 elements along k
+	dim_t k_partial_pieces = k0 % 4; // leftover k elements after the full groups
+
+	// B matrix storage: one ZMM register per 16-column panel (3 panels = 48 columns).
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage: 4 consecutive bytes of A broadcast to every 32-bit lane.
+	__m512i a_int32_0;
+
+	// Registers to use for accumulating C (row 0, columns 0-15, 16-31, 32-47).
+	__m512i c_int32_0p0 = _mm512_setzero_epi32();
+	__m512i c_int32_0p1 = _mm512_setzero_epi32();
+	__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+	for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+	{
+		b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+		// Broadcast a[0,kr:kr+4]: the 4 bytes are read as one 32-bit word and copied to all lanes.
+		a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+		// Perform column direction mat-mul with k = 4 (dpbusd: unsigned bytes of A times signed bytes of B).
+		// c[0,0-47] += a[0,kr:kr+4]*b[kr:kr+4,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+	}
+	// Handle k remainder (k0 not a multiple of 4).
+	if ( k_partial_pieces > 0 )
+	{
+		__m128i a_kfringe_buf;
+		__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set
+
+		b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+		b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+		b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+		// Masked byte load of the remaining k elements of row 0 of A (unselected bytes are zeroed), then broadcast.
+		a_kfringe_buf = _mm_maskz_loadu_epi8
+		(
+		  load_mask,
+		  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+		);
+		a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+		// Perform column direction mat-mul on the k remainder; the zeroed A bytes contribute nothing.
+		// c[0,0-47] += a[0,k remainder]*b[k remainder,0-47]
+		c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+		c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+		c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+	}
+
+	// Broadcast alpha and beta to all 32-bit lanes.
+	__m512i selector1 = _mm512_set1_epi32( alpha );
+	__m512i selector2 = _mm512_set1_epi32( beta );
+
+	if ( alpha != 1 )
+	{
+		// Scale by alpha (skipped when alpha == 1).
+		c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+		c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+		c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+	}
+
+	// Scale C by beta (skipped when beta == 0).
+	if ( beta != 0 )
+	{
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_first_k == TRUE ) )
+		{
+			// c[0:0-15,16-31,32-47]; S8 variant is used when buf_downscale is set and is_first_k is TRUE.
+			S8_S32_BETA_OP3(0,0,selector1,selector2);
+		}
+		else
+		{
+			// c[0:0-15,16-31,32-47]; S32 variant otherwise (both macros are defined elsewhere).
+			S32_S32_BETA_OP3(0,0,selector1,selector2);
+		}
+	}
+
+	// Post Ops: each labelled block below updates the accumulators, then the trailing macro (defined elsewhere) presumably jumps to the next op.
+	lpgemm_post_op* post_ops_list_temp = post_ops_list;
+	POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_1x48:
+	{
+		selector1 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+				_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+								post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0,0-15]: add the 16 int32 bias values loaded from op_args1 at offset post_op_c_j.
+		c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+		// c[0,32-47]; a_int32_0 is reused here as a scratch register for the third bias vector.
+		c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_1x48:
+	{
+		selector1 = _mm512_setzero_epi32(); // zero vector; max() against it replaces negative lanes with 0
+
+		// c[0,0-15]
+		c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+		// c[0, 16-31]
+		c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+		// c[0,32-47]
+		c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_RELU_SCALE_1x48:
+	{
+		selector1 = _mm512_setzero_epi32();
+		selector2 =
+			_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+		__mmask16 relu_cmp_mask; // not referenced directly here; presumably used inside RELU_SCALE_OP_S32_AVX512 - TODO confirm
+
+		// c[0, 0-15]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+		// c[0, 16-31]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+		// c[0, 32-47]
+		RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_TANH_1x48:
+	{
+		__m512 dn, z, x, r2, r, y, x_tanh; // scratch registers handed to the GELU_TANH macro
+		__m512i q;
+
+		// c[0, 0-15]
+		GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 16-31]
+		GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+		// c[0, 32-47]
+		GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_GELU_ERF_1x48:
+	{
+		__m512 x, r, y, x_erf; // scratch registers handed to the GELU_ERF macro
+
+		// c[0, 0-15]
+		GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+		// c[0, 16-31]
+		GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+		// c[0, 32-47]
+		GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_CLIP_1x48:
+	{
+		__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 ); // broadcast of the int32 read from op_args2
+		__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 ); // broadcast of the int32 read from op_args3
+
+		// c[0, 0-15]
+		CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+		// c[0, 16-31]
+		CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+		// c[0, 32-47]
+		CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+	// Downscale: loads 48 scale factors (addressed as float, bits kept in integer registers) starting at column post_op_c_j.
+POST_OPS_DOWNSCALE_1x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]; CVT_MULRND_CVT32 is defined elsewhere.
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_1x48_DISABLE:
+	; // empty statement; presumably the jump target once no post-ops remain (entry 0 of post_ops_labels)
+
+	if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+	{
+		// Generate a mask16 of all 1's (0 < 10 holds in every lane); presumably consumed inside CVT_STORE_S32_S8.
+		selector1 = _mm512_setzero_epi32();
+		selector2 = _mm512_set1_epi32( 10 );
+		__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+		// Store the results in downscaled type (int8 instead of int32).
+		// c[0,0-15]
+		CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+		// c[0,16-31]
+		CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+		// c[0,32-47]
+		CVT_STORE_S32_S8(c_int32_0p2,0,2);
+	}
+	else
+	{
+		// Store the int32 results directly to row 0 of C.
+		// c[0,0-15]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 0*16 ), c_int32_0p0 );
+
+		// c[0, 16-31]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 1*16 ), c_int32_0p1 );
+
+		// c[0,32-47]
+		_mm512_storeu_si512( c + ( rs_c * 0 ) + ( 2*16 ), c_int32_0p2 );
+	}
+}
+#endif
diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c
new file mode 100644
index 0000000000..f59c82721c
--- /dev/null
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_extMR_fringe_amd512vnni.c
@@ -0,0 +1,2770 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_s32_kern_macros.h"
+#include "lpgemm_s32_memcpy_macros.h"
+
+// This file contains micro-kernels with extended MR for n fringe kernels.
+// It was observed that increasing MR resulted in better multi-threaded
+// performance for inputs that predominantly call n fringe kernels. However,
+// slight regressions were observed in single-threaded performance.
+
+// 12xlt16 int8o32 fringe kernel
+__attribute__((aligned(64)))
+LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12xlt16)
+{
+	static void* post_ops_labels[] =
+				{
+				  &&POST_OPS_12xLT16_DISABLE,
+				  &&POST_OPS_BIAS_12xLT16,
+				  &&POST_OPS_RELU_12xLT16,
+				  &&POST_OPS_RELU_SCALE_12xLT16,
+				  &&POST_OPS_GELU_TANH_12xLT16,
+				  &&POST_OPS_GELU_ERF_12xLT16,
+				  &&POST_OPS_CLIP_12xLT16,
+				  &&POST_OPS_DOWNSCALE_12xLT16
+				};
+	dim_t MR = 12;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
+	__m512i a_int32_4;
+	__m512i a_int32_5;
+	__m512i a_int32_6;
+	__m512i a_int32_7;
+	__m512i a_int32_8;
+	__m512i a_int32_9;
+	__m512i a_int32_10;
+	__m512i a_int32_11;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		_mm_prefetch( b, _MM_HINT_T0 );
+		_mm_prefetch( a + ( MR * ps_a ) + ( 0 * 16 ), _MM_HINT_T1 );
+
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_6p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_7p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_8p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_9p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_10p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_11p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 16 extended elements each from B to 1 ZMM
+			// registers. It is to be noted that the B matrix is packed for use
+			// in vnni instructions and each load to ZMM register will have 4
+			// elements along k direction and 16 elements across n directions,
+			// so 4x16 elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_4 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_5 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[6,kr:kr+4].
+			a_int32_6 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 6 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[7,kr:kr+4].
+			a_int32_7 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 7 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Broadcast a[8,kr:kr+4].
+			a_int32_8 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 8 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[9,kr:kr+4].
+			a_int32_9 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 9 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[10,kr:kr+4].
+			a_int32_10 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 10 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[11,kr:kr+4].
+			a_int32_11 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 11 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[6,0-15] = a[6,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_6p0 = _mm512_dpbusd_epi32( c_int32_6p0, a_int32_6, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[7,0-15] = a[7,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_7p0 = _mm512_dpbusd_epi32( c_int32_7p0, a_int32_7, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[8,0-15] = a[8,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_8p0 = _mm512_dpbusd_epi32( c_int32_8p0, a_int32_8, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[9,0-15] = a[9,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_9p0 = _mm512_dpbusd_epi32( c_int32_9p0, a_int32_9, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[10,0-15] = a[10,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_10p0 = _mm512_dpbusd_epi32( c_int32_10p0, a_int32_10, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[11,0-15] = a[11,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_11p0 = _mm512_dpbusd_epi32( c_int32_11p0, a_int32_11, b0 );
+		}
+		__asm__(".p2align 6\n");
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_4 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_5 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+
+			// Broadcast a[6,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 6 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_6 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[6,0-15] = a[6,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_6p0 = _mm512_dpbusd_epi32( c_int32_6p0, a_int32_6, b0 );
+
+			// Broadcast a[7,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 7 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_7 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[7,0-15] = a[7,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_7p0 = _mm512_dpbusd_epi32( c_int32_7p0, a_int32_7, b0 );
+
+			// Broadcast a[8,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 8 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_8 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[8,0-15] = a[8,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_8p0 = _mm512_dpbusd_epi32( c_int32_8p0, a_int32_8, b0 );
+
+			// Broadcast a[9,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 9 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_9 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[9,0-15] = a[9,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_9p0 = _mm512_dpbusd_epi32( c_int32_9p0, a_int32_9, b0 );
+
+			// Broadcast a[10,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 10 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_10 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[10,0-15] = a[10,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_10p0 = _mm512_dpbusd_epi32( c_int32_10p0, a_int32_10, b0 );
+
+			// Broadcast a[11,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 11 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_11 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[11,0-15] = a[11,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_11p0 = _mm512_dpbusd_epi32( c_int32_11p0, a_int32_11, b0 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+
+			c_int32_6p0 = _mm512_mullo_epi32( selector1, c_int32_6p0 );
+
+			c_int32_7p0 = _mm512_mullo_epi32( selector1, c_int32_7p0 );
+
+			c_int32_8p0 = _mm512_mullo_epi32( selector1, c_int32_8p0 );
+
+			c_int32_9p0 = _mm512_mullo_epi32( selector1, c_int32_9p0 );
+
+			c_int32_10p0 = _mm512_mullo_epi32( selector1, c_int32_10p0 );
+
+			c_int32_11p0 = _mm512_mullo_epi32( selector1, c_int32_11p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_4p0, 4, 0, \
+								selector1, selector2 );
+
+				// c[5,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_5p0, 5, 0, \
+								selector1, selector2 );
+
+				// c[6,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_6p0, 6, 0, \
+								selector1, selector2 );
+
+				// c[7,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_7p0, 7, 0, \
+								selector1, selector2 );
+
+				// c[8,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_8p0, 8, 0, \
+								selector1, selector2 );
+
+				// c[9,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_9p0, 9, 0, \
+								selector1, selector2 );
+
+				// c[10,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_10p0, 10, 0, \
+								selector1, selector2 );
+
+				// c[11,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_11p0, 11, 0, \
+								selector1, selector2 );
+			}
+			else
+			{
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, ir, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, ir, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, ir, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, ir, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_4p0, ir, 4, 0, \
+								selector1, selector2);
+
+				// c[5,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_5p0, ir, 5, 0, \
+								selector1, selector2);
+
+				// c[6,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_6p0, ir, 6, 0, \
+								selector1, selector2);
+
+				// c[7,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_7p0, ir, 7, 0, \
+								selector1, selector2);
+
+				// c[8,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_8p0, ir, 8, 0, \
+								selector1, selector2);
+
+				// c[9,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_9p0, ir, 9, 0, \
+								selector1, selector2);
+
+				// c[10,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_10p0, ir, 10, 0, \
+								selector1, selector2);
+
+				// c[11,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_11p0, ir, 11, 0, \
+								selector1, selector2);
+			}
+		}
+
+        // Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_12xLT16:
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[6,0-15]
+			c_int32_6p0 = _mm512_add_epi32( selector1, c_int32_6p0 );
+
+			// c[7,0-15]
+			c_int32_7p0 = _mm512_add_epi32( selector1, c_int32_7p0 );
+
+			// c[8,0-15]
+			c_int32_8p0 = _mm512_add_epi32( selector1, c_int32_8p0 );
+
+			// c[9,0-15]
+			c_int32_9p0 = _mm512_add_epi32( selector1, c_int32_9p0 );
+
+			// c[10,0-15]
+			c_int32_10p0 = _mm512_add_epi32( selector1, c_int32_10p0 );
+
+			// c[11,0-15]
+			c_int32_11p0 = _mm512_add_epi32( selector1, c_int32_11p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_12xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[6,0-15]
+			c_int32_6p0 = _mm512_max_epi32( selector1, c_int32_6p0 );
+
+			// c[7,0-15]
+			c_int32_7p0 = _mm512_max_epi32( selector1, c_int32_7p0 );
+
+			// c[8,0-15]
+			c_int32_8p0 = _mm512_max_epi32( selector1, c_int32_8p0 );
+
+			// c[9,0-15]
+			c_int32_9p0 = _mm512_max_epi32( selector1, c_int32_9p0 );
+
+			// c[10,0-15]
+			c_int32_10p0 = _mm512_max_epi32( selector1, c_int32_10p0 );
+
+			// c[11,0-15]
+			c_int32_11p0 = _mm512_max_epi32( selector1, c_int32_11p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_12xLT16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[6, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_6p0)
+
+			// c[7, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_7p0)
+
+			// c[8, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_8p0)
+
+			// c[9, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_9p0)
+
+			// c[10, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_10p0)
+
+			// c[11, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_11p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_12xLT16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[6, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_6p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[7, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_7p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[8, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_8p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[9, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_9p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[10, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_10p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[11, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_11p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_12xLT16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[6, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_6p0, y, r, x, x_erf)
+
+			// c[7, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_7p0, y, r, x, x_erf)
+
+			// c[8, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_8p0, y, r, x, x_erf)
+
+			// c[9, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_9p0, y, r, x, x_erf)
+
+			// c[10, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_10p0, y, r, x, x_erf)
+
+			// c[11, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_11p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_12xLT16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[6, 0-15]
+			CLIP_S32_AVX512(c_int32_6p0, min, max)
+
+			// c[7, 0-15]
+			CLIP_S32_AVX512(c_int32_7p0, min, max)
+
+			// c[8, 0-15]
+			CLIP_S32_AVX512(c_int32_8p0, min, max)
+
+			// c[9, 0-15]
+			CLIP_S32_AVX512(c_int32_9p0, min, max)
+
+			// c[10, 0-15]
+			CLIP_S32_AVX512(c_int32_10p0, min, max)
+
+			// c[11, 0-15]
+			CLIP_S32_AVX512(c_int32_11p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_12xLT16:
+		{
+			// Typecast without data modification, safe operation.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			// c[4, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1);
+
+			// c[5, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1);
+
+			// c[6, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_6p0,selector1);
+
+			// c[7, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_7p0,selector1);
+
+			// c[8, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_8p0,selector1);
+
+			// c[9, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_9p0,selector1);
+
+			// c[10, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_10p0,selector1);
+
+			// c[11, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_11p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_12xLT16_DISABLE:
+		;
+
+		// Store the results.
+		if ( ( post_ops_attr.buf_downscale != NULL ) &&
+			 ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[6,0-15]
+			CVT_STORE_S32_S8(c_int32_6p0,6,0);
+
+			// c[7,0-15]
+			CVT_STORE_S32_S8(c_int32_7p0,7,0);
+
+			// c[8,0-15]
+			CVT_STORE_S32_S8(c_int32_8p0,8,0);
+
+			// c[9,0-15]
+			CVT_STORE_S32_S8(c_int32_9p0,9,0);
+
+			// c[10,0-15]
+			CVT_STORE_S32_S8(c_int32_10p0,10,0);
+
+			// c[11,0-15]
+			CVT_STORE_S32_S8(c_int32_11p0,11,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+
+			// Store the results.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 0 ) ), load_mask, c_int32_0p0
+			);
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 1 ) ), load_mask, c_int32_1p0
+			);
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 2 ) ), load_mask, c_int32_2p0
+			);
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 3 ) ), load_mask, c_int32_3p0
+			);
+
+			// c[4,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 4 ) ), load_mask, c_int32_4p0
+			);
+
+			// c[5,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 5 ) ), load_mask, c_int32_5p0
+			);
+
+			// c[6,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 6 ) ), load_mask, c_int32_6p0
+			);
+
+			// c[7,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 7 ) ), load_mask, c_int32_7p0
+			);
+
+			// c[8,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 8 ) ), load_mask, c_int32_8p0
+			);
+
+			// c[9,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 9 ) ), load_mask, c_int32_9p0
+			);
+
+			// c[10,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 10 ) ), load_mask, c_int32_10p0
+			);
+
+			// c[11,0-15]
+			_mm512_mask_storeu_epi32
+			(
+			  c + ( rs_c * ( ir + 11 ) ), load_mask, c_int32_11p0
+			);
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		lpgemm_rowvar_u8s8s32o32_6xlt16
+		(
+		  m_partial_pieces, k0,
+		  a, rs_a, cs_a, ps_a,
+		  b, rs_b, cs_b,
+		  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+		  alpha, beta, n0_rem,
+		  post_ops_list, post_ops_attr
+		);
+	}
+}
+
+// 12x16 int8o32 fringe kernel: processes C in panels of MR = 12 rows by 16 int32 columns (one ZMM accumulator per row); the m0 % 12 leftover rows are delegated to lpgemm_rowvar_u8s8s32o32_6x16.
+__attribute__((aligned(64)))
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_12x16)
+{
+	static void* post_ops_labels[] = // Label addresses (GNU "labels as values"); presumably indexed by the POST_OP_LABEL_* jump macros - confirm against their definition.
+				{
+				  &&POST_OPS_12x16_DISABLE,
+				  &&POST_OPS_BIAS_12x16,
+				  &&POST_OPS_RELU_12x16,
+				  &&POST_OPS_RELU_SCALE_12x16,
+				  &&POST_OPS_GELU_TANH_12x16,
+				  &&POST_OPS_GELU_ERF_12x16,
+				  &&POST_OPS_CLIP_12x16,
+				  &&POST_OPS_DOWNSCALE_12x16
+				};
+	dim_t MR = 12; // Rows of A/C handled per iteration of the ir loop.
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4; // Number of complete groups of 4 along k.
+	dim_t k_partial_pieces = k0 % 4; // Trailing k elements (0-3), handled after the main k loop.
+
+	// B matrix storage.
+	__m512i b0;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
+	__m512i a_int32_4;
+	__m512i a_int32_5;
+	__m512i a_int32_6;
+	__m512i a_int32_7;
+	__m512i a_int32_8;
+	__m512i a_int32_9;
+	__m512i a_int32_10;
+	__m512i a_int32_11;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		_mm_prefetch( b, _MM_HINT_T0 );
+		_mm_prefetch( a + ( MR * ps_a ) + ( 0 * 16 ), _MM_HINT_T1 );
+
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_6p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_7p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_8p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_9p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_10p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_11p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 16 elements each from B to 1 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_4 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_5 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[6,kr:kr+4].
+			a_int32_6 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 6 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[7,kr:kr+4].
+			a_int32_7 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 7 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Broadcast a[8,kr:kr+4].
+			a_int32_8 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 8 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[9,kr:kr+4].
+			a_int32_9 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 9 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[10,kr:kr+4].
+			a_int32_10 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 10 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[11,kr:kr+4].
+			a_int32_11 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 11 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[6,0-15] = a[6,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_6p0 = _mm512_dpbusd_epi32( c_int32_6p0, a_int32_6, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[7,0-15] = a[7,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_7p0 = _mm512_dpbusd_epi32( c_int32_7p0, a_int32_7, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[8,0-15] = a[8,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_8p0 = _mm512_dpbusd_epi32( c_int32_8p0, a_int32_8, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[9,0-15] = a[9,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_9p0 = _mm512_dpbusd_epi32( c_int32_9p0, a_int32_9, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[10,0-15] = a[10,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_10p0 = _mm512_dpbusd_epi32( c_int32_10p0, a_int32_10, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[11,0-15] = a[11,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_11p0 = _mm512_dpbusd_epi32( c_int32_11p0, a_int32_11, b0 );
+		}
+		__asm__(".p2align 6\n");
+		// Handle k remainder: only the k0 % 4 trailing bytes of each A row are loaded (load_mask has that many low bits set); the masked-off bytes are zeroed by the maskz load.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_4 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_5 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+
+			// Broadcast a[6,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 6 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_6 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[6,0-15] = a[6,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_6p0 = _mm512_dpbusd_epi32( c_int32_6p0, a_int32_6, b0 );
+
+			// Broadcast a[7,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 7 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_7 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[7,0-15] = a[7,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_7p0 = _mm512_dpbusd_epi32( c_int32_7p0, a_int32_7, b0 );
+
+			// Broadcast a[8,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 8 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_8 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[8,0-15] = a[8,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_8p0 = _mm512_dpbusd_epi32( c_int32_8p0, a_int32_8, b0 );
+
+			// Broadcast a[9,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 9 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_9 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[9,0-15] = a[9,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_9p0 = _mm512_dpbusd_epi32( c_int32_9p0, a_int32_9, b0 );
+
+			// Broadcast a[10,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 10 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_10 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[10,0-15] = a[10,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_10p0 = _mm512_dpbusd_epi32( c_int32_10p0, a_int32_10, b0 );
+
+			// Broadcast a[11,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 11 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_11 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[11,0-15] = a[11,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_11p0 = _mm512_dpbusd_epi32( c_int32_11p0, a_int32_11, b0 );
+		}
+
+		// Load alpha and beta (selector1/selector2 are later overwritten and reused as scratch registers by the post-op sections).
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+
+			c_int32_6p0 = _mm512_mullo_epi32( selector1, c_int32_6p0 );
+
+			c_int32_7p0 = _mm512_mullo_epi32( selector1, c_int32_7p0 );
+
+			c_int32_8p0 = _mm512_mullo_epi32( selector1, c_int32_8p0 );
+
+			c_int32_9p0 = _mm512_mullo_epi32( selector1, c_int32_9p0 );
+
+			c_int32_10p0 = _mm512_mullo_epi32( selector1, c_int32_10p0 );
+
+			c_int32_11p0 = _mm512_mullo_epi32( selector1, c_int32_11p0 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15]
+				S8_S32_BETA_OP(c_int32_0p0,ir,0,0,selector1,selector2);
+
+				// c[1:0-15]
+				S8_S32_BETA_OP(c_int32_1p0,ir,1,0,selector1,selector2);
+
+				// c[2:0-15]
+				S8_S32_BETA_OP(c_int32_2p0,ir,2,0,selector1,selector2);
+
+				// c[3:0-15]
+				S8_S32_BETA_OP(c_int32_3p0,ir,3,0,selector1,selector2);
+
+				// c[4:0-15]
+				S8_S32_BETA_OP(c_int32_4p0,ir,4,0,selector1,selector2);
+
+				// c[5:0-15]
+				S8_S32_BETA_OP(c_int32_5p0,ir,5,0,selector1,selector2);
+
+				// c[6:0-15]
+				S8_S32_BETA_OP(c_int32_6p0,ir,6,0,selector1,selector2);
+
+				// c[7:0-15]
+				S8_S32_BETA_OP(c_int32_7p0,ir,7,0,selector1,selector2);
+
+				// c[8:0-15]
+				S8_S32_BETA_OP(c_int32_8p0,ir,8,0,selector1,selector2);
+
+				// c[9:0-15]
+				S8_S32_BETA_OP(c_int32_9p0,ir,9,0,selector1,selector2);
+
+				// c[10:0-15]
+				S8_S32_BETA_OP(c_int32_10p0,ir,10,0,selector1,selector2);
+
+				// c[11:0-15]
+				S8_S32_BETA_OP(c_int32_11p0,ir,11,0,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15]
+				S32_S32_BETA_OP(c_int32_0p0,ir,0,0,selector1,selector2);
+
+				// c[1:0-15]
+				S32_S32_BETA_OP(c_int32_1p0,ir,1,0,selector1,selector2);
+
+				// c[2:0-15]
+				S32_S32_BETA_OP(c_int32_2p0,ir,2,0,selector1,selector2);
+
+				// c[3:0-15]
+				S32_S32_BETA_OP(c_int32_3p0,ir,3,0,selector1,selector2);
+
+				// c[4:0-15]
+				S32_S32_BETA_OP(c_int32_4p0,ir,4,0,selector1,selector2);
+
+				// c[5:0-15]
+				S32_S32_BETA_OP(c_int32_5p0,ir,5,0,selector1,selector2);
+
+				// c[6:0-15]
+				S32_S32_BETA_OP(c_int32_6p0,ir,6,0,selector1,selector2);
+
+				// c[7:0-15]
+				S32_S32_BETA_OP(c_int32_7p0,ir,7,0,selector1,selector2);
+
+				// c[8:0-15]
+				S32_S32_BETA_OP(c_int32_8p0,ir,8,0,selector1,selector2);
+
+				// c[9:0-15]
+				S32_S32_BETA_OP(c_int32_9p0,ir,9,0,selector1,selector2);
+
+				// c[10:0-15]
+				S32_S32_BETA_OP(c_int32_10p0,ir,10,0,selector1,selector2);
+
+				// c[11:0-15]
+				S32_S32_BETA_OP(c_int32_11p0,ir,11,0,selector1,selector2);
+			}
+		}
+
+        // Post Ops: each labelled section applies one op to all 12 accumulators; the trailing macro presumably moves on to the next op in the list - confirm against its definition.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_12x16:
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[6,0-15]
+			c_int32_6p0 = _mm512_add_epi32( selector1, c_int32_6p0 );
+
+			// c[7,0-15]
+			c_int32_7p0 = _mm512_add_epi32( selector1, c_int32_7p0 );
+
+			// c[8,0-15]
+			c_int32_8p0 = _mm512_add_epi32( selector1, c_int32_8p0 );
+
+			// c[9,0-15]
+			c_int32_9p0 = _mm512_add_epi32( selector1, c_int32_9p0 );
+
+			// c[10,0-15]
+			c_int32_10p0 = _mm512_add_epi32( selector1, c_int32_10p0 );
+
+			// c[11,0-15]
+			c_int32_11p0 = _mm512_add_epi32( selector1, c_int32_11p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_12x16:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[6,0-15]
+			c_int32_6p0 = _mm512_max_epi32( selector1, c_int32_6p0 );
+
+			// c[7,0-15]
+			c_int32_7p0 = _mm512_max_epi32( selector1, c_int32_7p0 );
+
+			// c[8,0-15]
+			c_int32_8p0 = _mm512_max_epi32( selector1, c_int32_8p0 );
+
+			// c[9,0-15]
+			c_int32_9p0 = _mm512_max_epi32( selector1, c_int32_9p0 );
+
+			// c[10,0-15]
+			c_int32_10p0 = _mm512_max_epi32( selector1, c_int32_10p0 );
+
+			// c[11,0-15]
+			c_int32_11p0 = _mm512_max_epi32( selector1, c_int32_11p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_12x16:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[6, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_6p0)
+
+			// c[7, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_7p0)
+
+			// c[8, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_8p0)
+
+			// c[9, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_9p0)
+
+			// c[10, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_10p0)
+
+			// c[11, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_11p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_12x16:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[6, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_6p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[7, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_7p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[8, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_8p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[9, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_9p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[10, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_10p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[11, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_11p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_12x16:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[6, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_6p0, y, r, x, x_erf)
+
+			// c[7, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_7p0, y, r, x, x_erf)
+
+			// c[8, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_8p0, y, r, x, x_erf)
+
+			// c[9, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_9p0, y, r, x, x_erf)
+
+			// c[10, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_10p0, y, r, x, x_erf)
+
+			// c[11, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_11p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_12x16:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[6, 0-15]
+			CLIP_S32_AVX512(c_int32_6p0, min, max)
+
+			// c[7, 0-15]
+			CLIP_S32_AVX512(c_int32_7p0, min, max)
+
+			// c[8, 0-15]
+			CLIP_S32_AVX512(c_int32_8p0, min, max)
+
+			// c[9, 0-15]
+			CLIP_S32_AVX512(c_int32_9p0, min, max)
+
+			// c[10, 0-15]
+			CLIP_S32_AVX512(c_int32_10p0, min, max)
+
+			// c[11, 0-15]
+			CLIP_S32_AVX512(c_int32_11p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_12x16:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		// c[6, 0-15]
+		CVT_MULRND_CVT32(c_int32_6p0,selector1);
+
+		// c[7, 0-15]
+		CVT_MULRND_CVT32(c_int32_7p0,selector1);
+
+		// c[8, 0-15]
+		CVT_MULRND_CVT32(c_int32_8p0,selector1);
+
+		// c[9, 0-15]
+		CVT_MULRND_CVT32(c_int32_9p0,selector1);
+
+		// c[10, 0-15]
+		CVT_MULRND_CVT32(c_int32_10p0,selector1);
+
+		// c[11, 0-15]
+		CVT_MULRND_CVT32(c_int32_11p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_12x16_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's (0 < 10 holds in every lane, so every mask bit is set).
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[6,0-15]
+			CVT_STORE_S32_S8(c_int32_6p0,6,0);
+
+			// c[7,0-15]
+			CVT_STORE_S32_S8(c_int32_7p0,7,0);
+
+			// c[8,0-15]
+			CVT_STORE_S32_S8(c_int32_8p0,8,0);
+
+			// c[9,0-15]
+			CVT_STORE_S32_S8(c_int32_9p0,9,0);
+
+			// c[10,0-15]
+			CVT_STORE_S32_S8(c_int32_10p0,10,0);
+
+			// c[11,0-15]
+			CVT_STORE_S32_S8(c_int32_11p0,11,0);
+		}
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[6,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 6 ) ) + ( 0*16 ), c_int32_6p0 );
+
+			// c[7,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 7 ) ) + ( 0*16 ), c_int32_7p0 );
+
+			// c[8,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 8 ) ) + ( 0*16 ), c_int32_8p0 );
+
+			// c[9,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 9 ) ) + ( 0*16 ), c_int32_9p0 );
+
+			// c[10,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 10 ) ) + ( 0*16 ), c_int32_10p0 );
+
+			// c[11,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 11 ) ) + ( 0*16 ), c_int32_11p0 );
+		}
+
+		a = a + ( MR * ps_a ); // Advance A by MR * ps_a for the next ir iteration.
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		lpgemm_rowvar_u8s8s32o32_6x16
+		(
+		  m_partial_pieces, k0,
+		  a, rs_a, cs_a, ps_a,
+		  b, rs_b, cs_b,
+		  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+		  alpha, beta,
+		  post_ops_list, post_ops_attr
+		);
+	}
+}
+
+// 9x32 int8o32 fringe kernel: computes C in tiles of 9 rows x 32 columns (2 ZMM of 16 int32 per row); leftover rows are handed to the 6x32 kernel.
+__attribute__((aligned(64)))
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_9x32)
+{
+	static void* post_ops_labels[] = // label addresses; not referenced directly below - presumably indexed by the POST_OP_LABEL_* jump macros (defined elsewhere)
+				{
+				  &&POST_OPS_9x32_DISABLE,
+				  &&POST_OPS_BIAS_9x32,
+				  &&POST_OPS_RELU_9x32,
+				  &&POST_OPS_RELU_SCALE_9x32,
+				  &&POST_OPS_GELU_TANH_9x32,
+				  &&POST_OPS_GELU_ERF_9x32,
+				  &&POST_OPS_CLIP_9x32,
+				  &&POST_OPS_DOWNSCALE_9x32
+				};
+	dim_t MR = 9;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: b0/b1 are loaded from column offsets cs_b * 0 and cs_b * 1 respectively.
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: each register holds one 4-byte group of an A row broadcast to all lanes (selector1/selector2 below are also used this way for rows 4 and 5).
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
+
+	__m512i selector1;
+	__m512i selector2;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		_mm_prefetch( b, _MM_HINT_T0 );
+		_mm_prefetch( a + ( MR * ps_a ) + ( 0 * 16 ), _MM_HINT_T1 );
+
+		// Registers to use for accumulating C, named c_int32_<row>p<half>: 9 rows x 2 registers, all zero-initialized.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+		__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+		__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+		__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+		__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+		__m512i c_int32_4p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+		__m512i c_int32_5p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_6p0 = _mm512_setzero_epi32();
+		__m512i c_int32_6p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_7p0 = _mm512_setzero_epi32();
+		__m512i c_int32_7p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_8p0 = _mm512_setzero_epi32();
+		__m512i c_int32_8p1 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 32 elements each from B to 2 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[4,kr:kr+4] (selector1 serves as an extra A register here).
+			selector1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[5,kr:kr+4] (selector2 serves as an extra A register here).
+			selector2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_2, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_3, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, selector1, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, selector1, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, selector2, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, selector2, b1 );
+
+			// Broadcast a[6,kr:kr+4] (a_int32_0 is reused; its row 0 value has already been consumed above).
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 6 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[7,kr:kr+4] (a_int32_1 is reused; its row 1 value has already been consumed above).
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 7 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[8,kr:kr+4] (a_int32_2 is reused; its row 2 value has already been consumed above).
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 8 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[6,0-31] = a[6,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_6p0 = _mm512_dpbusd_epi32( c_int32_6p0, a_int32_0, b0 );
+			c_int32_6p1 = _mm512_dpbusd_epi32( c_int32_6p1, a_int32_0, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[7,0-31] = a[7,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_7p0 = _mm512_dpbusd_epi32( c_int32_7p0, a_int32_1, b0 );
+			c_int32_7p1 = _mm512_dpbusd_epi32( c_int32_7p1, a_int32_1, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[8,0-31] = a[8,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_8p0 = _mm512_dpbusd_epi32( c_int32_8p0, a_int32_2, b0 );
+			c_int32_8p1 = _mm512_dpbusd_epi32( c_int32_8p1, a_int32_2, b1 );
+		}
+		__asm__(".p2align 6\n");
+		// Handle k remainder (k0 % 4 bytes): A is read with a zeroing byte mask; B is still read as a full register - assumes packed B is zero-padded along k, TODO confirm.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // low k_partial_pieces bits set: one bit per valid A byte
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+			// Broadcast the zero-padded k tail of row 0 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+			// Broadcast the zero-padded k tail of row 1 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+
+			// Broadcast the zero-padded k tail of row 2 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_2, b1 );
+
+			// Broadcast the zero-padded k tail of row 3 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_3, b1 );
+
+			// Broadcast the zero-padded k tail of row 4 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			selector1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, selector1, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, selector1, b1 );
+
+			// Broadcast the zero-padded k tail of row 5 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			selector2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, selector2, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, selector2, b1 );
+
+			// Broadcast the zero-padded k tail of row 6 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 6 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[6,0-31] = a[6,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_6p0 = _mm512_dpbusd_epi32( c_int32_6p0, a_int32_0, b0 );
+			c_int32_6p1 = _mm512_dpbusd_epi32( c_int32_6p1, a_int32_0, b1 );
+
+			// Broadcast the zero-padded k tail of row 7 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 7 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[7,0-31] = a[7,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_7p0 = _mm512_dpbusd_epi32( c_int32_7p0, a_int32_1, b0 );
+			c_int32_7p1 = _mm512_dpbusd_epi32( c_int32_7p1, a_int32_1, b1 );
+
+			// Broadcast the zero-padded k tail of row 8 of A (masked byte load at k block k_full_pieces).
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 8 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[8,0-31] = a[8,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_8p0 = _mm512_dpbusd_epi32( c_int32_8p0, a_int32_2, b0 );
+			c_int32_8p1 = _mm512_dpbusd_epi32( c_int32_8p1, a_int32_2, b1 );
+		}
+
+		// Broadcast alpha and beta to all lanes (selector1/selector2 no longer hold A values at this point).
+		selector1 = _mm512_set1_epi32( alpha );
+		selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale every accumulator by alpha (this branch is skipped when alpha == 1).
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+
+			c_int32_6p0 = _mm512_mullo_epi32( selector1, c_int32_6p0 );
+			c_int32_6p1 = _mm512_mullo_epi32( selector1, c_int32_6p1 );
+
+			c_int32_7p0 = _mm512_mullo_epi32( selector1, c_int32_7p0 );
+			c_int32_7p1 = _mm512_mullo_epi32( selector1, c_int32_7p1 );
+
+			c_int32_8p0 = _mm512_mullo_epi32( selector1, c_int32_8p0 );
+			c_int32_8p1 = _mm512_mullo_epi32( selector1, c_int32_8p1 );
+		}
+
+		// Scale C by beta (skipped when beta == 0). The *_BETA_OP2 macros are defined elsewhere; the S8 variant is chosen when a downscale buffer is set and this is the first k block.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31]
+				S8_S32_BETA_OP2(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31]
+				S8_S32_BETA_OP2(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31]
+				S8_S32_BETA_OP2(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31]
+				S8_S32_BETA_OP2(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31]
+				S8_S32_BETA_OP2(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31]
+				S8_S32_BETA_OP2(ir,5,selector1,selector2);
+
+				// c[6:0-15,16-31]
+				S8_S32_BETA_OP2(ir,6,selector1,selector2);
+
+				// c[7:0-15,16-31]
+				S8_S32_BETA_OP2(ir,7,selector1,selector2);
+
+				// c[8:0-15,16-31]
+				S8_S32_BETA_OP2(ir,8,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31]
+				S32_S32_BETA_OP2(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31]
+				S32_S32_BETA_OP2(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31]
+				S32_S32_BETA_OP2(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31]
+				S32_S32_BETA_OP2(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31]
+				S32_S32_BETA_OP2(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31]
+				S32_S32_BETA_OP2(ir,5,selector1,selector2);
+
+				// c[6:0-15,16-31]
+				S32_S32_BETA_OP2(ir,6,selector1,selector2);
+
+				// c[7:0-15,16-31]
+				S32_S32_BETA_OP2(ir,7,selector1,selector2);
+
+				// c[8:0-15,16-31]
+				S32_S32_BETA_OP2(ir,8,selector1,selector2);
+			}
+		}
+
+        // Post Ops: walk the post-op list starting at post_ops_list; the jump macros (defined elsewhere) presumably select the matching label below for each entry.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_9x32: // add int32 bias values read from op_args1 at column offset post_op_c_j to every row
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[1, 16-31]
+			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[2, 16-31]
+			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[3, 16-31]
+			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[4, 16-31]
+			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[5, 16-31]
+			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
+
+			// c[6,0-15]
+			c_int32_6p0 = _mm512_add_epi32( selector1, c_int32_6p0 );
+
+			// c[6, 16-31]
+			c_int32_6p1 = _mm512_add_epi32( selector2, c_int32_6p1 );
+
+			// c[7,0-15]
+			c_int32_7p0 = _mm512_add_epi32( selector1, c_int32_7p0 );
+
+			// c[7, 16-31]
+			c_int32_7p1 = _mm512_add_epi32( selector2, c_int32_7p1 );
+
+			// c[8,0-15]
+			c_int32_8p0 = _mm512_add_epi32( selector1, c_int32_8p0 );
+
+			// c[8, 16-31]
+			c_int32_8p1 = _mm512_add_epi32( selector2, c_int32_8p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_9x32: // clamp negative lanes to zero: max( 0, c )
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[1,16-31]
+			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[2,16-31]
+			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[3,16-31]
+			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[4,16-31]
+			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[5,16-31]
+			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
+
+			// c[6,0-15]
+			c_int32_6p0 = _mm512_max_epi32( selector1, c_int32_6p0 );
+
+			// c[6, 16-31]
+			c_int32_6p1 = _mm512_max_epi32( selector1, c_int32_6p1 );
+
+			// c[7,0-15]
+			c_int32_7p0 = _mm512_max_epi32( selector1, c_int32_7p0 );
+
+			// c[7,16-31]
+			c_int32_7p1 = _mm512_max_epi32( selector1, c_int32_7p1 );
+
+			// c[8,0-15]
+			c_int32_8p0 = _mm512_max_epi32( selector1, c_int32_8p0 );
+
+			// c[8,16-31]
+			c_int32_8p1 = _mm512_max_epi32( selector1, c_int32_8p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_9x32: // selector1 = 0, selector2 = int32 scale read from op_args2; the macro presumably uses both plus relu_cmp_mask - verify against its definition
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
+
+			// c[6, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_6p0)
+
+			// c[6, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_6p1)
+
+			// c[7, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_7p0)
+
+			// c[7, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_7p1)
+
+			// c[8, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_8p0)
+
+			// c[8, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_8p1)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_9x32: // applies GELU_TANH_S32_AVX512 (defined elsewhere) to every accumulator; the float/int locals are scratch registers passed to the macro
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_5p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[6, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_6p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[6, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_6p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[7, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_7p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[7, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_7p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[8, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_8p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[8, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_8p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_9x32: // applies GELU_ERF_S32_AVX512 (defined elsewhere) to every accumulator; the float locals are scratch registers passed to the macro
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_5p1, y, r, x, x_erf)
+
+			// c[6, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_6p0, y, r, x, x_erf)
+
+			// c[6, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_6p1, y, r, x, x_erf)
+
+			// c[7, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_7p0, y, r, x, x_erf)
+
+			// c[7, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_7p1, y, r, x, x_erf)
+
+			// c[8, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_8p0, y, r, x, x_erf)
+
+			// c[8, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_8p1, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_9x32: // bounds are int32 scalars: min from op_args2, max from op_args3; CLIP_S32_AVX512 is defined elsewhere
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			// c[6, 0-15]
+			CLIP_S32_AVX512(c_int32_6p0, min, max)
+
+			// c[6, 16-31]
+			CLIP_S32_AVX512(c_int32_6p1, min, max)
+
+			// c[7, 0-15]
+			CLIP_S32_AVX512(c_int32_7p0, min, max)
+
+			// c[7, 16-31]
+			CLIP_S32_AVX512(c_int32_7p1, min, max)
+
+			// c[8, 0-15]
+			CLIP_S32_AVX512(c_int32_8p0, min, max)
+
+			// c[8, 16-31]
+			CLIP_S32_AVX512(c_int32_8p1, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_9x32: // scale factors are read from scale_factor (cast to float*) at column offset post_op_c_j as raw bits into integer registers; CVT_MULRND_CVT32 presumably reinterprets them as floats - verify
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		// c[5, 16-31]
+		CVT_MULRND_CVT32(c_int32_5p1,selector2);
+
+		// c[6, 0-15]
+		CVT_MULRND_CVT32(c_int32_6p0,selector1);
+
+		// c[6, 16-31]
+		CVT_MULRND_CVT32(c_int32_6p1,selector2);
+
+		// c[7, 0-15]
+		CVT_MULRND_CVT32(c_int32_7p0,selector1);
+
+		// c[7, 16-31]
+		CVT_MULRND_CVT32(c_int32_7p1,selector2);
+
+		// c[8, 0-15]
+		CVT_MULRND_CVT32(c_int32_8p0,selector1);
+
+		// c[8, 16-31]
+		CVT_MULRND_CVT32(c_int32_8p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_9x32_DISABLE: // end of the post-op chain; the empty statement below gives the label something to attach to
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's (0 < 10 holds in every lane); mask_all1 is not used directly below - presumably consumed inside CVT_STORE_S32_S8.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+
+			// c[6,0-15]
+			CVT_STORE_S32_S8(c_int32_6p0,6,0);
+
+			// c[6,16-31]
+			CVT_STORE_S32_S8(c_int32_6p1,6,1);
+
+			// c[7,0-15]
+			CVT_STORE_S32_S8(c_int32_7p0,7,0);
+
+			// c[7,16-31]
+			CVT_STORE_S32_S8(c_int32_7p1,7,1);
+
+			// c[8,0-15]
+			CVT_STORE_S32_S8(c_int32_8p0,8,0);
+
+			// c[8,16-31]
+			CVT_STORE_S32_S8(c_int32_8p1,8,1);
+		}
+		else
+		{
+			// Store the results as int32 directly into C at rows ir..ir+8.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+
+			// c[6,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 6 ) ) + ( 0*16 ), c_int32_6p0 );
+
+			// c[6, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 6 ) ) + ( 1*16 ), c_int32_6p1 );
+
+			// c[7,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 7 ) ) + ( 0*16 ), c_int32_7p0 );
+
+			// c[7,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 7 ) ) + ( 1*16 ), c_int32_7p1 );
+
+			// c[8,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 8 ) ) + ( 0*16 ), c_int32_8p0 );
+
+			// c[8,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 8 ) ) + ( 1*16 ), c_int32_8p1 );
+		}
+
+		a = a + ( MR * ps_a ); // advance A by MR * ps_a for the next 9-row tile
+		post_ops_attr.post_op_c_i += MR; // advance post_op_c_i by the MR rows just processed
+	}
+
+	if ( m_partial_pieces > 0 ) // leftover m0 % MR rows (1..8) are delegated to the 6x32 kernel, which receives the row count as its m0
+	{
+		lpgemm_rowvar_u8s8s32o32_6x32
+		(
+		  m_partial_pieces, k0,
+		  a, rs_a, cs_a, ps_a,
+		  b, rs_b, cs_b,
+		  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+		  alpha, beta,
+		  post_ops_list, post_ops_attr
+		);
+	}
+}
+
+#endif
diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c
new file mode 100644
index 0000000000..d5f86338a6
--- /dev/null
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_n_fringe_amd512vnni.c
@@ -0,0 +1,2865 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-23, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <immintrin.h>
+#include <string.h>
+#include "blis.h"
+
+#ifdef BLIS_ADDON_LPGEMM
+
+#include "lpgemm_s32_kern_macros.h"
+#include "lpgemm_s32_memcpy_macros.h"
+
+// 6xlt16 u8s8s32o32 n-fringe kernel: processes C in 6-row panels over the n0_rem columns selected by a 16-bit lane mask; the leftover m0 % 6 rows are forwarded to the matching [1-5]xlt16 kernel.
+LPGEMM_N_LT_NR0_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6xlt16)
+{
+	static void* post_ops_labels[] = // Computed-goto targets; presumably indexed by the POST_OP_LABEL_* macros (defined elsewhere) - confirm.
+						{
+						  &&POST_OPS_6xLT16_DISABLE,
+						  &&POST_OPS_BIAS_6xLT16,
+						  &&POST_OPS_RELU_6xLT16,
+						  &&POST_OPS_RELU_SCALE_6xLT16,
+						  &&POST_OPS_GELU_TANH_6xLT16,
+						  &&POST_OPS_GELU_ERF_6xLT16,
+						  &&POST_OPS_CLIP_6xLT16,
+						  &&POST_OPS_DOWNSCALE_6xLT16
+						};
+	dim_t MR = 6; // Rows of C produced per iteration of the main loop.
+	dim_t m_full_pieces = m0 / MR; // Number of complete MR-row panels.
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR; // Rows covered by the complete panels.
+	dim_t m_partial_pieces = m0 % MR; // Leftover rows (0-5), dispatched after the main loop.
+
+	dim_t k_full_pieces = k0 / 4; // Each dpbusd step consumes 4 k elements.
+	dim_t k_partial_pieces = k0 % 4; // Trailing 1-3 k elements, handled with a masked load of A.
+
+	// B matrix storage.
+	__m512i b0;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
+	__m512i a_int32_4;
+	__m512i a_int32_5;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// One ZMM accumulator (16 int32 lanes) per row of the 6-row panel.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 16 extended elements each from B to 1 ZMM
+			// registers. It is to be noted that the B matrix is packed for use
+			// in vnni instructions and each load to ZMM register will have 4
+			// elements along k direction and 16 elements across n directions,
+			// so 4x16 elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4]: 4 consecutive u8 values read as one 32-bit word.
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_4 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_5 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+		}
+		// Handle k remainder (the k0 % 4 trailing elements).
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one bit per valid byte of A.
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Full-width load; presumably packed B is padded along k - confirm.
+
+			// Masked-load the remaining k_partial_pieces bytes of row 0 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 1 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 2 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 3 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 4 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_4 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 5 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_5 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+		}
+
+		// Broadcast alpha and beta to every 32-bit lane.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+		}
+
+		// Scale C by beta; the whole step is skipped when beta == 0.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{ // Downscale buffer present and first k block: use the S8 beta macro (defined elsewhere; presumably reads int8 data from buf_downscale - confirm).
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // Low n0_rem lanes enabled.
+
+				// c[0,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_0p0, 0, 0, \
+								selector1, selector2 );
+
+				// c[1,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_1p0, 1, 0, \
+								selector1, selector2 );
+
+				// c[2,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_2p0, 2, 0, \
+								selector1, selector2 );
+
+				// c[3,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_3p0, 3, 0, \
+								selector1, selector2 );
+
+				// c[4,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_4p0, 4, 0, \
+								selector1, selector2 );
+
+				// c[5,0-15]
+				S8_S32_BETA_OP_NLT16F_MASK( load_mask, c_int32_5p0, 5, 0, \
+								selector1, selector2 );
+			}
+			else
+			{ // Otherwise use the S32 beta macro, which additionally takes the panel row index ir.
+				__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // Low n0_rem lanes enabled.
+
+				// c[0,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_0p0, ir, 0, 0, \
+								selector1, selector2);
+
+				// c[1,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_1p0, ir, 1, 0, \
+								selector1, selector2);
+
+				// c[2,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_2p0, ir, 2, 0, \
+								selector1, selector2);
+
+				// c[3,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_3p0, ir, 3, 0, \
+								selector1, selector2);
+
+				// c[4,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_4p0, ir, 4, 0, \
+								selector1, selector2);
+
+				// c[5,0-15]
+				S32_S32_BETA_OP_NLT16F_MASK(load_mask, c_int32_5p0, ir, 5, 0, \
+								selector1, selector2);
+			}
+		}
+
+        // Post Ops: the jump macros (defined elsewhere) presumably walk post_ops_list_temp and jump to the matching label below - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6xLT16:
+		{ // Bias: add one int32 vector (op_args1 offset by post_op_c_j, masked to n0_rem lanes) to every row.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( int32_t* )post_ops_list_temp->op_args1 +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6xLT16:
+		{ // ReLU: lane-wise max( 0, c ).
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6xLT16:
+		{ // Scaled ReLU: a zero vector and the int32 scalar from op_args2 are set up here; presumably consumed, with relu_cmp_mask, by RELU_SCALE_OP_S32_AVX512 (defined elsewhere) - confirm.
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6xLT16:
+		{ // GELU (tanh variant, per the macro name); the locals below are scratch registers handed to the macro.
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6xLT16:
+		{ // GELU (erf variant, per the macro name); the locals below are scratch registers handed to the macro.
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6xLT16:
+		{ // Clip: bounds are int32 scalars, min from op_args2 and max from op_args3, broadcast to all lanes.
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6xLT16:
+		{
+			// The float scale factors are loaded as raw 32-bit lanes into an integer register; the bits are not converted.
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) );
+			selector1 = _mm512_maskz_loadu_epi32
+			(
+			  load_mask,
+			  ( ( float* )post_ops_list_temp->scale_factor +
+				post_ops_attr.post_op_c_j )
+			);
+
+			// c[0, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_0p0,selector1);
+
+			// c[1, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_1p0,selector1);
+
+			// c[2, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_2p0,selector1);
+
+			// c[3, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_3p0,selector1);
+
+			// c[4, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_4p0,selector1);
+
+			// c[5, 0-15]
+			CVT_MULRND_CVT32_LT16(c_int32_5p0,selector1);
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_6xLT16_DISABLE:
+		;
+
+		// Store the results: downscaled path on the last k block when a downscale buffer is set, plain int32 masked store otherwise.
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			__mmask16 mask_all1 = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // Despite the name, only the low n0_rem bits are set; the name is presumably what CVT_STORE_S32_S8 expects - confirm.
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+		}
+		else
+		{
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - n0_rem ) ); // Used as the store mask here: low n0_rem lanes only.
+
+			// Masked store: only the low n0_rem int32 lanes of each row are written to C.
+			// c[0,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 0 ) ), load_mask, c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 1 ) ), load_mask, c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 2 ) ), load_mask, c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 3 ) ), load_mask, c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 4 ) ), load_mask, c_int32_4p0 );
+
+			// c[5,0-15]
+			_mm512_mask_storeu_epi32( c + ( rs_c * ( ir + 5 ) ), load_mask, c_int32_5p0 );
+		}
+
+		a = a + ( MR * ps_a ); // Advance A to the next MR-row panel.
+		post_ops_attr.post_op_c_i += MR; // Keep the row offset used by the post-op/downscale macros in step with ir.
+	}
+
+	if ( m_partial_pieces > 0 )
+	{ // Leftover rows: a and post_ops_attr have already been advanced past the full panels by the loop above.
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 ); // cs_a == 4 passes through; otherwise rescaled from 6 rows to 5 (presumably the packed-A stride - confirm).
+			lpgemm_rowvar_u8s8s32o32_5xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_u8s8s32o32_4xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_u8s8s32o32_3xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_u8s8s32o32_2xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_u8s8s32o32_1xlt16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta, n0_rem,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x16 u8s8s32o32 n-fringe kernel: processes C in 6-row panels over a full 16-column (one ZMM) strip; the leftover m0 % 6 rows are forwarded to the matching [1-5]x16 kernel.
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x16)
+{
+	static void* post_ops_labels[] = // Computed-goto targets; presumably indexed by the POST_OP_LABEL_* macros (defined elsewhere) - confirm.
+						{
+						  &&POST_OPS_6x16_DISABLE,
+						  &&POST_OPS_BIAS_6x16,
+						  &&POST_OPS_RELU_6x16,
+						  &&POST_OPS_RELU_SCALE_6x16,
+						  &&POST_OPS_GELU_TANH_6x16,
+						  &&POST_OPS_GELU_ERF_6x16,
+						  &&POST_OPS_CLIP_6x16,
+						  &&POST_OPS_DOWNSCALE_6x16
+						};
+	dim_t MR = 6; // Rows of C produced per iteration of the main loop.
+	dim_t m_full_pieces = m0 / MR; // Number of complete MR-row panels.
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR; // Rows covered by the complete panels.
+	dim_t m_partial_pieces = m0 % MR; // Leftover rows (0-5), dispatched after the main loop.
+
+	dim_t k_full_pieces = k0 / 4; // Each dpbusd step consumes 4 k elements.
+	dim_t k_partial_pieces = k0 % 4; // Trailing 1-3 k elements, handled with a masked load of A.
+
+	// B matrix storage.
+	__m512i b0;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
+	__m512i a_int32_4;
+	__m512i a_int32_5;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// One ZMM accumulator (16 int32 lanes) per row of the 6-row panel.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 16 elements each from B to 1 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+
+			// Broadcast a[0,kr:kr+4]: 4 consecutive u8 values read as one 32-bit word.
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_4 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_5 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+		}
+		// Handle k remainder (the k0 % 4 trailing elements).
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) ); // Low k_partial_pieces bits set: one bit per valid byte of A.
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) ); // Full-width load; presumably packed B is padded along k - confirm.
+
+			// Masked-load the remaining k_partial_pieces bytes of row 0 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-15] = a[0,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 1 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-15] = a[1,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 2 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-15] = a[2,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 3 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-15] = a[3,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 4 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_4 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-15] = a[4,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+
+			// Masked-load the remaining k_partial_pieces bytes of row 5 (other bytes zeroed), then broadcast the low dword.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_5 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-15] = a[5,kr:kr+4]*b[kr:kr+4,0-15]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+		}
+
+		// Broadcast alpha and beta to every 32-bit lane.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+		}
+
+		// Scale C by beta; the whole step is skipped when beta == 0.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{ // Downscale buffer present and first k block: use the S8 beta macro (defined elsewhere; presumably reads int8 data from buf_downscale - confirm).
+				// c[0:0-15]
+				S8_S32_BETA_OP(c_int32_0p0,ir,0,0,selector1,selector2);
+
+				// c[1:0-15]
+				S8_S32_BETA_OP(c_int32_1p0,ir,1,0,selector1,selector2);
+
+				// c[2:0-15]
+				S8_S32_BETA_OP(c_int32_2p0,ir,2,0,selector1,selector2);
+
+				// c[3:0-15]
+				S8_S32_BETA_OP(c_int32_3p0,ir,3,0,selector1,selector2);
+
+				// c[4:0-15]
+				S8_S32_BETA_OP(c_int32_4p0,ir,4,0,selector1,selector2);
+
+				// c[5:0-15]
+				S8_S32_BETA_OP(c_int32_5p0,ir,5,0,selector1,selector2);
+			}
+			else
+			{ // Otherwise use the S32 beta macro with the same ( ir, row, column-block ) arguments.
+				// c[0:0-15]
+				S32_S32_BETA_OP(c_int32_0p0,ir,0,0,selector1,selector2);
+
+				// c[1:0-15]
+				S32_S32_BETA_OP(c_int32_1p0,ir,1,0,selector1,selector2);
+
+				// c[2:0-15]
+				S32_S32_BETA_OP(c_int32_2p0,ir,2,0,selector1,selector2);
+
+				// c[3:0-15]
+				S32_S32_BETA_OP(c_int32_3p0,ir,3,0,selector1,selector2);
+
+				// c[4:0-15]
+				S32_S32_BETA_OP(c_int32_4p0,ir,4,0,selector1,selector2);
+
+				// c[5:0-15]
+				S32_S32_BETA_OP(c_int32_5p0,ir,5,0,selector1,selector2);
+			}
+		}
+
+        // Post Ops: the jump macros (defined elsewhere) presumably walk post_ops_list_temp and jump to the matching label below - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x16:
+		{ // Bias: add one 16-lane int32 vector (op_args1 offset by post_op_c_j) to every row.
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x16:
+		{ // ReLU: lane-wise max( 0, c ).
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x16:
+		{ // Scaled ReLU: a zero vector and the int32 scalar from op_args2 are set up here; presumably consumed, with relu_cmp_mask, by RELU_SCALE_OP_S32_AVX512 (defined elsewhere) - confirm.
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x16:
+		{ // GELU (tanh variant, per the macro name); the locals below are scratch registers handed to the macro.
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x16:
+		{ // GELU (erf variant, per the macro name); the locals below are scratch registers handed to the macro.
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x16:
+		{ // Clip: bounds are int32 scalars, min from op_args2 and max from op_args3, broadcast to all lanes.
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x16:
+	{ // Downscale: 16 float scale factors (offset by post_op_c_j) are loaded as raw 512 bits into an integer register for CVT_MULRND_CVT32 (defined elsewhere).
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x16_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's: 0 < 10 holds in every lane of the compare below.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 ); // Not referenced directly below; presumably consumed by name inside CVT_STORE_S32_S8 - confirm.
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+		}
+		else
+		{
+			// Store the results: full 16-lane int32 rows written straight to C.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+		}
+
+		a = a + ( MR * ps_a ); // Advance A to the next MR-row panel.
+		post_ops_attr.post_op_c_i += MR; // Keep the row offset used by the post-op/downscale macros in step with ir.
+	}
+
+	if ( m_partial_pieces > 0 )
+	{ // Leftover rows: a and post_ops_attr have already been advanced past the full panels by the loop above.
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 ); // cs_a == 4 passes through; otherwise rescaled from 6 rows to 5 (presumably the packed-A stride - confirm).
+			lpgemm_rowvar_u8s8s32o32_5x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_u8s8s32o32_4x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_u8s8s32o32_3x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_u8s8s32o32_2x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_u8s8s32o32_1x16
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x32 u8s8s32o32 n-fringe kernel: computes C in 6-row (MR) x 32-column tiles, 2 ZMM accumulators per row; the m0 % 6 leftover rows are forwarded to the 5x32..1x32 kernels.
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x32)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x32_DISABLE,
+						  &&POST_OPS_BIAS_6x32,
+						  &&POST_OPS_RELU_6x32,
+						  &&POST_OPS_RELU_SCALE_6x32,
+						  &&POST_OPS_GELU_TANH_6x32,
+						  &&POST_OPS_GELU_ERF_6x32,
+						  &&POST_OPS_CLIP_6x32,
+						  &&POST_OPS_DOWNSCALE_6x32
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage: one ZMM per 16-column panel (b0 -> columns 0-15, b1 -> columns 16-31).
+	__m512i b0;
+	__m512i b1;
+
+	// A matrix storage: one ZMM per tile row, each holding a broadcast 4-byte group of A.
+	__m512i a_int32_0;
+	__m512i a_int32_1;
+	__m512i a_int32_2;
+	__m512i a_int32_3;
+	__m512i a_int32_4;
+	__m512i a_int32_5;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		_mm_prefetch( b, _MM_HINT_T0 );
+		_mm_prefetch( a + ( MR * ps_a ) + ( 0 * 16 ), _MM_HINT_T1 );
+
+		// Accumulators for the 6x32 tile of C, named c_int32_<row>p<panel>; 16 int32 lanes each, zero-initialised.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+		__m512i c_int32_0p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+		__m512i c_int32_1p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+		__m512i c_int32_2p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+		__m512i c_int32_3p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+		__m512i c_int32_4p1 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+		__m512i c_int32_5p1 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 32 elements each from B to 2 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_1 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_2 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_3 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_4 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_5 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_2, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_3, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_4, b1 );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_5, b1 );
+		}
+		// Handle k remainder: the k0 % 4 leftover bytes of each A row, starting at group index k_full_pieces.
+		__asm__(".p2align 6\n");
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+
+			// Broadcast a[0,k-fringe]: masked byte load of the k_partial_pieces leftover bytes, rest zeroed.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-31] = a[0,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+
+			// Broadcast a[1,k-fringe]: masked byte load of the k_partial_pieces leftover bytes, rest zeroed.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_1 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-31] = a[1,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_1, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_1, b1 );
+
+			// Broadcast a[2,k-fringe]: masked byte load of the k_partial_pieces leftover bytes, rest zeroed.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_2 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-31] = a[2,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_2, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_2, b1 );
+
+			// Broadcast a[3,k-fringe]: masked byte load of the k_partial_pieces leftover bytes, rest zeroed.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_3 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-31] = a[3,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_3, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_3, b1 );
+
+			// Broadcast a[4,k-fringe]: masked byte load of the k_partial_pieces leftover bytes, rest zeroed.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_4 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-31] = a[4,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_4, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_4, b1 );
+
+			// Broadcast a[5,k-fringe]: masked byte load of the k_partial_pieces leftover bytes, rest zeroed.
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_5 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-31] = a[5,kr:kr+4]*b[kr:kr+4,0-31]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_5, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_5, b1 );
+		}
+
+		// Broadcast alpha (selector1) and beta (selector2) to all 16 int32 lanes.
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+		}
+
+		// Scale C by beta. NOTE(review): the *_BETA_OP2 macros are defined elsewhere; presumably they load C (int8 or int32), scale by selector2 and add into the accumulators - confirm.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31]
+				S8_S32_BETA_OP2(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31]
+				S8_S32_BETA_OP2(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31]
+				S8_S32_BETA_OP2(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31]
+				S8_S32_BETA_OP2(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31]
+				S8_S32_BETA_OP2(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31]
+				S8_S32_BETA_OP2(ir,5,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31]
+				S32_S32_BETA_OP2(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31]
+				S32_S32_BETA_OP2(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31]
+				S32_S32_BETA_OP2(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31]
+				S32_S32_BETA_OP2(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31]
+				S32_S32_BETA_OP2(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31]
+				S32_S32_BETA_OP2(ir,5,selector1,selector2);
+			}
+		}
+
+        // Post Ops. NOTE(review): the jump macros are defined elsewhere; presumably they dispatch through post_ops_labels via computed goto - confirm.
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x32:
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[1, 16-31]
+			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[2, 16-31]
+			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[3, 16-31]
+			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[4, 16-31]
+			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[5, 16-31]
+			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x32:
+		{
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[1,16-31]
+			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[2,16-31]
+			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[3,16-31]
+			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[4,16-31]
+			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[5,16-31]
+			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x32:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x32:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_5p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x32:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_5p1, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x32:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x32:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		// c[5, 16-31]
+		CVT_MULRND_CVT32(c_int32_5p1,selector2);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x32_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's (0 < 10 holds in every lane). NOTE(review): mask_all1 is presumably consumed inside CVT_STORE_S32_S8 - confirm.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+		}
+		else
+		{
+			// Store the int32 accumulators directly to C.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_u8s8s32o32_5x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_u8s8s32o32_4x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_u8s8s32o32_3x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_u8s8s32o32_2x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_u8s8s32o32_1x32
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+
+// 6x48 int8o32 fringe kernel
+LPGEMM_N_FRINGE_KERN(uint8_t,int8_t,int32_t,u8s8s32o32_6x48)
+{
+	static void* post_ops_labels[] =
+						{
+						  &&POST_OPS_6x48_DISABLE,
+						  &&POST_OPS_BIAS_6x48,
+						  &&POST_OPS_RELU_6x48,
+						  &&POST_OPS_RELU_SCALE_6x48,
+						  &&POST_OPS_GELU_TANH_6x48,
+						  &&POST_OPS_GELU_ERF_6x48,
+						  &&POST_OPS_CLIP_6x48,
+						  &&POST_OPS_DOWNSCALE_6x48
+						};
+	dim_t MR = 6;
+	dim_t m_full_pieces = m0 / MR;
+	dim_t m_full_pieces_loop_limit = m_full_pieces * MR;
+	dim_t m_partial_pieces = m0 % MR;
+
+	dim_t k_full_pieces = k0 / 4;
+	dim_t k_partial_pieces = k0 % 4;
+
+	// B matrix storage.
+	__m512i b0;
+	__m512i b1;
+	__m512i b2;
+
+	// A matrix storage.
+	__m512i a_int32_0;
+
+	for ( dim_t ir = 0; ir < m_full_pieces_loop_limit; ir += MR )
+	{
+		// Registers to use for accumulating C.
+		__m512i c_int32_0p0 = _mm512_setzero_epi32();
+		__m512i c_int32_0p1 = _mm512_setzero_epi32();
+		__m512i c_int32_0p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_1p0 = _mm512_setzero_epi32();
+		__m512i c_int32_1p1 = _mm512_setzero_epi32();
+		__m512i c_int32_1p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_2p0 = _mm512_setzero_epi32();
+		__m512i c_int32_2p1 = _mm512_setzero_epi32();
+		__m512i c_int32_2p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_3p0 = _mm512_setzero_epi32();
+		__m512i c_int32_3p1 = _mm512_setzero_epi32();
+		__m512i c_int32_3p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_4p0 = _mm512_setzero_epi32();
+		__m512i c_int32_4p1 = _mm512_setzero_epi32();
+		__m512i c_int32_4p2 = _mm512_setzero_epi32();
+
+		__m512i c_int32_5p0 = _mm512_setzero_epi32();
+		__m512i c_int32_5p1 = _mm512_setzero_epi32();
+		__m512i c_int32_5p2 = _mm512_setzero_epi32();
+
+		for ( dim_t kr = 0; kr < k_full_pieces; kr += 1 )
+		{
+			// Load 4 rows with 48 elements each from B to 3 ZMM registers. It
+			// is to be noted that the B matrix is packed for use in vnni
+			// instructions and each load to ZMM register will have 4 elements
+			// along k direction and 16 elements across n directions, so 4x16
+			// elements to a ZMM register.
+			b0 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * kr ) + ( cs_b * 2 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 0 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 1 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 2 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 3 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 4 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_int32_0 = _mm512_set1_epi32( *( uint32_t* )( a + ( rs_a * 5 ) + ( cs_a * kr ) ) );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-47] = a[5,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
+			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_0, b2 );
+		}
+		// Handle k remainder.
+		if ( k_partial_pieces > 0 )
+		{
+			__m128i a_kfringe_buf;
+			__mmask16 load_mask = _cvtu32_mask16( 0xFFFF >> ( 16 - k_partial_pieces ) );
+
+			b0 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 0 ) );
+			b1 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 1 ) );
+			b2 = _mm512_loadu_si512( b + ( rs_b * k_full_pieces ) + ( cs_b * 2 ) );
+
+			// Broadcast a[0,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 0 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[0,0-47] = a[0,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_0p0 = _mm512_dpbusd_epi32( c_int32_0p0, a_int32_0, b0 );
+			c_int32_0p1 = _mm512_dpbusd_epi32( c_int32_0p1, a_int32_0, b1 );
+			c_int32_0p2 = _mm512_dpbusd_epi32( c_int32_0p2, a_int32_0, b2 );
+
+			// Broadcast a[1,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 1 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[1,0-47] = a[1,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_1p0 = _mm512_dpbusd_epi32( c_int32_1p0, a_int32_0, b0 );
+			c_int32_1p1 = _mm512_dpbusd_epi32( c_int32_1p1, a_int32_0, b1 );
+			c_int32_1p2 = _mm512_dpbusd_epi32( c_int32_1p2, a_int32_0, b2 );
+
+			// Broadcast a[2,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 2 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[2,0-47] = a[2,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_2p0 = _mm512_dpbusd_epi32( c_int32_2p0, a_int32_0, b0 );
+			c_int32_2p1 = _mm512_dpbusd_epi32( c_int32_2p1, a_int32_0, b1 );
+			c_int32_2p2 = _mm512_dpbusd_epi32( c_int32_2p2, a_int32_0, b2 );
+
+			// Broadcast a[3,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 3 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[3,0-47] = a[3,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_3p0 = _mm512_dpbusd_epi32( c_int32_3p0, a_int32_0, b0 );
+			c_int32_3p1 = _mm512_dpbusd_epi32( c_int32_3p1, a_int32_0, b1 );
+			c_int32_3p2 = _mm512_dpbusd_epi32( c_int32_3p2, a_int32_0, b2 );
+
+			// Broadcast a[4,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 4 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[4,0-47] = a[4,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_4p0 = _mm512_dpbusd_epi32( c_int32_4p0, a_int32_0, b0 );
+			c_int32_4p1 = _mm512_dpbusd_epi32( c_int32_4p1, a_int32_0, b1 );
+			c_int32_4p2 = _mm512_dpbusd_epi32( c_int32_4p2, a_int32_0, b2 );
+
+			// Broadcast a[5,kr:kr+4].
+			a_kfringe_buf = _mm_maskz_loadu_epi8
+			(
+			  load_mask,
+			  ( a + ( rs_a * 5 ) + ( cs_a * k_full_pieces ) )
+			);
+			a_int32_0 = _mm512_broadcastd_epi32( a_kfringe_buf );
+
+			// Perform column direction mat-mul with k = 4.
+			// c[5,0-47] = a[5,kr:kr+4]*b[kr:kr+4,0-47]
+			c_int32_5p0 = _mm512_dpbusd_epi32( c_int32_5p0, a_int32_0, b0 );
+			c_int32_5p1 = _mm512_dpbusd_epi32( c_int32_5p1, a_int32_0, b1 );
+			c_int32_5p2 = _mm512_dpbusd_epi32( c_int32_5p2, a_int32_0, b2 );
+		}
+
+		// Load alpha and beta
+		__m512i selector1 = _mm512_set1_epi32( alpha );
+		__m512i selector2 = _mm512_set1_epi32( beta );
+
+		if ( alpha != 1 )
+		{
+			// Scale by alpha
+			c_int32_0p0 = _mm512_mullo_epi32( selector1, c_int32_0p0 );
+			c_int32_0p1 = _mm512_mullo_epi32( selector1, c_int32_0p1 );
+			c_int32_0p2 = _mm512_mullo_epi32( selector1, c_int32_0p2 );
+
+			c_int32_1p0 = _mm512_mullo_epi32( selector1, c_int32_1p0 );
+			c_int32_1p1 = _mm512_mullo_epi32( selector1, c_int32_1p1 );
+			c_int32_1p2 = _mm512_mullo_epi32( selector1, c_int32_1p2 );
+
+			c_int32_2p0 = _mm512_mullo_epi32( selector1, c_int32_2p0 );
+			c_int32_2p1 = _mm512_mullo_epi32( selector1, c_int32_2p1 );
+			c_int32_2p2 = _mm512_mullo_epi32( selector1, c_int32_2p2 );
+
+			c_int32_3p0 = _mm512_mullo_epi32( selector1, c_int32_3p0 );
+			c_int32_3p1 = _mm512_mullo_epi32( selector1, c_int32_3p1 );
+			c_int32_3p2 = _mm512_mullo_epi32( selector1, c_int32_3p2 );
+
+			c_int32_4p0 = _mm512_mullo_epi32( selector1, c_int32_4p0 );
+			c_int32_4p1 = _mm512_mullo_epi32( selector1, c_int32_4p1 );
+			c_int32_4p2 = _mm512_mullo_epi32( selector1, c_int32_4p2 );
+
+			c_int32_5p0 = _mm512_mullo_epi32( selector1, c_int32_5p0 );
+			c_int32_5p1 = _mm512_mullo_epi32( selector1, c_int32_5p1 );
+			c_int32_5p2 = _mm512_mullo_epi32( selector1, c_int32_5p2 );
+		}
+
+		// Scale C by beta.
+		if ( beta != 0 )
+		{
+			if ( ( post_ops_attr.buf_downscale != NULL ) &&
+				 ( post_ops_attr.is_first_k == TRUE ) )
+			{
+				// c[0:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31,32-47]
+				S8_S32_BETA_OP3(ir,5,selector1,selector2);
+			}
+			else
+			{
+				// c[0:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,0,selector1,selector2);
+
+				// c[1:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,1,selector1,selector2);
+
+				// c[2:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,2,selector1,selector2);
+
+				// c[3:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,3,selector1,selector2);
+
+				// c[4:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,4,selector1,selector2);
+
+				// c[5:0-15,16-31,32-47]
+				S32_S32_BETA_OP3(ir,5,selector1,selector2);
+			}
+		}
+
+		// Post Ops
+		lpgemm_post_op* post_ops_list_temp = post_ops_list;
+		POST_OP_LABEL_LASTK_SAFE_JUMP
+POST_OPS_BIAS_6x48:
+		{
+			selector1 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+			selector2 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+			a_int32_0 =
+					_mm512_loadu_si512( ( int32_t* )post_ops_list_temp->op_args1 +
+									post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_add_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_add_epi32( selector2, c_int32_0p1 );
+
+			// c[0,32-47]
+			c_int32_0p2 = _mm512_add_epi32( a_int32_0, c_int32_0p2 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_add_epi32( selector1, c_int32_1p0 );
+
+			// c[1, 16-31]
+			c_int32_1p1 = _mm512_add_epi32( selector2, c_int32_1p1 );
+
+			// c[1,32-47]
+			c_int32_1p2 = _mm512_add_epi32( a_int32_0, c_int32_1p2 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_add_epi32( selector1, c_int32_2p0 );
+
+			// c[2, 16-31]
+			c_int32_2p1 = _mm512_add_epi32( selector2, c_int32_2p1 );
+
+			// c[2,32-47]
+			c_int32_2p2 = _mm512_add_epi32( a_int32_0, c_int32_2p2 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_add_epi32( selector1, c_int32_3p0 );
+
+			// c[3, 16-31]
+			c_int32_3p1 = _mm512_add_epi32( selector2, c_int32_3p1 );
+
+			// c[3,32-47]
+			c_int32_3p2 = _mm512_add_epi32( a_int32_0, c_int32_3p2 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_add_epi32( selector1, c_int32_4p0 );
+
+			// c[4, 16-31]
+			c_int32_4p1 = _mm512_add_epi32( selector2, c_int32_4p1 );
+
+			// c[4,32-47]
+			c_int32_4p2 = _mm512_add_epi32( a_int32_0, c_int32_4p2 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_add_epi32( selector1, c_int32_5p0 );
+
+			// c[5, 16-31]
+			c_int32_5p1 = _mm512_add_epi32( selector2, c_int32_5p1 );
+
+			// c[5,32-47]
+			c_int32_5p2 = _mm512_add_epi32( a_int32_0, c_int32_5p2 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_6x48:
+		{
+			// ReLU: take the element-wise max of each accumulator with zero.
+			selector1 = _mm512_setzero_epi32();
+
+			// c[0,0-15]
+			c_int32_0p0 = _mm512_max_epi32( selector1, c_int32_0p0 );
+
+			// c[0, 16-31]
+			c_int32_0p1 = _mm512_max_epi32( selector1, c_int32_0p1 );
+
+			// c[0,32-47]
+			c_int32_0p2 = _mm512_max_epi32( selector1, c_int32_0p2 );
+
+			// c[1,0-15]
+			c_int32_1p0 = _mm512_max_epi32( selector1, c_int32_1p0 );
+
+			// c[1,16-31]
+			c_int32_1p1 = _mm512_max_epi32( selector1, c_int32_1p1 );
+
+			// c[1,32-47]
+			c_int32_1p2 = _mm512_max_epi32( selector1, c_int32_1p2 );
+
+			// c[2,0-15]
+			c_int32_2p0 = _mm512_max_epi32( selector1, c_int32_2p0 );
+
+			// c[2,16-31]
+			c_int32_2p1 = _mm512_max_epi32( selector1, c_int32_2p1 );
+
+			// c[2,32-47]
+			c_int32_2p2 = _mm512_max_epi32( selector1, c_int32_2p2 );
+
+			// c[3,0-15]
+			c_int32_3p0 = _mm512_max_epi32( selector1, c_int32_3p0 );
+
+			// c[3,16-31]
+			c_int32_3p1 = _mm512_max_epi32( selector1, c_int32_3p1 );
+
+			// c[3,32-47]
+			c_int32_3p2 = _mm512_max_epi32( selector1, c_int32_3p2 );
+
+			// c[4,0-15]
+			c_int32_4p0 = _mm512_max_epi32( selector1, c_int32_4p0 );
+
+			// c[4,16-31]
+			c_int32_4p1 = _mm512_max_epi32( selector1, c_int32_4p1 );
+
+			// c[4,32-47]
+			c_int32_4p2 = _mm512_max_epi32( selector1, c_int32_4p2 );
+
+			// c[5,0-15]
+			c_int32_5p0 = _mm512_max_epi32( selector1, c_int32_5p0 );
+
+			// c[5,16-31]
+			c_int32_5p1 = _mm512_max_epi32( selector1, c_int32_5p1 );
+
+			// c[5,32-47]
+			c_int32_5p2 = _mm512_max_epi32( selector1, c_int32_5p2 );
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_RELU_SCALE_6x48:
+		{
+			selector1 = _mm512_setzero_epi32();
+			selector2 =
+				_mm512_set1_epi32( *( ( int32_t* )post_ops_list_temp->op_args2 ) );
+
+			__mmask16 relu_cmp_mask;
+
+			// c[0, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p0)
+
+			// c[0, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p1)
+
+			// c[0, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_0p2)
+
+			// c[1, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p0)
+
+			// c[1, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p1)
+
+			// c[1, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_1p2)
+
+			// c[2, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p0)
+
+			// c[2, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p1)
+
+			// c[2, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_2p2)
+
+			// c[3, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p0)
+
+			// c[3, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p1)
+
+			// c[3, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_3p2)
+
+			// c[4, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p0)
+
+			// c[4, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p1)
+
+			// c[4, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_4p2)
+
+			// c[5, 0-15]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p0)
+
+			// c[5, 16-31]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p1)
+
+			// c[5, 32-47]
+			RELU_SCALE_OP_S32_AVX512(c_int32_5p2)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_TANH_6x48:
+		{
+			__m512 dn, z, x, r2, r, y, x_tanh;
+			__m512i q;
+
+			// c[0, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_0p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_0p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[0, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_0p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_1p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_1p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[1, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_1p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_2p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_2p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[2, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_2p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_3p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_3p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[3, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_3p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_4p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_4p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[4, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_4p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 0-15]
+			GELU_TANH_S32_AVX512(c_int32_5p0, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 16-31]
+			GELU_TANH_S32_AVX512(c_int32_5p1, y, r, r2, x, z, dn, x_tanh, q)
+
+			// c[5, 32-47]
+			GELU_TANH_S32_AVX512(c_int32_5p2, y, r, r2, x, z, dn, x_tanh, q)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_GELU_ERF_6x48:
+		{
+			__m512 x, r, y, x_erf;
+
+			// c[0, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_0p0, y, r, x, x_erf)
+
+			// c[0, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_0p1, y, r, x, x_erf)
+
+			// c[0, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_0p2, y, r, x, x_erf)
+
+			// c[1, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_1p0, y, r, x, x_erf)
+
+			// c[1, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_1p1, y, r, x, x_erf)
+
+			// c[1, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_1p2, y, r, x, x_erf)
+
+			// c[2, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_2p0, y, r, x, x_erf)
+
+			// c[2, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_2p1, y, r, x, x_erf)
+
+			// c[2, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_2p2, y, r, x, x_erf)
+
+			// c[3, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_3p0, y, r, x, x_erf)
+
+			// c[3, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_3p1, y, r, x, x_erf)
+
+			// c[3, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_3p2, y, r, x, x_erf)
+
+			// c[4, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_4p0, y, r, x, x_erf)
+
+			// c[4, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_4p1, y, r, x, x_erf)
+
+			// c[4, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_4p2, y, r, x, x_erf)
+
+			// c[5, 0-15]
+			GELU_ERF_S32_AVX512(c_int32_5p0, y, r, x, x_erf)
+
+			// c[5, 16-31]
+			GELU_ERF_S32_AVX512(c_int32_5p1, y, r, x, x_erf)
+
+			// c[5, 32-47]
+			GELU_ERF_S32_AVX512(c_int32_5p2, y, r, x, x_erf)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+POST_OPS_CLIP_6x48:
+		{
+			__m512i min = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args2 );
+			__m512i max = _mm512_set1_epi32( *( int32_t* )post_ops_list_temp->op_args3 );
+
+			// c[0, 0-15]
+			CLIP_S32_AVX512(c_int32_0p0, min, max)
+
+			// c[0, 16-31]
+			CLIP_S32_AVX512(c_int32_0p1, min, max)
+
+			// c[0, 32-47]
+			CLIP_S32_AVX512(c_int32_0p2, min, max)
+
+			// c[1, 0-15]
+			CLIP_S32_AVX512(c_int32_1p0, min, max)
+
+			// c[1, 16-31]
+			CLIP_S32_AVX512(c_int32_1p1, min, max)
+
+			// c[1, 32-47]
+			CLIP_S32_AVX512(c_int32_1p2, min, max)
+
+			// c[2, 0-15]
+			CLIP_S32_AVX512(c_int32_2p0, min, max)
+
+			// c[2, 16-31]
+			CLIP_S32_AVX512(c_int32_2p1, min, max)
+
+			// c[2, 32-47]
+			CLIP_S32_AVX512(c_int32_2p2, min, max)
+
+			// c[3, 0-15]
+			CLIP_S32_AVX512(c_int32_3p0, min, max)
+
+			// c[3, 16-31]
+			CLIP_S32_AVX512(c_int32_3p1, min, max)
+
+			// c[3, 32-47]
+			CLIP_S32_AVX512(c_int32_3p2, min, max)
+
+			// c[4, 0-15]
+			CLIP_S32_AVX512(c_int32_4p0, min, max)
+
+			// c[4, 16-31]
+			CLIP_S32_AVX512(c_int32_4p1, min, max)
+
+			// c[4, 32-47]
+			CLIP_S32_AVX512(c_int32_4p2, min, max)
+
+			// c[5, 0-15]
+			CLIP_S32_AVX512(c_int32_5p0, min, max)
+
+			// c[5, 16-31]
+			CLIP_S32_AVX512(c_int32_5p1, min, max)
+
+			// c[5, 32-47]
+			CLIP_S32_AVX512(c_int32_5p2, min, max)
+
+			POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+		}
+
+POST_OPS_DOWNSCALE_6x48:
+	{
+		selector1 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 0 * 16 ) );
+		selector2 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 1 * 16 ) );
+		a_int32_0 =
+			_mm512_loadu_si512( ( float* )post_ops_list_temp->scale_factor +
+							post_ops_attr.post_op_c_j + ( 2 * 16 ) );
+
+		// c[0, 0-15]
+		CVT_MULRND_CVT32(c_int32_0p0,selector1);
+
+		// c[0, 16-31]
+		CVT_MULRND_CVT32(c_int32_0p1,selector2);
+
+		// c[0, 32-47]
+		CVT_MULRND_CVT32(c_int32_0p2,a_int32_0);
+
+		// c[1, 0-15]
+		CVT_MULRND_CVT32(c_int32_1p0,selector1);
+
+		// c[1, 16-31]
+		CVT_MULRND_CVT32(c_int32_1p1,selector2);
+
+		// c[1, 32-47]
+		CVT_MULRND_CVT32(c_int32_1p2,a_int32_0);
+
+		// c[2, 0-15]
+		CVT_MULRND_CVT32(c_int32_2p0,selector1);
+
+		// c[2, 16-31]
+		CVT_MULRND_CVT32(c_int32_2p1,selector2);
+
+		// c[2, 32-47]
+		CVT_MULRND_CVT32(c_int32_2p2,a_int32_0);
+
+		// c[3, 0-15]
+		CVT_MULRND_CVT32(c_int32_3p0,selector1);
+
+		// c[3, 16-31]
+		CVT_MULRND_CVT32(c_int32_3p1,selector2);
+
+		// c[3, 32-47]
+		CVT_MULRND_CVT32(c_int32_3p2,a_int32_0);
+
+		// c[4, 0-15]
+		CVT_MULRND_CVT32(c_int32_4p0,selector1);
+
+		// c[4, 16-31]
+		CVT_MULRND_CVT32(c_int32_4p1,selector2);
+
+		// c[4, 32-47]
+		CVT_MULRND_CVT32(c_int32_4p2,a_int32_0);
+
+		// c[5, 0-15]
+		CVT_MULRND_CVT32(c_int32_5p0,selector1);
+
+		// c[5, 16-31]
+		CVT_MULRND_CVT32(c_int32_5p1,selector2);
+
+		// c[5, 32-47]
+		CVT_MULRND_CVT32(c_int32_5p2,a_int32_0);
+
+		POST_OP_LABEL_LASTK_SAFE_JUMP_WITH_NEXT_PTR
+	}
+POST_OPS_6x48_DISABLE:
+		;
+
+		if ( ( post_ops_attr.buf_downscale != NULL ) && ( post_ops_attr.is_last_k == TRUE ) )
+		{
+			// Generate a mask16 of all 1's.
+			selector1 = _mm512_setzero_epi32();
+			selector2 = _mm512_set1_epi32( 10 );
+			__mmask16 mask_all1 = _mm512_cmplt_epi32_mask( selector1, selector2 );
+
+			// Store the results in downscaled type (int8 instead of int32).
+			// c[0,0-15]
+			CVT_STORE_S32_S8(c_int32_0p0,0,0);
+
+			// c[0,16-31]
+			CVT_STORE_S32_S8(c_int32_0p1,0,1);
+
+			// c[0,32-47]
+			CVT_STORE_S32_S8(c_int32_0p2,0,2);
+
+			// c[1,0-15]
+			CVT_STORE_S32_S8(c_int32_1p0,1,0);
+
+			// c[1,16-31]
+			CVT_STORE_S32_S8(c_int32_1p1,1,1);
+
+			// c[1,32-47]
+			CVT_STORE_S32_S8(c_int32_1p2,1,2);
+
+			// c[2,0-15]
+			CVT_STORE_S32_S8(c_int32_2p0,2,0);
+
+			// c[2,16-31]
+			CVT_STORE_S32_S8(c_int32_2p1,2,1);
+
+			// c[2,32-47]
+			CVT_STORE_S32_S8(c_int32_2p2,2,2);
+
+			// c[3,0-15]
+			CVT_STORE_S32_S8(c_int32_3p0,3,0);
+
+			// c[3,16-31]
+			CVT_STORE_S32_S8(c_int32_3p1,3,1);
+
+			// c[3,32-47]
+			CVT_STORE_S32_S8(c_int32_3p2,3,2);
+
+			// c[4,0-15]
+			CVT_STORE_S32_S8(c_int32_4p0,4,0);
+
+			// c[4,16-31]
+			CVT_STORE_S32_S8(c_int32_4p1,4,1);
+
+			// c[4,32-47]
+			CVT_STORE_S32_S8(c_int32_4p2,4,2);
+
+			// c[5,0-15]
+			CVT_STORE_S32_S8(c_int32_5p0,5,0);
+
+			// c[5,16-31]
+			CVT_STORE_S32_S8(c_int32_5p1,5,1);
+
+			// c[5,32-47]
+			CVT_STORE_S32_S8(c_int32_5p2,5,2);
+		}
+		else
+		{
+			// Store the results.
+			// c[0,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 0*16 ), c_int32_0p0 );
+
+			// c[0, 16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 1*16 ), c_int32_0p1 );
+
+			// c[0,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 0 ) ) + ( 2*16 ), c_int32_0p2 );
+
+			// c[1,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 0*16 ), c_int32_1p0 );
+
+			// c[1,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 1*16 ), c_int32_1p1 );
+
+			// c[1,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 1 ) ) + ( 2*16 ), c_int32_1p2 );
+
+			// c[2,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 0*16 ), c_int32_2p0 );
+
+			// c[2,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 1*16 ), c_int32_2p1 );
+
+			// c[2,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 2 ) ) + ( 2*16 ), c_int32_2p2 );
+
+			// c[3,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 0*16 ), c_int32_3p0 );
+
+			// c[3,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 1*16 ), c_int32_3p1 );
+
+			// c[3,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 3 ) ) + ( 2*16 ), c_int32_3p2 );
+
+			// c[4,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 0*16 ), c_int32_4p0 );
+
+			// c[4,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 1*16 ), c_int32_4p1 );
+
+			// c[4,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 4 ) ) + ( 2*16 ), c_int32_4p2 );
+
+			// c[5,0-15]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 0*16 ), c_int32_5p0 );
+
+			// c[5,16-31]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 1*16 ), c_int32_5p1 );
+
+			// c[5,32-47]
+			_mm512_storeu_si512( c + ( rs_c * ( ir + 5 ) ) + ( 2*16 ), c_int32_5p2 );
+		}
+
+		a = a + ( MR * ps_a );
+		post_ops_attr.post_op_c_i += MR;
+	}
+
+	if ( m_partial_pieces > 0 )
+	{
+		if ( m_partial_pieces == 5 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 5 );
+			lpgemm_rowvar_u8s8s32o32_5x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 4 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 4 );
+			lpgemm_rowvar_u8s8s32o32_4x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 3 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 3 );
+			lpgemm_rowvar_u8s8s32o32_3x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 2 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 2 );
+			lpgemm_rowvar_u8s8s32o32_2x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+		else if ( m_partial_pieces == 1 )
+		{
+			dim_t cs_a_use = ( cs_a == 4 ) ? 4 : ( ( cs_a / 6 ) * 1 );
+			lpgemm_rowvar_u8s8s32o32_1x48
+			(
+			  k0,
+			  a, rs_a, cs_a_use,
+			  b, rs_b, cs_b,
+			  ( c + ( rs_c * m_full_pieces_loop_limit ) ), rs_c,
+			  alpha, beta,
+			  post_ops_list, post_ops_attr
+			);
+		}
+	}
+}
+#endif
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c
similarity index 83%
rename from addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa_amd512vnni.c
rename to kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c
index 601b8a3eff..32cd7aef3d 100644
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packa_amd512vnni.c
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packa_amd512vnni.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -33,24 +33,13 @@
 */
 
 #include <immintrin.h>
-
 #include "blis.h"
-#include "lpgemm_packa.h"
+
+#ifdef BLIS_ADDON_LPGEMM
 
 #define MR 6
 #define NR 64
 
-void get_packa_k64_u8s8s32o32_strides
-     (
-       dim_t* rs_a,
-       dim_t* cs_a
-     )
-{
-	*rs_a = 4;
-	*cs_a = 24;
-}
-
-#ifdef BLIS_KERNELS_ZEN4
 void packa_m5_k64_u8s8s32o32
      (
        uint8_t*       pack_a_buffer_u8s8s32o32,
@@ -143,12 +132,12 @@ void packa_k64_u8s8s32o32
 		for ( dim_t kr = 0; kr < KC; kr += NR )
 		{
 			// Rearrange for vpdpbusd, read 6 rows from A with 64 elements in each row.
-			a0 = _mm512_loadu_epi8( a + ( lda * ( ic + 0 ) ) + kr );
-			b0 = _mm512_loadu_epi8( a + ( lda * ( ic + 1 ) ) + kr );
-			c0 = _mm512_loadu_epi8( a + ( lda * ( ic + 2 ) ) + kr );
-			d0 = _mm512_loadu_epi8( a + ( lda * ( ic + 3 ) ) + kr );
-			e0 = _mm512_loadu_epi8( a + ( lda * ( ic + 4 ) ) + kr );
-			f0 = _mm512_loadu_epi8( a + ( lda * ( ic + 5 ) ) + kr );
+			a0 = _mm512_loadu_si512( a + ( lda * ( ic + 0 ) ) + kr );
+			b0 = _mm512_loadu_si512( a + ( lda * ( ic + 1 ) ) + kr );
+			c0 = _mm512_loadu_si512( a + ( lda * ( ic + 2 ) ) + kr );
+			d0 = _mm512_loadu_si512( a + ( lda * ( ic + 3 ) ) + kr );
+			e0 = _mm512_loadu_si512( a + ( lda * ( ic + 4 ) ) + kr );
+			f0 = _mm512_loadu_si512( a + ( lda * ( ic + 5 ) ) + kr );
 
 			a01 = _mm512_unpacklo_epi32( a0, b0 );
 			a0 = _mm512_unpackhi_epi32( a0, b0 );
@@ -181,12 +170,17 @@ void packa_k64_u8s8s32o32
 			d0 = _mm512_permutex2var_epi64( a0, selector5, e01 ); // 2nd 64
 			a0 = _mm512_permutex2var_epi64( a0, selector6, e0 ); // 2nd 32
 
-			_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 );
-			_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 );
-			_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 );
+			_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 0 ) ) ), b0 );
+			_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 64 ) ) ) , a01 );
+			_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 96 ) ) ), d0 );
 			// Last piece
 			last_piece = _mm512_castsi512_si256( a0 );
-			_mm256_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ), last_piece );
+			_mm256_mask_storeu_epi64
+			(
+			  pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 160 ) ) ),
+			  0xFF,
+			  last_piece
+			);
 
 			// Second half
 			b0 = _mm512_permutex2var_epi64( c01, selector7, e01 ); // 3rd 64
@@ -194,12 +188,17 @@ void packa_k64_u8s8s32o32
 			d0 = _mm512_permutex2var_epi64( c0, selector9, e01 ); // 4th 64
 			c0 = _mm512_permutex2var_epi64( c0, selector10, e0 ); // 4th 32
 
-			_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 );
-			_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 );
-			_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 );
+			_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 192 ) ) ), b0 );
+			_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 256 ) ) ) , c01 );
+			_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 288 ) ) ), d0 );
 			// Last piece
 			last_piece = _mm512_castsi512_si256( c0 );
-			_mm256_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ), last_piece );
+			_mm256_mask_storeu_epi64
+			(
+			  pack_a_buffer_u8s8s32o32 + ( ( ic * KC ) + ( ( kr * MR ) + ( 352 ) ) ),
+			  0xFF,
+			  last_piece
+			);
 		}
 		//TODO: Handle kc < 64 case, 48,32,16
 	}
@@ -291,11 +290,11 @@ void packa_m5_k64_u8s8s32o32
 	for ( dim_t kr = 0; kr < KC; kr += NR )
 	{
 		// Rearrange for vpdpbusd, read 5 rows from A with 64 elements in each row.
-		a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr );
-		b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr );
-		c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr );
-		d0 = _mm512_loadu_epi8( a + ( lda * 3 ) + kr );
-		e0 = _mm512_loadu_epi8( a + ( lda * 4 ) + kr );
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+		c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr );
+		d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr );
+		e0 = _mm512_loadu_si512( a + ( lda * 4 ) + kr );
 
 		a01 = _mm512_unpacklo_epi32( a0, b0 );
 		a0 = _mm512_unpackhi_epi32( a0, b0 );
@@ -325,12 +324,17 @@ void packa_m5_k64_u8s8s32o32
 		d0 = _mm512_permutex2var_epi32( a0, selector5, e0 );
 		a0 = _mm512_permutex2var_epi32( a0, selector6, e0 );
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 0 ) ), b0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 64 ) ) , a01 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 80 ) ), d0 );
 		// Last piece
 		last_piece = _mm512_castsi512_si128( a0 );
-		_mm_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ), last_piece );
+		_mm_mask_storeu_epi64
+		(
+		  pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 144 ) ),
+		  0xFF,
+		  last_piece
+		);
 
 		// Second half
 		b0 = _mm512_permutex2var_epi32( c01, selector7, e0 );
@@ -338,12 +342,17 @@ void packa_m5_k64_u8s8s32o32
 		d0 = _mm512_permutex2var_epi32( c0, selector9, e0 );
 		c0 = _mm512_permutex2var_epi32( c0, selector10, e0 );
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 160 ) ), b0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 224 ) ) , c01 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 240 ) ), d0 );
 		// Last piece
 		last_piece = _mm512_castsi512_si128( c0 );
-		_mm_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ), last_piece );
+		_mm_mask_storeu_epi64
+		(
+		  pack_a_buffer_u8s8s32o32 + ( ( kr * 5 ) + ( 304 ) ),
+		  0xFF,
+		  last_piece
+		);
 	}
 }
 
@@ -373,10 +382,10 @@ void packa_m4_k64_u8s8s32o32
 	for ( dim_t kr = 0; kr < KC; kr += NR )
 	{
 		// Rearrange for vpdpbusd, read 4 rows from A with 64 elements in each row.
-		a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr );
-		b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr );
-		c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr );
-		d0 = _mm512_loadu_epi8( a + ( lda * 3 ) + kr );
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+		c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr );
+		d0 = _mm512_loadu_si512( a + ( lda * 3 ) + kr );
 
 		a01 = _mm512_unpacklo_epi32( a0, b0 );
 		a0 = _mm512_unpackhi_epi32( a0, b0 );
@@ -400,10 +409,10 @@ void packa_m4_k64_u8s8s32o32
 		a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // a[1]
 		c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // a[3]
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 0 ) ), a01 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 64 ) ) , a0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 128 ) ), c01 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 4 ) + ( 192 ) ), c0 );
 	}
 }
 
@@ -438,9 +447,9 @@ void packa_m3_k64_u8s8s32o32
 	for ( dim_t kr = 0; kr < KC; kr += NR )
 	{
 		// Rearrange for vpdpbusd, read 3 rows from A with 64 elements in each row.
-		a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr );
-		b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr );
-		c0 = _mm512_loadu_epi8( a + ( lda * 2 ) + kr );
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
+		c0 = _mm512_loadu_si512( a + ( lda * 2 ) + kr );
 
 		a01 = _mm512_unpacklo_epi32( a0, b0 );
 		a0 = _mm512_unpackhi_epi32( a0, b0 );
@@ -451,16 +460,21 @@ void packa_m3_k64_u8s8s32o32
 		a0 = _mm512_permutex2var_epi32( b0, selector3, c0 );
 		b0 = _mm512_permutex2var_epi32( b0, selector4, c0 );
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 0 ) ), a0 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 64 ) ) , b0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 0 ) ), a0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 64 ) ) , b0 );
 
 		a0 = _mm512_permutex2var_epi32( a01, selector5, c0 );
 		b0 = _mm512_permutex2var_epi32( a01, selector6, c0 );
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 96 ) ), a0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 96 ) ), a0 );
 		// Last piece
 		last_piece = _mm512_castsi512_si256( b0 );
-		_mm256_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 160 ) ), last_piece );
+		_mm256_mask_storeu_epi64
+		(
+		  pack_a_buffer_u8s8s32o32 + ( ( kr * 3 ) + ( 160 ) ),
+		  0xFF,
+		  last_piece
+		);
 	}
 }
 
@@ -485,8 +499,8 @@ void packa_m2_k64_u8s8s32o32
 	for ( dim_t kr = 0; kr < KC; kr += NR )
 	{
 		// Rearrange for vpdpbusd, read 2 rows from A with 64 elements in each row.
-		a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr );
-		b0 = _mm512_loadu_epi8( a + ( lda * 1 ) + kr );
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
+		b0 = _mm512_loadu_si512( a + ( lda * 1 ) + kr );
 
 		a01 = _mm512_unpacklo_epi32( a0, b0 );
 		a0 = _mm512_unpackhi_epi32( a0, b0 );
@@ -494,8 +508,8 @@ void packa_m2_k64_u8s8s32o32
 		b0 = _mm512_permutex2var_epi64( a01, selector1, a0 ); // a[0]
 		a01 = _mm512_permutex2var_epi64( a01, selector1_1, a0 ); // a[1]
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 0 ) ), b0 );
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 64 ) ) , a01 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 0 ) ), b0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 2 ) + ( 64 ) ) , a01 );
 	}
 }
 
@@ -512,9 +526,9 @@ void packa_m1_k64_u8s8s32o32
 	for ( dim_t kr = 0; kr < KC; kr += NR )
 	{
 		// Rearrange for vpdpbusd, read 1 row from A with 64 elements in each row.
-		a0 = _mm512_loadu_epi8( a + ( lda * 0 ) + kr );
+		a0 = _mm512_loadu_si512( a + ( lda * 0 ) + kr );
 
-		_mm512_storeu_epi64( pack_a_buffer_u8s8s32o32 + ( ( kr * 1 ) + ( 0 ) ), a0 );
+		_mm512_storeu_si512( pack_a_buffer_u8s8s32o32 + ( ( kr * 1 ) + ( 0 ) ), a0 );
 	}
 }
 #endif
diff --git a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb_amd512vnni.c b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c
similarity index 78%
rename from addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb_amd512vnni.c
rename to kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c
index d388c476e9..539386f5d0 100644
--- a/addon/aocl_gemm/kernels/u8s8s32/lpgemm_packb_amd512vnni.c
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_packb_amd512vnni.c
@@ -4,7 +4,7 @@
    An object-based framework for developing high-performance BLAS-like
    libraries.
 
-   Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -34,23 +34,12 @@
 
 #include <immintrin.h>
 #include <string.h>
-
 #include "blis.h"
-#include "lpgemm_packb.h"
 
-#define NR 64
+#ifdef BLIS_ADDON_LPGEMM
 
-void get_packb_nr64_u8s8s32o32_strides
-     (
-       dim_t* rs_b,
-       dim_t* cs_b
-     )
-{
-	*rs_b = NR * 4;
-	*cs_b = NR;
-}
+#define NR 64
 
-#ifdef BLIS_KERNELS_ZEN4
 void packb_nrlt16_u8s8s32o32
      (
        int8_t*       pack_b_buffer_u8s8s32o32,
@@ -131,11 +120,11 @@ void packb_nr64_u8s8s32o32
 		for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
 		{
 			// Rearrange for vpdpbusd, read 4 rows from B with 64 elements in each row.
-			a0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 0 ) ) + jc );
-			b0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 1 ) ) + jc );
-			c0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 2 ) ) + jc );
-			d0 = _mm512_loadu_epi8( b + ( ldb * ( kr + 3 ) ) + jc );
-			
+			a0 = _mm512_loadu_si512( b + ( ldb * ( kr + 0 ) ) + jc );
+			b0 = _mm512_loadu_si512( b + ( ldb * ( kr + 1 ) ) + jc );
+			c0 = _mm512_loadu_si512( b + ( ldb * ( kr + 2 ) ) + jc );
+			d0 = _mm512_loadu_si512( b + ( ldb * ( kr + 3 ) ) + jc );
+
 			a01 = _mm512_unpacklo_epi8( a0, b0 );
 			a0 = _mm512_unpackhi_epi8( a0, b0 );
 
@@ -158,37 +147,37 @@ void packb_nr64_u8s8s32o32
 			a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1]
 			c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3]
 
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 );
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 );
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 );
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 0 ) * NR ) ), a01 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 1 ) * NR ) ) , a0 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 2 ) * NR ) ), c01 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( kr + 3 ) * NR ) ), c0 );
 		}
 		// Handle k remainder.
 		if ( k_partial_pieces > 0 )
 		{
 			if ( k_partial_pieces == 3 )
 			{
-				a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
-				b0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + jc );
-				c0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) + jc );
+				a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
+				b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc );
+				c0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 2 ) ) + jc );
 				d0 = _mm512_setzero_si512();
 
 			}
 			else if( k_partial_pieces == 2 )
 			{
-				a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
-				b0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + jc );
+				a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
+				b0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 1 ) ) + jc );
 				c0 = _mm512_setzero_si512();
 				d0 = _mm512_setzero_si512();
 			}
 			else //k_partial_pieces == 1
 			{
-				a0 = _mm512_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
+				a0 = _mm512_loadu_si512( b + ( ldb * ( k_full_pieces + 0 ) ) + jc );
 				b0 = _mm512_setzero_si512();
 				c0 = _mm512_setzero_si512();
 				d0 = _mm512_setzero_si512();
 			}
-			
+
 			a01 = _mm512_unpacklo_epi8( a0, b0 );
 			a0 = _mm512_unpackhi_epi8( a0, b0 );
 
@@ -211,10 +200,10 @@ void packb_nr64_u8s8s32o32
 			a0 = _mm512_permutex2var_epi64( a0, selector2_1, c0 ); // b[1]
 			c0 = _mm512_permutex2var_epi64( b0, selector2_1, d0 ); // b[3]
 
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 );
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) ) , a0 );
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 );
-			_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 );	
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 0 ) * NR ) ), a01 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 1 ) * NR ) ) , a0 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 2 ) * NR ) ), c01 );
+			_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( jc * KC_updated ) + ( ( k_full_pieces + 3 ) * NR ) ), c0 );	
 		}
 	}
 	
@@ -309,11 +298,11 @@ void packb_nr48_u8s8s32o32
 	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
 	{
 		// Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row.
-		a0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 0 ) ) );
-		b0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 1 ) ) );
-		c0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 2 ) ) );
-		d0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 3 ) ) );
-		
+		a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) );
+		b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) );
+		c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) );
+		d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) );
+
 		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 );
 		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
 
@@ -325,7 +314,7 @@ void packb_nr48_u8s8s32o32
 
 		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
 		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
-		
+
 		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem
 		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem
 		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem
@@ -337,14 +326,14 @@ void packb_nr48_u8s8s32o32
 		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
 
 		// First 4x32 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
 
 		// Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row.
-		a0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 0 ) ) + ( 32 ) );
-		b0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 1 ) ) + ( 32 ) );
-		c0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 2 ) ) + ( 32 ) );
-		d0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 3 ) ) + ( 32 ) );
+		a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) + ( 32 ) );
+		b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) + ( 32 ) );
+		c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) + ( 32 ) );
+		d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) + ( 32 ) );
 
 		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
 		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
@@ -361,10 +350,10 @@ void packb_nr48_u8s8s32o32
 		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
-		
+
 		// Last 4x16 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm );
-		
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm );
+
 		// The 4th 16byte chunk will be ignored, since its not part of the original data,
 		// but is here due to the packing in 4 16byte chunks format.
 		kr_new += 3;
@@ -374,42 +363,42 @@ void packb_nr48_u8s8s32o32
 	{
 		if ( k_partial_pieces == 3 )
 		{
-			a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
-			b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) );
-			c0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) );
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) );
 			d0_32 = _mm256_setzero_si256();
-		
-			a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
-			b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) );
-			c0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) );
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) );
+			c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) + ( 32 ) );
 			d0_16 = _mm_setzero_si128();
 
 		}
 		else if( k_partial_pieces == 2 )
 		{
-			a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
-			b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) );
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
 			c0_32 = _mm256_setzero_si256();
 			d0_32 = _mm256_setzero_si256();
-		
-			a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
-			b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) );
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) + ( 32 ) );
 			c0_16 = _mm_setzero_si128();
 			d0_16 = _mm_setzero_si128();
 		}
 		else //k_partial_pieces == 1
 		{
-			a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
 			b0_32 = _mm256_setzero_si256();
 			c0_32 = _mm256_setzero_si256();
 			d0_32 = _mm256_setzero_si256();
-		
-			a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
+
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) + ( 32 ) );
 			b0_16 = _mm_setzero_si128();
 			c0_16 = _mm_setzero_si128();
 			d0_16 = _mm_setzero_si128();
 		}
-		
+
 		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 );
 		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
 
@@ -421,7 +410,7 @@ void packb_nr48_u8s8s32o32
 
 		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
 		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
-		
+
 		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem
 		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem
 		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem
@@ -433,8 +422,8 @@ void packb_nr48_u8s8s32o32
 		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
 
 		// First 4x32 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
 
 		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
 		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
@@ -451,9 +440,9 @@ void packb_nr48_u8s8s32o32
 		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
-		
+
 		// Last 4x16 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 2 ) * NR ), a0_zmm );
 	}
 }
 
@@ -483,11 +472,11 @@ void packb_nr32_u8s8s32o32
 	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
 	{
 		// Rearrange for vpdpbusd, read 4 rows from B with 32 elements in each row.
-		a0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 0 ) ) );
-		b0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 1 ) ) );
-		c0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 2 ) ) );
-		d0_32 = _mm256_loadu_epi8( b + ( ldb * ( kr + 3 ) ) );
-		
+		a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 0 ) ) );
+		b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 1 ) ) );
+		c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 2 ) ) );
+		d0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( kr + 3 ) ) );
+
 		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 );
 		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
 
@@ -499,7 +488,7 @@ void packb_nr32_u8s8s32o32
 
 		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
 		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
-		
+
 		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem
 		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem
 		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem
@@ -511,8 +500,8 @@ void packb_nr32_u8s8s32o32
 		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
 
 		// First 4x32 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
 
 		// The 3rd and 4th 16byte chunk will be ignored, since its not part of the original data,
 		// but is here due to the packing in 4 16byte chunks format.
@@ -523,27 +512,27 @@ void packb_nr32_u8s8s32o32
 	{
 		if ( k_partial_pieces == 3 )
 		{
-			a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
-			b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) );
-			c0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) );
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) );
 			d0_32 = _mm256_setzero_si256();
 
 		}
 		else if( k_partial_pieces == 2 )
 		{
-			a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
-			b0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) );
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
 			c0_32 = _mm256_setzero_si256();
 			d0_32 = _mm256_setzero_si256();
 		}
 		else //k_partial_pieces == 1
 		{
-			a0_32 = _mm256_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
+			a0_32 = _mm256_maskz_loadu_epi8( 0xFFFFFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
 			b0_32 = _mm256_setzero_si256();
 			c0_32 = _mm256_setzero_si256();
 			d0_32 = _mm256_setzero_si256();
 		}
-		
+
 		a01_32 = _mm256_unpacklo_epi8( a0_32, b0_32 );
 		a0_32 = _mm256_unpackhi_epi8( a0_32, b0_32 );
 
@@ -555,7 +544,7 @@ void packb_nr32_u8s8s32o32
 
 		d0_32 = _mm256_unpacklo_epi16( a0_32, c0_32 );
 		c01_32 = _mm256_unpackhi_epi16( a0_32, c0_32 );
-		
+
 		a0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x0 ); // 0 elem
 		c0_32 = _mm256_shuffle_i32x4( b0_32,  a01_32, 0x3 ); // 2 elem
 		b0_32 = _mm256_shuffle_i32x4( d0_32,  c01_32, 0x0 ); // 1 elem
@@ -567,8 +556,8 @@ void packb_nr32_u8s8s32o32
 		b0_zmm = _mm512_inserti32x8( b0_zmm, d0_32, 0x1 );
 
 		// First 4x32 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 1 ) * NR ), b0_zmm );
 	}
 }
 
@@ -597,10 +586,10 @@ void packb_nr16_u8s8s32o32
 	for ( dim_t kr = 0; kr < k_full_pieces; kr += 4 )
 	{
 		// Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row.
-		a0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 0 ) ) );
-		b0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 1 ) ) );
-		c0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 2 ) ) );
-		d0_16 = _mm_loadu_epi8( b + ( ldb * ( kr + 3 ) ) );
+		a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 0 ) ) );
+		b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 1 ) ) );
+		c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 2 ) ) );
+		d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( kr + 3 ) ) );
 
 		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
 		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
@@ -617,10 +606,10 @@ void packb_nr16_u8s8s32o32
 		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
-		
+
 		// Last 4x16 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
-		
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+
 		// The 2nd, 3rd, and 4th 16byte chunk will be ignored, since its not part of the original data,
 		// but is here due to the packing in 4 16byte chunks format.
 		kr_new += 1;
@@ -630,22 +619,22 @@ void packb_nr16_u8s8s32o32
 	{
 		if ( k_partial_pieces == 3 )
 		{
-			a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
-			b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) );
-			c0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 2 ) ) );
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
+			c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 2 ) ) );
 			d0_16 = _mm_setzero_si128();
 
 		}
 		else if( k_partial_pieces == 2 )
 		{
-			a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
-			b0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 1 ) ) );
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 1 ) ) );
 			c0_16 = _mm_setzero_si128();
 			d0_16 = _mm_setzero_si128();
 		}
 		else //k_partial_pieces == 1
 		{
-			a0_16 = _mm_loadu_epi8( b + ( ldb * ( k_full_pieces + 0 ) ) );
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, b + ( ldb * ( k_full_pieces + 0 ) ) );
 			b0_16 = _mm_setzero_si128();
 			c0_16 = _mm_setzero_si128();
 			d0_16 = _mm_setzero_si128();
@@ -666,9 +655,9 @@ void packb_nr16_u8s8s32o32
 		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
-		
+
 		// Last 4x16 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
 	}
 }
 
@@ -708,10 +697,10 @@ void packb_nrlt16_u8s8s32o32
 		memcpy( buf3, ( b + ( ldb * ( kr + 3 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
 
 		// Rearrange for vpdpbusd, read 4 rows from B with next 16 elements in each row.
-		a0_16 = _mm_loadu_epi8( buf0 );
-		b0_16 = _mm_loadu_epi8( buf1 );
-		c0_16 = _mm_loadu_epi8( buf2 );
-		d0_16 = _mm_loadu_epi8( buf3 );
+		a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+		b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 );
+		c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 );
+		d0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf3 );
 
 		a01_16 = _mm_unpacklo_epi8( a0_16, b0_16 );
 		a0_16 = _mm_unpackhi_epi8( a0_16, b0_16 );
@@ -728,10 +717,10 @@ void packb_nrlt16_u8s8s32o32
 		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
-		
+
 		// Last 4x16 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
-		
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+
 		// The 2nd, 3rd, and 4th 16byte chunk will be ignored, since its not part of the original data,
 		// but is here due to the packing in 4 16byte chunks format.
 		kr_new += 1;
@@ -745,9 +734,9 @@ void packb_nrlt16_u8s8s32o32
 			memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
 			memcpy( buf2, ( b + ( ldb * ( k_full_pieces + 2 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
 
-			a0_16 = _mm_loadu_epi8( buf0 );
-			b0_16 = _mm_loadu_epi8( buf1 );
-			c0_16 = _mm_loadu_epi8( buf2 );
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 );
+			c0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf2 );
 			d0_16 = _mm_setzero_si128();
 
 		}
@@ -756,8 +745,8 @@ void packb_nrlt16_u8s8s32o32
 			memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
 			memcpy( buf1, ( b + ( ldb * ( k_full_pieces + 1 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
 
-			a0_16 = _mm_loadu_epi8( buf0 );
-			b0_16 = _mm_loadu_epi8( buf1 );
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
+			b0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf1 );
 			c0_16 = _mm_setzero_si128();
 			d0_16 = _mm_setzero_si128();
 		}
@@ -765,7 +754,7 @@ void packb_nrlt16_u8s8s32o32
 		{
 			memcpy( buf0, ( b + ( ldb * ( k_full_pieces + 0 ) ) ), ( n0_partial_rem * sizeof( int8_t ) ) );
 
-			a0_16 = _mm_loadu_epi8( buf0 );
+			a0_16 = _mm_maskz_loadu_epi8( 0xFFFF, buf0 );
 			b0_16 = _mm_setzero_si128();
 			c0_16 = _mm_setzero_si128();
 			d0_16 = _mm_setzero_si128();
@@ -786,9 +775,9 @@ void packb_nrlt16_u8s8s32o32
 		a0_zmm = _mm512_inserti32x4( a0_zmm, a01_16, 0x1 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, d0_16, 0x2 );
 		a0_zmm = _mm512_inserti32x4( a0_zmm, c01_16, 0x3 );
-		
+
 		// Last 4x16 elements.
-		_mm512_storeu_epi64( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
+		_mm512_storeu_si512( pack_b_buffer_u8s8s32o32 + ( ( kr_new + 0 ) * NR ), a0_zmm );
 	}
 }
 #endif
diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h
new file mode 100644
index 0000000000..deb35e8e09
--- /dev/null
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_kern_macros.h
@@ -0,0 +1,222 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2022-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_S32_KERN_MACROS_H
+#define LPGEMM_S32_KERN_MACROS_H
+
+#include "../gelu_avx512.h"
+#include "../math_utils_avx512.h"
+
+#define S32_BETA_FMA(reg,scratch1,scratch2) /* reg += low 32 bits of ( scratch2 * scratch1 ) per lane; scratch1 is overwritten with the product. */ \
+	scratch1 = _mm512_mullo_epi32( scratch2, scratch1 ); \
+	reg = _mm512_add_epi32( scratch1, reg ); \
+
+#define S32_S32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) /* Loads 512 bits from c at row ( m_ir + m_ind ), offset ( n_ind * 16 ), then folds them into reg via S32_BETA_FMA; c and rs_c are taken from the expansion scope. */ \
+	scratch1 = _mm512_loadu_si512( c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \
+	S32_BETA_FMA(reg,scratch1,scratch2) \
+
+#define S32_S32_BETA_OP2(m_ir,m_ind,scratch1,scratch2) /* Applies S32_S32_BETA_OP to the token-pasted accumulators c_int32_<m_ind>p0 and p1, which must exist at the expansion site. */ \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p0,m_ir,m_ind,0,scratch1,scratch2); \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p1,m_ir,m_ind,1,scratch1,scratch2); \
+
+#define S32_S32_BETA_OP3(m_ir,m_ind,scratch1,scratch2) /* Same as S32_S32_BETA_OP2, covering accumulators p0 through p2. */ \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p0,m_ir,m_ind,0,scratch1,scratch2); \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p1,m_ir,m_ind,1,scratch1,scratch2); \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p2,m_ir,m_ind,2,scratch1,scratch2); \
+
+#define S32_S32_BETA_OP4(m_ir,m_ind,scratch1,scratch2) /* Same as S32_S32_BETA_OP2, covering accumulators p0 through p3. */ \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p0,m_ir,m_ind,0,scratch1,scratch2); \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p1,m_ir,m_ind,1,scratch1,scratch2); \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p2,m_ir,m_ind,2,scratch1,scratch2); \
+	S32_S32_BETA_OP(c_int32_ ## m_ind ## p3,m_ir,m_ind,3,scratch1,scratch2); \
+
+// Downscale beta op: loads 16 int8 values (mask 0xFFFF) from post_ops_attr.buf_downscale, sign-extends them to int32 and applies S32_BETA_FMA. The m_ir argument is not referenced in the body.
+#define S8_S32_BETA_OP(reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = \
+	_mm512_cvtepi8_epi32 \
+	( \
+	  _mm_maskz_loadu_epi8 \
+	  ( \
+		0xFFFF, \
+	    ( int8_t* )post_ops_attr.buf_downscale + \
+	    ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	    post_ops_attr.post_op_c_j + ( n_ind * 16 ) \
+	  ) \
+	); \
+	S32_BETA_FMA(reg,scratch1,scratch2) \
+
+#define S8_S32_BETA_OP2(m_ir,m_ind,scratch1,scratch2) /* Applies S8_S32_BETA_OP to the token-pasted accumulators c_int32_<m_ind>p0 and p1, which must exist at the expansion site. */ \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p0,m_ir,m_ind,0,scratch1,scratch2); \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p1,m_ir,m_ind,1,scratch1,scratch2); \
+
+#define S8_S32_BETA_OP3(m_ir,m_ind,scratch1,scratch2) /* Same as S8_S32_BETA_OP2, covering accumulators p0 through p2. */ \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p0,m_ir,m_ind,0,scratch1,scratch2); \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p1,m_ir,m_ind,1,scratch1,scratch2); \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p2,m_ir,m_ind,2,scratch1,scratch2); \
+
+#define S8_S32_BETA_OP4(m_ir,m_ind,scratch1,scratch2) /* Same as S8_S32_BETA_OP2, covering accumulators p0 through p3. */ \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p0,m_ir,m_ind,0,scratch1,scratch2); \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p1,m_ir,m_ind,1,scratch1,scratch2); \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p2,m_ir,m_ind,2,scratch1,scratch2); \
+	S8_S32_BETA_OP(c_int32_ ## m_ind ## p3,m_ir,m_ind,3,scratch1,scratch2); \
+
+// Default n < 16 beta macro: loads a full, unmasked 512-bit vector from buf_ and applies S32_BETA_FMA.
+#define S32_S32_BETA_OP_NLT16F(reg,buf_,scratch1,scratch2) \
+	scratch1 = _mm512_loadu_si512( buf_ ); \
+	S32_BETA_FMA(reg,scratch1,scratch2) \
+
+// Default n < 16 mask load beta macro: 32-bit lanes whose lmask bit is clear are loaded as zero; c and rs_c come from the expansion scope.
+#define S32_S32_BETA_OP_NLT16F_MASK(lmask,reg,m_ir,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = _mm512_maskz_loadu_epi32( lmask, c + ( rs_c * ( m_ir + m_ind ) ) + ( n_ind * 16 ) ); \
+	S32_BETA_FMA(reg,scratch1,scratch2) \
+
+// Downscale n < 16 mask load beta macro: masked int8 load from post_ops_attr.buf_downscale (bytes with a clear lmask bit read as zero), sign-extended to int32, then S32_BETA_FMA.
+#define S8_S32_BETA_OP_NLT16F_MASK(lmask,reg,m_ind,n_ind,scratch1,scratch2) \
+	scratch1 = _mm512_cvtepi8_epi32 \
+	( \
+	  _mm_maskz_loadu_epi8 \
+	  ( \
+	    lmask, \
+	    ( int8_t* )post_ops_attr.buf_downscale + \
+	    ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	    post_ops_attr.post_op_c_j + ( n_ind * 16 ) \
+	  ) \
+	); \
+	S32_BETA_FMA(reg,scratch1,scratch2) \
+
+// ReLU scale (Parametric ReLU):  f(x) = x, when x > 0 and f(x) = a*x when x <= 0. Uses relu_cmp_mask, selector1 and selector2 from the expansion scope.
+#define RELU_SCALE_OP_S32_AVX512(reg) \
+	/* Generate mask of elements <= selector1 (presumably an all-zero vector; confirm at the call site).*/ \
+	relu_cmp_mask = _mm512_cmple_epi32_mask( reg, selector1 ); \
+ \
+	/* Apply scaling (multiply by selector2) to the masked elements only; other lanes keep reg.*/ \
+	reg = _mm512_mask_mullo_epi32( reg, relu_cmp_mask, reg, selector2 ); \
+
+// Downscale macro: converts reg from int32 to float, multiplies by selector (cast to __m512) with round-to-nearest and exceptions suppressed, then converts the product back to int32 in reg.
+#define CVT_MULRND_CVT32(reg,selector) \
+	reg = \
+	_mm512_cvtps_epi32 \
+	( \
+	  _mm512_mul_round_ps \
+	  ( \
+		_mm512_cvtepi32_ps( reg ), \
+		( __m512 )selector, \
+		( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \
+	  ) \
+	) \
+
+// Downscale store macro: converts reg's int32 lanes to int8 with signed saturation and stores the lanes selected by mask_all1 (from the expansion scope) into post_ops_attr.buf_downscale.
+#define CVT_STORE_S32_S8(reg,m_ind,n_ind) \
+	_mm512_mask_cvtsepi32_storeu_epi8 \
+	( \
+	  ( int8_t* )post_ops_attr.buf_downscale + \
+	  ( post_ops_attr.rs_c_downscale * ( post_ops_attr.post_op_c_i + m_ind ) ) + \
+	  post_ops_attr.post_op_c_j + ( n_ind * 16 ), \
+	  mask_all1, reg \
+	) \
+
+// Downscale n < 16 macro: int32 -> float, multiply by selector with round-to-nearest, float -> int32. The body is identical to CVT_MULRND_CVT32.
+#define CVT_MULRND_CVT32_LT16(reg,selector) \
+	reg = \
+	_mm512_cvtps_epi32 \
+	( \
+	  _mm512_mul_round_ps \
+	  ( \
+	    _mm512_cvtepi32_ps( reg ), \
+	    ( __m512 )selector, \
+	    ( _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) \
+	  ) \
+	) \
+
+/* TANH GeLU (x) = 0.5* x * (1 + tanh ( 0.797884 * ( x + ( 0.044715 * x^3 ) ) ) ); int32 wrapper around the float implementation. */
+#define GELU_TANH_S32_AVX512(reg, y, r, r2, x, z, dn, x_tanh, q) \
+\
+	y = _mm512_cvtepi32_ps( reg ); /* int32 lanes -> float */ \
+\
+	GELU_TANH_F32_AVX512_DEF(y, r, r2, x, z, dn, x_tanh, q); /* defined outside this header; presumably leaves the GeLU result in y -- confirm */ \
+\
+	reg = _mm512_cvtps_epi32( y ); /* float lanes -> int32 */ \
+
+/* ERF GeLU (x) = 0.5* x * (1 + erf (x * 0.707107 )); int32 wrapper around the float implementation. */
+#define GELU_ERF_S32_AVX512(reg, y, r, x, x_erf) \
+\
+	y = _mm512_cvtepi32_ps( reg ); /* int32 lanes -> float */ \
+\
+	GELU_ERF_F32_AVX512_DEF(y, r, x, x_erf); /* defined outside this header; presumably leaves the GeLU result in y -- confirm */ \
+\
+	reg = _mm512_cvtps_epi32( y ); /* float lanes -> int32 */ \
+
+#define CLIP_S32_AVX512(reg, min, max) /* Clamps each signed 32-bit lane of reg: first raised to at least min, then capped at max. */ \
+\
+	reg = _mm512_min_epi32( _mm512_max_epi32( reg, min ), max ); \
+
+// Load helper macros: despite the name, these store registers reg_base##p0..pN into temp_buf, one 512-bit store per register at ( N + offset ) * stride.
+#define S32_GELU_LOAD1R_1C(temp_buf,offset,stride,reg_base) \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \
+
+#define S32_GELU_LOAD1R_2C(temp_buf,offset,stride,reg_base) \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \
+
+#define S32_GELU_LOAD1R_3C(temp_buf,offset,stride,reg_base) \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ), reg_base ## p2); \
+
+#define S32_GELU_LOAD1R_4C(temp_buf,offset,stride,reg_base) \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ), reg_base ## p0); \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ), reg_base ## p1); \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ), reg_base ## p2); \
+	_mm512_storeu_si512( ( temp_buf ) + ( ( 3 + offset ) * ( stride ) ), reg_base ## p3); \
+
+// Store helper macros: despite the name, these load registers reg_base##p0..pN back from temp_buf, one 512-bit load per register at ( N + offset ) * stride.
+#define S32_GELU_STORE1R_1C(temp_buf,offset,stride,reg_base) \
+	reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \
+
+#define S32_GELU_STORE1R_2C(temp_buf,offset,stride,reg_base) \
+	reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \
+	reg_base ## p1 = _mm512_loadu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \
+
+#define S32_GELU_STORE1R_3C(temp_buf,offset,stride,reg_base) \
+	reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \
+	reg_base ## p1 = _mm512_loadu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \
+	reg_base ## p2 = _mm512_loadu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ) ); \
+
+#define S32_GELU_STORE1R_4C(temp_buf,offset,stride,reg_base) \
+	reg_base ## p0 = _mm512_loadu_si512( ( temp_buf ) + ( ( 0 + offset ) * ( stride ) ) ); \
+	reg_base ## p1 = _mm512_loadu_si512( ( temp_buf ) + ( ( 1 + offset ) * ( stride ) ) ); \
+	reg_base ## p2 = _mm512_loadu_si512( ( temp_buf ) + ( ( 2 + offset ) * ( stride ) ) ); \
+	reg_base ## p3 = _mm512_loadu_si512( ( temp_buf ) + ( ( 3 + offset ) * ( stride ) ) ); \
+
+#endif // LPGEMM_S32_KERN_MACROS_H
diff --git a/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_memcpy_macros.h b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_memcpy_macros.h
new file mode 100644
index 0000000000..fc5f0158b7
--- /dev/null
+++ b/kernels/zen4/lpgemm/u8s8s32/lpgemm_s32_memcpy_macros.h
@@ -0,0 +1,350 @@
+/*
+
+   BLIS
+   An object-based framework for developing high-performance BLAS-like
+   libraries.
+
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+	- Redistributions of source code must retain the above copyright
+	  notice, this list of conditions and the following disclaimer.
+	- Redistributions in binary form must reproduce the above copyright
+	  notice, this list of conditions and the following disclaimer in the
+	  documentation and/or other materials provided with the distribution.
+	- Neither the name(s) of the copyright holder(s) nor the names of its
+	  contributors may be used to endorse or promote products derived
+	  from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef LPGEMM_S32_MEMCPY_MACROS_H
+#define LPGEMM_S32_MEMCPY_MACROS_H
+
+// Copy macros to replace memcpy usage.
+//
+#define PASTE_S32_2TOKEN(tok,id) \
+	tok ## id
+
+#define PASTE_S32_3TOKEN(tok1,id,tok2) \
+	tok1 ## id ## tok2
+
+#define MEMCPY_S32_LT16_INIT(size) \
+	dim_t part8 = ( size ) >> 3; \
+	dim_t part4 = ( ( size ) - ( part8 << 3 ) ) >> 2; \
+	dim_t part4_rem = ( size ) % 4; \
+	dim_t frin_offset = 0; \
+
+#define MEMCPY_S32_LT16_REINIT(size) \
+	part8 = ( size ) >> 3; \
+	part4 = ( ( size ) - ( part8 << 3 ) ) >> 2; \
+	part4_rem = ( size ) % 4; \
+	frin_offset = 0; \
+
+// Copy for size < 4 for uint8 elements.
+#define MEMCPY_S32GM_LT4_UINT8(dst_,src_,size) \
+	{ \
+		uint8_t* dst = ( uint8_t* )( dst_ ); \
+		uint8_t* src = ( uint8_t* )( src_ ); \
+		if ( ( size ) == 1 ) \
+		{ \
+			dst[0] = src[0]; \
+		} \
+		else if ( ( size ) == 2 ) \
+		{ \
+			dst[0] = src[0]; \
+			dst[1] = src[1]; \
+		} \
+		else if ( ( size ) == 3 )\
+		{ \
+			dst[0] = src[0]; \
+			dst[1] = src[1]; \
+			dst[2] = src[2]; \
+		} \
+	} \
+
+// NR modulo 4 case, remainder items are assigned as 1, 2, or 3 elements.
+//
+// Remainder 1 case for MR=6,5,4,3,2,1
+#define MEMCPY_S32_LT16_REM4_PART1_4ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id0) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id0) )[frin_offset + fid]; \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id1) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id1) )[frin_offset + fid]; \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id2) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id2) )[frin_offset + fid]; \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id3) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id3) )[frin_offset + fid]; \
+
+#define MEMCPY_S32_LT16_REM4_PART1_2ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id0) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id0) )[frin_offset + fid]; \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id1) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id1) )[frin_offset + fid]; \
+
+#define MEMCPY_S32_LT16_REM4_PART1_1ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id0) )[frin_offset + fid] = ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id0) )[frin_offset + fid]; \
+
+#define MEMCPY_S32_LT16_REM4_PART1_6ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_4ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_2ELE_INT32(dst,src,SINGLE_TYPE,fid,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART1_5ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_4ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_1ELE_INT32(dst,src,SINGLE_TYPE,fid,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART1_3ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_2ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_1ELE_INT32(dst,src,SINGLE_TYPE,fid,id2,id5,id2,id3,id4,id5) \
+
+// Remainder 2 case for MR=6,5,4,3,2,1
+#define MEMCPY_S32_LT16_REM4_PART2_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id0) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id0) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id1) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id1) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id2) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id2) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id3) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id3) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+
+#define MEMCPY_S32_LT16_REM4_PART2_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id0) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id0) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id1) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id1) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+
+#define MEMCPY_S32_LT16_REM4_PART2_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	ds1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(dst,id0) ) + frin_offset ); \
+	sr1 = ( CAST_TYPE* )( ( ( SINGLE_TYPE* )PASTE_S32_2TOKEN(src,id0) ) + frin_offset ); \
+	ds1[0] = sr1[0]; \
+
+#define MEMCPY_S32_LT16_REM4_PART2_6ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART2_5ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART2_3ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id2,id5,id2,id3,id4,id5) \
+
+// Remainder 3 case for MR=6,5,4,3,2,1
+#define MEMCPY_S32_LT16_REM4_PART3_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_4ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART3_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_2ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART3_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART2_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART1_1ELE_INT32(dst,src,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART3_6ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART3_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART3_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART3_5ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART3_4ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART3_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_REM4_PART3_3ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART3_2ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_REM4_PART3_1ELE_INT32(dst,src,CAST_TYPE,SINGLE_TYPE,fid,id2,id5,id2,id3,id4,id5) \
+
+// Copy macro for NR' < 4 case.
+// Precondition: sizeof(CAST_TYPE) == 2 * sizeof(SINGLE_TYPE).
+#define MEMCPY_S32_LT16_REM4_INT32(dst,src,NRID,CAST_TYPE,SINGLE_TYPE) \
+	if ( part4_rem == 1 ) \
+	{ \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_REM4_PART1_,NRID,ELE_INT32)(dst,src,SINGLE_TYPE,0,0,1,2,3,4,5) \
+	} \
+	else if ( part4_rem == 2 ) \
+	{ \
+		CAST_TYPE* ds1; \
+		CAST_TYPE* sr1; \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_REM4_PART2_,NRID,ELE_INT32)(dst,src,CAST_TYPE,SINGLE_TYPE,0,1,2,3,4,5) \
+	} \
+	else if ( part4_rem == 3 )\
+	{ \
+		CAST_TYPE* ds1; \
+		CAST_TYPE* sr1; \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_REM4_PART3_,NRID,ELE_INT32)(dst,src,CAST_TYPE,SINGLE_TYPE,2,0,1,2,3,4,5) \
+	} \
+
+// int32_t 256 Store Load
+#define MEMCPY_S32_LT16_STR32_M256_4ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id0), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id0) ) ); \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id1), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id1) ) ); \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id2), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id2) ) ); \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id3), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id3) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M256_2ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id0), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id0) ) ); \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id1), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id1) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M256_1ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm256_storeu_epi32( PASTE_S32_2TOKEN(dst,id0), _mm256_loadu_epi32( PASTE_S32_2TOKEN(src,id0) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M256_6ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_4ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_2ELE_INT32(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M256_5ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_4ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_1ELE_INT32(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M256_3ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_2ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_1ELE_INT32(dst,src,id2,id5,id2,id3,id4,id5) \
+
+// int32_t 128 Store Load
+#define MEMCPY_S32_LT16_STR32_M128_4ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id0) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id0) + frin_offset ) ) ); \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id1) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id1) + frin_offset ) ) ); \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id2) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id2) + frin_offset ) ) ); \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id3) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id3) + frin_offset ) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M128_2ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id0) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id0) + frin_offset ) ) ); \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id1) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id1) + frin_offset ) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M128_1ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm_storeu_epi32( ( PASTE_S32_2TOKEN(dst,id0) + frin_offset ), _mm_loadu_epi32( ( PASTE_S32_2TOKEN(src,id0) + frin_offset ) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M128_6ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_4ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_2ELE_INT32(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M128_5ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_4ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_1ELE_INT32(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M128_3ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_2ELE_INT32(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_1ELE_INT32(dst,src,id2,id5,id2,id3,id4,id5) \
+
+// float 256 Store Load
+#define MEMCPY_S32_LT16_STR32_M256_4ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id0), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id0) ) ); \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id1), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id1) ) ); \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id2), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id2) ) ); \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id3), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id3) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M256_2ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id0), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id0) ) ); \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id1), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id1) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M256_1ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm256_storeu_ps( PASTE_S32_2TOKEN(dst,id0), _mm256_loadu_ps( PASTE_S32_2TOKEN(src,id0) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M256_6ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_4ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_2ELE_FLOAT(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M256_5ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_4ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_1ELE_FLOAT(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M256_3ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_2ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M256_1ELE_FLOAT(dst,src,id2,id5,id2,id3,id4,id5) \
+
+// float 128 Store Load
+#define MEMCPY_S32_LT16_STR32_M128_4ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id0) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id0) + frin_offset ) ) ); \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id1) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id1) + frin_offset ) ) ); \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id2) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id2) + frin_offset ) ) ); \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id3) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id3) + frin_offset ) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M128_2ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id0) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id0) + frin_offset ) ) ); \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id1) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id1) + frin_offset ) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M128_1ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	_mm_storeu_ps( ( PASTE_S32_2TOKEN(dst,id0) + frin_offset ), _mm_loadu_ps( ( PASTE_S32_2TOKEN(src,id0) + frin_offset ) ) ); \
+
+#define MEMCPY_S32_LT16_STR32_M128_6ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_4ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_2ELE_FLOAT(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M128_5ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_4ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_1ELE_FLOAT(dst,src,id4,id5,id2,id3,id4,id5) \
+
+#define MEMCPY_S32_LT16_STR32_M128_3ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_2ELE_FLOAT(dst,src,id0,id1,id2,id3,id4,id5) \
+	MEMCPY_S32_LT16_STR32_M128_1ELE_FLOAT(dst,src,id2,id5,id2,id3,id4,id5) \
+
+// Main macro for int32_t copy for lt 16 elems.
+#define MEMCPY_S32_LT16_INT32(NRID,CAST2_TYPE,SINGLE_TYPE,dst,src) \
+{ \
+	if ( part8 == 1 ) \
+	{ \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_STR32_M256_,NRID,ELE_INT32)(dst,src,0,1,2,3,4,5) \
+		frin_offset += 8; \
+	} \
+	if ( part4 == 1 ) \
+	{ \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_STR32_M128_,NRID,ELE_INT32)(dst,src,0,1,2,3,4,5) \
+		frin_offset += 4; \
+	} \
+	MEMCPY_S32_LT16_REM4_INT32(dst,src,NRID,CAST2_TYPE,SINGLE_TYPE) \
+} \
+
+// Reusing the int32_t based macros for int8 copy by modifying the types.
+// Main macro for int8_t copy for lt 16 elems.
+#define MEMCPY_S32_LT16_INT8(NRID,CAST8_TYPE,CAST4_TYPE,CAST2_TYPE,SINGLE_TYPE,dst,src) \
+{ \
+	if ( part8 == 1 ) \
+	{ \
+		CAST8_TYPE* ds1; \
+		CAST8_TYPE* sr1; \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_REM4_PART2_,NRID,ELE_INT32)(dst,src,CAST8_TYPE,SINGLE_TYPE,0,1,2,3,4,5) \
+		frin_offset += 8; \
+	} \
+	if ( part4 == 1 ) \
+	{ \
+		CAST4_TYPE* ds1; \
+		CAST4_TYPE* sr1; \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_REM4_PART2_,NRID,ELE_INT32)(dst,src,CAST4_TYPE,SINGLE_TYPE,0,1,2,3,4,5) \
+		frin_offset += 4; \
+	} \
+	MEMCPY_S32_LT16_REM4_INT32(dst,src,NRID,CAST2_TYPE,SINGLE_TYPE) \
+} \
+
+// Main macro for float copy for lt 16 elems.
+#define MEMCPY_S32_LT16_FLOAT(NRID,CAST2_TYPE,SINGLE_TYPE,dst,src) \
+{ \
+	if ( part8 == 1 ) \
+	{ \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_STR32_M256_,NRID,ELE_FLOAT)(dst,src,0,1,2,3,4,5) \
+		frin_offset += 8; \
+	} \
+	if ( part4 == 1 ) \
+	{ \
+		PASTE_S32_3TOKEN(MEMCPY_S32_LT16_STR32_M128_,NRID,ELE_FLOAT)(dst,src,0,1,2,3,4,5) \
+		frin_offset += 4; \
+	} \
+	MEMCPY_S32_LT16_REM4_INT32(dst,src,NRID,CAST2_TYPE,SINGLE_TYPE) \
+} \
+
+#endif //LPGEMM_S32_MEMCPY_MACROS_H
diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c
index 1dcb038397..d64b68a60b 100644
--- a/ref_kernels/1/bli_scal2v_ref.c
+++ b/ref_kernels/1/bli_scal2v_ref.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -69,7 +70,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 		); \
 		return; \
 	} \
-	else if ( PASTEMAC(ch,eq0)( *alpha ) ) \
+	else if ( PASTEMAC(ch,eq1)( *alpha ) ) \
 	{ \
 		/* If alpha is one, use copyv. */ \
 \
@@ -79,7 +80,7 @@ void PASTEMAC3(ch,opname,arch,suf) \
 \
 		copyv_p \
 		( \
-		  BLIS_NO_CONJUGATE, \
+		  conjx, \
 		  n, \
 		  x, incx, \
 		  y, incy, \
diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c
index c98f1b2503..2bee2eaca0 100644
--- a/ref_kernels/1m/bli_packm_cxk_ref.c
+++ b/ref_kernels/1m/bli_packm_cxk_ref.c
@@ -5,6 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -1720,3 +1721,250 @@ void PASTEMAC3(ch,opname,arch,suf) \
 
 INSERT_GENTFUNC_BASIC3( packm_24xk, 24, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
 
+#undef  GENTFUNC
+#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \
+\
+void PASTEMAC3(ch,opname,arch,suf) \
+     ( \
+       conj_t           conja, \
+       pack_t           schema, \
+       dim_t            cdim, \
+       dim_t            n, \
+       dim_t            n_max, \
+       ctype*  restrict kappa, \
+       ctype*  restrict a, inc_t inca, inc_t lda, \
+       ctype*  restrict p,             inc_t ldp, \
+       cntx_t* restrict cntx \
+     ) \
+{ \
+    ctype* restrict kappa_cast = kappa; \
+    ctype* restrict alpha1     = a; \
+    ctype* restrict pi1        = p; \
+\
+	if ( cdim == mnr ) \
+	{ \
+		if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \
+		{ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( dim_t k = n; k != 0; --k ) \
+				{ \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +16*inca), *(pi1 +16) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +17*inca), *(pi1 +17) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +18*inca), *(pi1 +18) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +19*inca), *(pi1 +19) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +20*inca), *(pi1 +20) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +21*inca), *(pi1 +21) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +22*inca), *(pi1 +22) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +23*inca), *(pi1 +23) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +24*inca), *(pi1 +24) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +25*inca), *(pi1 +25) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +26*inca), *(pi1 +26) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +27*inca), *(pi1 +27) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +28*inca), *(pi1 +28) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +29*inca), *(pi1 +29) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +30*inca), *(pi1 +30) ); \
+					PASTEMAC(ch,copyjs)( *(alpha1 +31*inca), *(pi1 +31) ); \
+\
+					alpha1 += lda; \
+					pi1    += ldp; \
+				} \
+			} \
+			else \
+			{ \
+				for ( dim_t k = n; k != 0; --k ) \
+				{ \
+					PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \
+					PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +16*inca), *(pi1 +16) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +17*inca), *(pi1 +17) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +18*inca), *(pi1 +18) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +19*inca), *(pi1 +19) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +20*inca), *(pi1 +20) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +21*inca), *(pi1 +21) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +22*inca), *(pi1 +22) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +23*inca), *(pi1 +23) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +24*inca), *(pi1 +24) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +25*inca), *(pi1 +25) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +26*inca), *(pi1 +26) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +27*inca), *(pi1 +27) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +28*inca), *(pi1 +28) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +29*inca), *(pi1 +29) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +30*inca), *(pi1 +30) ); \
+					PASTEMAC(ch,copys)( *(alpha1 +31*inca), *(pi1 +31) ); \
+\
+					alpha1 += lda; \
+					pi1    += ldp; \
+				} \
+			} \
+		} \
+		else \
+		{ \
+			if ( bli_is_conj( conja ) ) \
+			{ \
+				for ( dim_t k = n; k != 0; --k ) \
+				{ \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +24*inca), *(pi1 +24) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +25*inca), *(pi1 +25) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +26*inca), *(pi1 +26) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +27*inca), *(pi1 +27) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +28*inca), *(pi1 +28) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +29*inca), *(pi1 +29) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +30*inca), *(pi1 +30) ); \
+					PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +31*inca), *(pi1 +31) ); \
+\
+					alpha1 += lda; \
+					pi1    += ldp; \
+				} \
+			} \
+			else \
+			{ \
+				for ( dim_t k = n; k != 0; --k ) \
+				{ \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +24*inca), *(pi1 +24) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +25*inca), *(pi1 +25) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +26*inca), *(pi1 +26) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +27*inca), *(pi1 +27) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +28*inca), *(pi1 +28) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +29*inca), *(pi1 +29) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +30*inca), *(pi1 +30) ); \
+					PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +31*inca), *(pi1 +31) ); \
+\
+					alpha1 += lda; \
+					pi1    += ldp; \
+				} \
+			} \
+		} \
+	} \
+	else /* if ( cdim < mnr ) */ \
+	{ \
+		PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
+		( \
+		  0, \
+		  BLIS_NONUNIT_DIAG, \
+		  BLIS_DENSE, \
+		  ( trans_t )conja, \
+		  cdim, \
+		  n, \
+		  kappa, \
+		  a, inca, lda, \
+		  p,    1, ldp, \
+		  cntx, \
+		  NULL  \
+		); \
+\
+		/* if ( cdim < mnr ) */ \
+		{ \
+			const dim_t     i      = cdim; \
+			const dim_t     m_edge = mnr - cdim; \
+			const dim_t     n_edge = n_max; \
+			ctype* restrict p_cast = p; \
+			ctype* restrict p_edge = p_cast + (i  )*1; \
+\
+			PASTEMAC(ch,set0s_mxn) \
+			( \
+			  m_edge, \
+			  n_edge, \
+			  p_edge, 1, ldp  \
+			); \
+		} \
+	} \
+\
+	if ( n < n_max ) \
+	{ \
+		const dim_t     j      = n; \
+		const dim_t     m_edge = mnr; \
+		const dim_t     n_edge = n_max - n; \
+		ctype* restrict p_cast = p; \
+		ctype* restrict p_edge = p_cast + (j  )*ldp; \
+\
+		PASTEMAC(ch,set0s_mxn) \
+		( \
+		  m_edge, \
+		  n_edge, \
+		  p_edge, 1, ldp  \
+		); \
+	} \
+}
+
+INSERT_GENTFUNC_BASIC3( packm_32xk, 32, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX )
+
+
diff --git a/sandbox/power10/POWER10.md b/sandbox/power10/POWER10.md
index a9b19c5a85..a5071159f3 100644
--- a/sandbox/power10/POWER10.md
+++ b/sandbox/power10/POWER10.md
@@ -67,5 +67,5 @@ Ensure that you have GCC 10.2 or greater.
 
 #### References
 
-* [bfloat16 wiki](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format)
-* [IEEE float16 wiki](https://en.wikipedia.org/wiki/Half-precision_floating-point_format)
\ No newline at end of file
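+* [bfloat16 wiki](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format)
+* [IEEE float16 wiki](https://en.wikipedia.org/wiki/Half-precision_floating-point_format)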
diff --git a/sandbox/power10/i4_macros.h b/sandbox/power10/i4_macros.h
index f4500bc934..b0767abcef 100644
--- a/sandbox/power10/i4_macros.h
+++ b/sandbox/power10/i4_macros.h
@@ -542,4 +542,4 @@
             col_m_order_2_kleft1(dest, matrix, (panel+ir/2), rs, cs); \
         } \
     }
-    
\ No newline at end of file
+    
diff --git a/sandbox/ref99/old/blx_gemm_front.c b/sandbox/ref99/old/blx_gemm_front.c
index 399f750a5c..8fa51d8e48 100644
--- a/sandbox/ref99/old/blx_gemm_front.c
+++ b/sandbox/ref99/old/blx_gemm_front.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2017 - 2019, Advanced Micro Devices, Inc.
+   Copyright (C) 2017-2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -98,7 +98,7 @@ void blx_gemm_front
 	}
 
 	{
-		// A sort of hack for communicating the desired pach schemas for A and
+		// A sort of hack for communicating the desired pack schemas for A and
 		// B to bli_gemm_cntl_create() (via bli_l3_thread_decorator() and
 		// bli_l3_cntl_create_if()). This allows us to access the schemas from
 		// the control tree, which hopefully reduces some confusion,
diff --git a/so_version b/so_version
index 436b8f7fa7..a7ff563cc4 100644
--- a/so_version
+++ b/so_version
@@ -1,2 +1,2 @@
 4
-0.0
+1.0
diff --git a/test/3/octave/subplot_tight.m b/test/3/octave/subplot_tight.m
index d84ea31888..665ab7bce0 100644
--- a/test/3/octave/subplot_tight.m
+++ b/test/3/octave/subplot_tight.m
@@ -1,126 +1,126 @@
-%
-% Copyright (c) 2016, Nikolay S.
-% All rights reserved.
-% 
-% Redistribution and use in source and binary forms, with or without
-% modification, are permitted provided that the following conditions are
-% met:
-% 
-%     * Redistributions of source code must retain the above copyright
-%       notice, this list of conditions and the following disclaimer.
-%     * Redistributions in binary form must reproduce the above copyright
-%       notice, this list of conditions and the following disclaimer in
-%       the documentation and/or other materials provided with the distribution
-% 
-% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-% POSSIBILITY OF SUCH DAMAGE.
-%
-
-function vargout=subplot_tight(m, n, p, margins, varargin)
-%% subplot_tight
-% A subplot function substitude with margins user tunabble parameter.
-%
-%% Syntax
-%  h=subplot_tight(m, n, p);
-%  h=subplot_tight(m, n, p, margins);
-%  h=subplot_tight(m, n, p, margins, subplotArgs...);
-%
-%% Description
-% Our goal is to grant the user the ability to define the margins between neighbouring
-%  subplots. Unfotrtunately Matlab subplot function lacks this functionality, and the
-%  margins between subplots can reach 40% of figure area, which is pretty lavish. While at
-%  the begining the function was implememnted as wrapper function for Matlab function
-%  subplot, it was modified due to axes del;etion resulting from what Matlab subplot
-%  detected as overlapping. Therefore, the current implmenetation makes no use of Matlab
-%  subplot function, using axes instead. This can be problematic, as axis and subplot
-%  parameters are quie different. Set isWrapper to "True" to return to wrapper mode, which
-%  fully supports subplot format.
-%
-%% Input arguments (defaults exist):
-%   margins- two elements vector [vertical,horizontal] defining the margins between
-%        neighbouring axes. Default value is 0.04
-%
-%% Output arguments
-%   same as subplot- none, or axes handle according to function call.
-%
-%% Issues & Comments
-%  - Note that if additional elements are used in order to be passed to subplot, margins
-%     parameter must be defined. For default margins value use empty element- [].
-%  - 
-%
-%% Example
-% close all;
-% img=imread('peppers.png');
-% figSubplotH=figure('Name', 'subplot');
-% figSubplotTightH=figure('Name', 'subplot_tight');
-% nElems=17;
-% subplotRows=ceil(sqrt(nElems)-1);
-% subplotRows=max(1, subplotRows);
-% subplotCols=ceil(nElems/subplotRows);
-% for iElem=1:nElems
-%    figure(figSubplotH);
-%    subplot(subplotRows, subplotCols, iElem);
-%    imshow(img);
-%    figure(figSubplotTightH);
-%    subplot_tight(subplotRows, subplotCols, iElem, [0.0001]);
-%    imshow(img);
-% end
-%
-%% See also
-%  - subplot
-%
-%% Revision history
-% First version: Nikolay S. 2011-03-29.
-% Last update:   Nikolay S. 2012-05-24.
-%
-% *List of Changes:*
-% 2012-05-24
-%  Non wrapping mode (based on axes command) added, to deal with an issue of disappearing
-%     subplots occuring with massive axes.
-
-%% Default params
-isWrapper=false;
-if (nargin<4) || isempty(margins)
-    margins=[0.04,0.04]; % default margins value- 4% of figure
-end
-if length(margins)==1
-    margins(2)=margins;
-end
-
-%note n and m are switched as Matlab indexing is column-wise, while subplot indexing is row-wise :(
-[subplot_col,subplot_row]=ind2sub([n,m],p);  
-
-
-height=(1-(m+1)*margins(1))/m; % single subplot height
-width=(1-(n+1)*margins(2))/n;  % single subplot width
-
-% note subplot suppors vector p inputs- so a merged subplot of higher dimentions will be created
-subplot_cols=1+max(subplot_col)-min(subplot_col); % number of column elements in merged subplot 
-subplot_rows=1+max(subplot_row)-min(subplot_row); % number of row elements in merged subplot   
-
-merged_height=subplot_rows*( height+margins(1) )- margins(1);   % merged subplot height
-merged_width= subplot_cols*( width +margins(2) )- margins(2);   % merged subplot width
-
-merged_bottom=(m-max(subplot_row))*(height+margins(1)) +margins(1); % merged subplot bottom position
-merged_left=min(subplot_col)*(width+margins(2))-width;              % merged subplot left position
-pos=[merged_left, merged_bottom, merged_width, merged_height];
-
-
-if isWrapper
-   h=subplot(m, n, p, varargin{:}, 'Units', 'Normalized', 'Position', pos);
-else
-   h=axes('Position', pos, varargin{:});
-end
-
-if nargout==1
-   vargout=h;
-end
+%
+% Copyright (c) 2016, Nikolay S.
+% All rights reserved.
+% 
+% Redistribution and use in source and binary forms, with or without
+% modification, are permitted provided that the following conditions are
+% met:
+% 
+%     * Redistributions of source code must retain the above copyright
+%       notice, this list of conditions and the following disclaimer.
+%     * Redistributions in binary form must reproduce the above copyright
+%       notice, this list of conditions and the following disclaimer in
+%       the documentation and/or other materials provided with the distribution
+% 
+% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+% POSSIBILITY OF SUCH DAMAGE.
+%
+
+function vargout=subplot_tight(m, n, p, margins, varargin)
+%% subplot_tight
+% A subplot function substitute with a user-tunable margins parameter.
+%
+%% Syntax
+%  h=subplot_tight(m, n, p);
+%  h=subplot_tight(m, n, p, margins);
+%  h=subplot_tight(m, n, p, margins, subplotArgs...);
+%
+%% Description
+% Our goal is to grant the user the ability to define the margins between neighbouring
+%  subplots. Unfortunately, the Matlab subplot function lacks this functionality, and the
+%  margins between subplots can reach 40% of figure area, which is pretty lavish. While at
+%  the beginning the function was implemented as a wrapper function for the Matlab function
+%  subplot, it was modified due to axes deletion resulting from what Matlab subplot
+%  detected as overlapping. Therefore, the current implementation makes no use of the Matlab
+%  subplot function, using axes instead. This can be problematic, as axes and subplot
+%  parameters are quite different. Set isWrapper to true to return to wrapper mode, which
+%  fully supports subplot format.
+%
+%% Input arguments (defaults exist):
+%   margins- two elements vector [vertical,horizontal] defining the margins between
+%        neighbouring axes. Default value is 0.04
+%
+%% Output arguments
+%   same as subplot- none, or axes handle according to function call.
+%
+%% Issues & Comments
+%  - Note that if additional elements are used in order to be passed to subplot, margins
+%     parameter must be defined. For default margins value use empty element- [].
+%  - 
+%
+%% Example
+% close all;
+% img=imread('peppers.png');
+% figSubplotH=figure('Name', 'subplot');
+% figSubplotTightH=figure('Name', 'subplot_tight');
+% nElems=17;
+% subplotRows=ceil(sqrt(nElems)-1);
+% subplotRows=max(1, subplotRows);
+% subplotCols=ceil(nElems/subplotRows);
+% for iElem=1:nElems
+%    figure(figSubplotH);
+%    subplot(subplotRows, subplotCols, iElem);
+%    imshow(img);
+%    figure(figSubplotTightH);
+%    subplot_tight(subplotRows, subplotCols, iElem, [0.0001]);
+%    imshow(img);
+% end
+%
+%% See also
+%  - subplot
+%
+%% Revision history
+% First version: Nikolay S. 2011-03-29.
+% Last update:   Nikolay S. 2012-05-24.
+%
+% *List of Changes:*
+% 2012-05-24
+%  Non-wrapping mode (based on axes command) added, to deal with an issue of disappearing
+%     subplots occurring with massive axes.
+
+%% Default params
+isWrapper=false;
+if (nargin<4) || isempty(margins)
+    margins=[0.04,0.04]; % default margins value- 4% of figure
+end
+if length(margins)==1
+    margins(2)=margins;
+end
+
+%note n and m are switched as Matlab indexing is column-wise, while subplot indexing is row-wise :(
+[subplot_col,subplot_row]=ind2sub([n,m],p);  
+
+
+height=(1-(m+1)*margins(1))/m; % single subplot height
+width=(1-(n+1)*margins(2))/n;  % single subplot width
+
+% note subplot supports vector p inputs- so a merged subplot of higher dimensions will be created
+subplot_cols=1+max(subplot_col)-min(subplot_col); % number of column elements in merged subplot 
+subplot_rows=1+max(subplot_row)-min(subplot_row); % number of row elements in merged subplot   
+
+merged_height=subplot_rows*( height+margins(1) )- margins(1);   % merged subplot height
+merged_width= subplot_cols*( width +margins(2) )- margins(2);   % merged subplot width
+
+merged_bottom=(m-max(subplot_row))*(height+margins(1)) +margins(1); % merged subplot bottom position
+merged_left=min(subplot_col)*(width+margins(2))-width;              % merged subplot left position
+pos=[merged_left, merged_bottom, merged_width, merged_height];
+
+
+if isWrapper
+   h=subplot(m, n, p, varargin{:}, 'Units', 'Normalized', 'Position', pos);
+else
+   h=axes('Position', pos, varargin{:});
+end
+
+if nargout==1
+   vargout=h;
+end
diff --git a/test/studies/thunderx2/plot_thunderx2_perf.m b/test/studies/thunderx2/plot_thunderx2_perf.m
index 4712dc6946..6dde941234 100644
--- a/test/studies/thunderx2/plot_thunderx2_perf.m
+++ b/test/studies/thunderx2/plot_thunderx2_perf.m
@@ -110,4 +110,4 @@
     set(fig1,'PaperUnits','normalized');
     set(fig1,'PaperPosition', [0 0 1 1]);
     print(fig1, 'thunderx2-mt-56cores-20180830', '-dpdf')
-end
\ No newline at end of file
+end
diff --git a/test/sup/octave/subplot_tight.m b/test/sup/octave/subplot_tight.m
index d84ea31888..665ab7bce0 100644
--- a/test/sup/octave/subplot_tight.m
+++ b/test/sup/octave/subplot_tight.m
@@ -1,126 +1,126 @@
-%
-% Copyright (c) 2016, Nikolay S.
-% All rights reserved.
-% 
-% Redistribution and use in source and binary forms, with or without
-% modification, are permitted provided that the following conditions are
-% met:
-% 
-%     * Redistributions of source code must retain the above copyright
-%       notice, this list of conditions and the following disclaimer.
-%     * Redistributions in binary form must reproduce the above copyright
-%       notice, this list of conditions and the following disclaimer in
-%       the documentation and/or other materials provided with the distribution
-% 
-% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-% POSSIBILITY OF SUCH DAMAGE.
-%
-
-function vargout=subplot_tight(m, n, p, margins, varargin)
-%% subplot_tight
-% A subplot function substitude with margins user tunabble parameter.
-%
-%% Syntax
-%  h=subplot_tight(m, n, p);
-%  h=subplot_tight(m, n, p, margins);
-%  h=subplot_tight(m, n, p, margins, subplotArgs...);
-%
-%% Description
-% Our goal is to grant the user the ability to define the margins between neighbouring
-%  subplots. Unfotrtunately Matlab subplot function lacks this functionality, and the
-%  margins between subplots can reach 40% of figure area, which is pretty lavish. While at
-%  the begining the function was implememnted as wrapper function for Matlab function
-%  subplot, it was modified due to axes del;etion resulting from what Matlab subplot
-%  detected as overlapping. Therefore, the current implmenetation makes no use of Matlab
-%  subplot function, using axes instead. This can be problematic, as axis and subplot
-%  parameters are quie different. Set isWrapper to "True" to return to wrapper mode, which
-%  fully supports subplot format.
-%
-%% Input arguments (defaults exist):
-%   margins- two elements vector [vertical,horizontal] defining the margins between
-%        neighbouring axes. Default value is 0.04
-%
-%% Output arguments
-%   same as subplot- none, or axes handle according to function call.
-%
-%% Issues & Comments
-%  - Note that if additional elements are used in order to be passed to subplot, margins
-%     parameter must be defined. For default margins value use empty element- [].
-%  - 
-%
-%% Example
-% close all;
-% img=imread('peppers.png');
-% figSubplotH=figure('Name', 'subplot');
-% figSubplotTightH=figure('Name', 'subplot_tight');
-% nElems=17;
-% subplotRows=ceil(sqrt(nElems)-1);
-% subplotRows=max(1, subplotRows);
-% subplotCols=ceil(nElems/subplotRows);
-% for iElem=1:nElems
-%    figure(figSubplotH);
-%    subplot(subplotRows, subplotCols, iElem);
-%    imshow(img);
-%    figure(figSubplotTightH);
-%    subplot_tight(subplotRows, subplotCols, iElem, [0.0001]);
-%    imshow(img);
-% end
-%
-%% See also
-%  - subplot
-%
-%% Revision history
-% First version: Nikolay S. 2011-03-29.
-% Last update:   Nikolay S. 2012-05-24.
-%
-% *List of Changes:*
-% 2012-05-24
-%  Non wrapping mode (based on axes command) added, to deal with an issue of disappearing
-%     subplots occuring with massive axes.
-
-%% Default params
-isWrapper=false;
-if (nargin<4) || isempty(margins)
-    margins=[0.04,0.04]; % default margins value- 4% of figure
-end
-if length(margins)==1
-    margins(2)=margins;
-end
-
-%note n and m are switched as Matlab indexing is column-wise, while subplot indexing is row-wise :(
-[subplot_col,subplot_row]=ind2sub([n,m],p);  
-
-
-height=(1-(m+1)*margins(1))/m; % single subplot height
-width=(1-(n+1)*margins(2))/n;  % single subplot width
-
-% note subplot suppors vector p inputs- so a merged subplot of higher dimentions will be created
-subplot_cols=1+max(subplot_col)-min(subplot_col); % number of column elements in merged subplot 
-subplot_rows=1+max(subplot_row)-min(subplot_row); % number of row elements in merged subplot   
-
-merged_height=subplot_rows*( height+margins(1) )- margins(1);   % merged subplot height
-merged_width= subplot_cols*( width +margins(2) )- margins(2);   % merged subplot width
-
-merged_bottom=(m-max(subplot_row))*(height+margins(1)) +margins(1); % merged subplot bottom position
-merged_left=min(subplot_col)*(width+margins(2))-width;              % merged subplot left position
-pos=[merged_left, merged_bottom, merged_width, merged_height];
-
-
-if isWrapper
-   h=subplot(m, n, p, varargin{:}, 'Units', 'Normalized', 'Position', pos);
-else
-   h=axes('Position', pos, varargin{:});
-end
-
-if nargout==1
-   vargout=h;
-end
+%
+% Copyright (c) 2016, Nikolay S.
+% All rights reserved.
+% 
+% Redistribution and use in source and binary forms, with or without
+% modification, are permitted provided that the following conditions are
+% met:
+% 
+%     * Redistributions of source code must retain the above copyright
+%       notice, this list of conditions and the following disclaimer.
+%     * Redistributions in binary form must reproduce the above copyright
+%       notice, this list of conditions and the following disclaimer in
+%       the documentation and/or other materials provided with the distribution
+% 
+% THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+% AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+% IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+% ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+% LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+% CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+% SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+% INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+% CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+% ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+% POSSIBILITY OF SUCH DAMAGE.
+%
+
+function vargout=subplot_tight(m, n, p, margins, varargin)
+%% subplot_tight
+% A subplot function substitute with a user-tunable margins parameter.
+%
+%% Syntax
+%  h=subplot_tight(m, n, p);
+%  h=subplot_tight(m, n, p, margins);
+%  h=subplot_tight(m, n, p, margins, subplotArgs...);
+%
+%% Description
+% Our goal is to grant the user the ability to define the margins between neighbouring
+%  subplots. Unfortunately, the Matlab subplot function lacks this functionality, and the
+%  margins between subplots can reach 40% of figure area, which is pretty lavish. While at
+%  the beginning the function was implemented as a wrapper function for the Matlab function
+%  subplot, it was modified due to axes deletion resulting from what Matlab subplot
+%  detected as overlapping. Therefore, the current implementation makes no use of the Matlab
+%  subplot function, using axes instead. This can be problematic, as axes and subplot
+%  parameters are quite different. Set isWrapper to true to return to wrapper mode, which
+%  fully supports subplot format.
+%
+%% Input arguments (defaults exist):
+%   margins- two elements vector [vertical,horizontal] defining the margins between
+%        neighbouring axes. Default value is 0.04
+%
+%% Output arguments
+%   same as subplot- none, or axes handle according to function call.
+%
+%% Issues & Comments
+%  - Note that if additional elements are used in order to be passed to subplot, margins
+%     parameter must be defined. For default margins value use empty element- [].
+%  - 
+%
+%% Example
+% close all;
+% img=imread('peppers.png');
+% figSubplotH=figure('Name', 'subplot');
+% figSubplotTightH=figure('Name', 'subplot_tight');
+% nElems=17;
+% subplotRows=ceil(sqrt(nElems)-1);
+% subplotRows=max(1, subplotRows);
+% subplotCols=ceil(nElems/subplotRows);
+% for iElem=1:nElems
+%    figure(figSubplotH);
+%    subplot(subplotRows, subplotCols, iElem);
+%    imshow(img);
+%    figure(figSubplotTightH);
+%    subplot_tight(subplotRows, subplotCols, iElem, [0.0001]);
+%    imshow(img);
+% end
+%
+%% See also
+%  - subplot
+%
+%% Revision history
+% First version: Nikolay S. 2011-03-29.
+% Last update:   Nikolay S. 2012-05-24.
+%
+% *List of Changes:*
+% 2012-05-24
+%  Non-wrapping mode (based on axes command) added, to deal with an issue of disappearing
+%     subplots occurring with massive axes.
+
+%% Default params
+isWrapper=false;
+if (nargin<4) || isempty(margins)
+    margins=[0.04,0.04]; % default margins value- 4% of figure
+end
+if length(margins)==1
+    margins(2)=margins;
+end
+
+%note n and m are switched as Matlab indexing is column-wise, while subplot indexing is row-wise :(
+[subplot_col,subplot_row]=ind2sub([n,m],p);  
+
+
+height=(1-(m+1)*margins(1))/m; % single subplot height
+width=(1-(n+1)*margins(2))/n;  % single subplot width
+
+% note subplot supports vector p inputs- so a merged subplot of higher dimensions will be created
+subplot_cols=1+max(subplot_col)-min(subplot_col); % number of column elements in merged subplot 
+subplot_rows=1+max(subplot_row)-min(subplot_row); % number of row elements in merged subplot   
+
+merged_height=subplot_rows*( height+margins(1) )- margins(1);   % merged subplot height
+merged_width= subplot_cols*( width +margins(2) )- margins(2);   % merged subplot width
+
+merged_bottom=(m-max(subplot_row))*(height+margins(1)) +margins(1); % merged subplot bottom position
+merged_left=min(subplot_col)*(width+margins(2))-width;              % merged subplot left position
+pos=[merged_left, merged_bottom, merged_width, merged_height];
+
+
+if isWrapper
+   h=subplot(m, n, p, varargin{:}, 'Units', 'Normalized', 'Position', pos);
+else
+   h=axes('Position', pos, varargin{:});
+end
+
+if nargout==1
+   vargout=h;
+end
diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c
index a0cec45b92..7ce7034453 100644
--- a/testsuite/src/test_gemmtrsm_ukr.c
+++ b/testsuite/src/test_gemmtrsm_ukr.c
@@ -5,7 +5,7 @@
    libraries.
 
    Copyright (C) 2014, The University of Texas at Austin
-   Copyright (C) 2018 - 2022, Advanced Micro Devices, Inc. All rights reserved.
+   Copyright (C) 2018 - 2023, Advanced Micro Devices, Inc. All rights reserved.
 
    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are
@@ -209,6 +209,17 @@ void libblis_test_gemmtrsm_ukr_experiment
 	// Query a context.
 	cntx = bli_gks_query_cntx();
 
+	// TRSM and GEMM may need different blocksizes. If the TRSM
+	// blocksizes were written into the global cntx object, a GEMM
+	// running in parallel with this TRSM test would read
+	// incorrect blocksizes.
+	// To avoid this, a local copy of cntx is created so that
+	// overriding the blocksizes does not affect the global cntx
+	// object.
+	// This is a temporary fix; a better fix is to create a
+	// separate blocksz_trsm array in cntx.
+	cntx_t cntx_trsm = *cntx;
+
 #if defined(BLIS_FAMILY_AMDZEN) ||  defined(BLIS_FAMILY_ZEN4) 
 	/* Zen4 TRSM Fixme:
 	 *
@@ -222,9 +233,11 @@ void libblis_test_gemmtrsm_ukr_experiment
 	 * 
 	 * We need to revisit this when TRSM AVX-512 kernels are implemented.
 	 */  
-	if (bli_arch_query_id() == BLIS_ARCH_ZEN4)
+		if ( (bli_arch_query_id() == BLIS_ARCH_ZEN4)  &&
+			 ((dc_str[0] == 's') || (dc_str[0] == 'd') ||
+			  (dc_str[0] == 'S') || (dc_str[0] == 'D')) )
 	{
-		bli_zen4_override_trsm_blkszs(cntx);
+		bli_zen4_override_trsm_blkszs(&cntx_trsm);
 	}
 #endif
 
@@ -235,13 +248,13 @@ void libblis_test_gemmtrsm_ukr_experiment
 	k = libblis_test_get_dim_from_prob_size( op->dim_spec[0], p_cur );
 
 
-	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, cntx );
-	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, cntx );
+	m = bli_cntx_get_blksz_def_dt( datatype, BLIS_MR, &cntx_trsm );
+	n = bli_cntx_get_blksz_def_dt( datatype, BLIS_NR, &cntx_trsm );
 
 	// Also query PACKMR and PACKNR as the leading dimensions to ap and bp,
 	// respectively.
-	ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, cntx );
-	ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, cntx );
+	ldap = bli_cntx_get_blksz_max_dt( datatype, BLIS_MR, &cntx_trsm );
+	ldbp = bli_cntx_get_blksz_max_dt( datatype, BLIS_NR, &cntx_trsm);
 
 
 	// Store the register blocksizes so that the driver can retrieve the
@@ -361,10 +374,10 @@ void libblis_test_gemmtrsm_ukr_experiment
 	void* buf_bp = bli_obj_buffer( &bp );
 	bli_packm_init_pack( BLIS_INVERT_DIAG, BLIS_PACKED_ROW_PANELS,
 	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_MR, BLIS_KR, &a, &ap, cntx );
+	                     BLIS_MR, BLIS_KR, &a, &ap, &cntx_trsm );
 	bli_packm_init_pack( BLIS_NO_INVERT_DIAG, BLIS_PACKED_COL_PANELS,
 	                     BLIS_PACK_FWD_IF_UPPER, BLIS_PACK_FWD_IF_LOWER,
-	                     BLIS_KR, BLIS_NR, &b, &bp, cntx );
+	                     BLIS_KR, BLIS_NR, &b, &bp, &cntx_trsm );
 	bli_obj_set_buffer( buf_ap, &ap );
 	bli_obj_set_buffer( buf_bp, &bp );
 
@@ -378,8 +391,8 @@ void libblis_test_gemmtrsm_ukr_experiment
 	bli_obj_set_uplo( uploa, &ap );
 
 	// Pack the data from the source objects.
-	bli_packm_blk_var1( &a, &ap, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
-	bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
+	bli_packm_blk_var1( &a, &ap, &cntx_trsm, NULL, &BLIS_PACKM_SINGLE_THREADED );
+	bli_packm_blk_var1( &b, &bp, &cntx_trsm, NULL, &BLIS_PACKM_SINGLE_THREADED );
 
 	// Create subpartitions from the a and b panels.
 	bli_gemmtrsm_ukr_make_subparts( k, &ap, &bp,
@@ -402,13 +415,13 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 
 		// Re-pack (restore) the contents of b to bp.
 		//bli_packm_blk_var1( &b, &bp, &cntx, cntl_b, &BLIS_PACKM_SINGLE_THREADED );
-		bli_packm_blk_var1( &b, &bp, cntx, NULL, &BLIS_PACKM_SINGLE_THREADED );
+		bli_packm_blk_var1( &b, &bp, &cntx_trsm, NULL, &BLIS_PACKM_SINGLE_THREADED );
 
 		time = bli_clock();
 
 		libblis_test_gemmtrsm_ukr_impl( iface, side, &alpha,
 		                                &a1xp, &a11p, &bx1p, &b11p, &c11,
-		                                cntx );
+		                                &cntx_trsm );
 
 		time_min = bli_clock_min_diff( time_min, time );
 	}
@@ -464,19 +477,6 @@ bli_printm( "ap", &ap, "%5.2f", "" );
 	bli_obj_free( &c11 );
 	bli_obj_free( &c11_save );
 
-#if defined(BLIS_FAMILY_AMDZEN) ||  defined(BLIS_FAMILY_ZEN4) 
-	/* Zen4 TRSM Fixme:
-	 *
-	 * We have overrding the block sizes at the start of this function
-	 * Since the context is created only once we need to ensure that the 
-	 * default block sizes are restored for the subsequent operations.
-	 */  
-	if (bli_arch_query_id() == BLIS_ARCH_ZEN4)
-	{
-		bli_zen4_restore_default_blkszs(cntx);
-	}
-#endif
-
 }
 
 
diff --git a/vendor/cpp/cblas.hh b/vendor/cpp/cblas.hh
index b656ed28e1..2caf93498e 100644
--- a/vendor/cpp/cblas.hh
+++ b/vendor/cpp/cblas.hh
@@ -291,6 +291,47 @@ cblas_axpy(
     cblas_zaxpy( n, &alpha, x, incx, y, incy );
 }
 
+//------------------------------------------------------------------------------
+inline void
+cblas_axpby(
+    int n, float alpha,
+    const float *x, int incx,
+    float       beta,
+    float       *y, int incy)
+{
+    cblas_saxpby(n, alpha, x, incx, beta, y, incy);
+}
+
+inline void
+cblas_axpby(
+    int n, double alpha,
+    const double *x, int incx,
+    double       beta,
+    double       *y, int incy)
+{
+    cblas_daxpby(n, alpha, x, incx, beta, y, incy);
+}
+
+inline void
+cblas_axpby(
+    int n, std::complex<float> alpha,
+    std::complex<float> const *x, int incx,
+    std::complex<float>       beta,
+    std::complex<float>*      y, int incy)
+{
+    cblas_caxpby(n, &alpha, x, incx, &beta, y, incy);
+}
+
+inline void
+cblas_axpby(
+    int n, std::complex<double> alpha,
+    std::complex<double> const *x, int incx,
+    std::complex<double>       beta,
+    std::complex<double>*      y, int incy)
+{
+    cblas_zaxpby(n, &alpha, x, incx, &beta, y, incy);
+}
+
 // -----------------------------------------------------------------------------
 inline float
 cblas_dot(
diff --git a/vendor/testcpp/test_nrm2.cc b/vendor/testcpp/test_nrm2.cc
index d29ec77788..24b96c94f2 100644
--- a/vendor/testcpp/test_nrm2.cc
+++ b/vendor/testcpp/test_nrm2.cc
@@ -1,100 +1,100 @@
-/*
-
-   BLISPP
-   C++ test driver for BLIS CPP nrm2 routine and reference blis nrm2 routine.
-
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <complex>
-#include <iostream>
-#include "blis.hh"
-#include "test.hh"
-
-using namespace blis;
-using namespace std;
-//#define PRINT
-#define N 2
-#define ALPHA 0.5
-
-#define TOLERANCE          0.0000001
-/*
- * Test application assumes matrices to be column major, non-transposed
- */
-template< typename T>
-void test_nrm2()
-{
-
-    T X[N];
-    T nrm2, nrm2_ref;
-    int n;
-    int incx;
-
-    n = N;
-    incx = 1;
-
-    if(is_same<T , float>::value)
-    {
-        X[0] =  0.14f;
-        X[1] =  -0.632f;
-        nrm2_ref = 0.647320631527f;
-    }
-    else if(is_same<T , double>::value)
-    {
-        X[0] =  0.696;
-        X[1] =  -0.804;
-        nrm2_ref = 1.06340584915;
-    }
-
-#ifdef PRINT
-    printvector(X, n,(char *) "Vector X after blis::nrm2");
-#endif
-    nrm2 = blis::nrm2<T>(
-            n,
-            X,
-            incx
-            );
-#ifdef PRINT
-    printf("Norm of a Vector %E  \n", nrm2);
-    printf("Ref Norm of a Vector %E  \n", nrm2_ref);
-#endif
-
-    if (fabs(nrm2 - nrm2_ref) > TOLERANCE) 
-        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-    else
-        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-}
-
-// -----------------------------------------------------------------------------
-int main( int argc, char** argv )
-{
-    test_nrm2<float>( );
-    test_nrm2<double>( );
-    return 0;
-
-}
+/*
+
+   BLISPP
+   C++ test driver for BLIS CPP nrm2 routine and reference blis nrm2 routine.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <complex>
+#include <iostream>
+#include "blis.hh"
+#include "test.hh"
+
+using namespace blis;
+using namespace std;
+//#define PRINT
+#define N 2
+#define ALPHA 0.5
+
+#define TOLERANCE          0.0000001
+/* test_nrm2<T>: calls blis::nrm2 on a hard-coded N-element vector (incx = 1)
+ * and prints TEST PASS/FAIL after comparing against a stored reference value.
+ * Only T = float and T = double assign X and nrm2_ref; other T leave them unset. */
+template< typename T>
+void test_nrm2()
+{
+    // X: input vector; nrm2: value returned by the call; nrm2_ref: expected value.
+    T X[N];
+    T nrm2, nrm2_ref;
+    int n;
+    int incx;
+
+    n = N;
+    incx = 1;
+    // Select the inputs and the matching reference for the element type under test.
+    if(is_same<T , float>::value)
+    {
+        X[0] =  0.14f;
+        X[1] =  -0.632f;
+        nrm2_ref = 0.647320631527f;
+    }
+    else if(is_same<T , double>::value)
+    {
+        X[0] =  0.696;
+        X[1] =  -0.804;
+        nrm2_ref = 1.06340584915;
+    }
+
+#ifdef PRINT
+    printvector(X, n,(char *) "Vector X after blis::nrm2");  // NOTE(review): label says "after" but this runs before the call
+#endif
+    nrm2 = blis::nrm2<T>(
+            n,
+            X,
+            incx
+            );
+#ifdef PRINT
+    printf("Norm of a Vector %E  \n", nrm2);
+    printf("Ref Norm of a Vector %E  \n", nrm2_ref);
+#endif
+    // Pass when the absolute difference is within TOLERANCE (same threshold for float and double).
+    if (fabs(nrm2 - nrm2_ref) > TOLERANCE) 
+        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+    else
+        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+}
+
+// -----------------------------------------------------------------------------
+int main( int argc, char** argv )  // argc and argv are not used
+{
+    test_nrm2<float>( );   // single-precision case
+    test_nrm2<double>( );  // double-precision case
+    return 0;  // always 0: the outcome is reported only through the printed PASS/FAIL lines
+
+}
diff --git a/vendor/testcpp/test_rot.cc b/vendor/testcpp/test_rot.cc
index a2e3fb7086..8849dccb11 100644
--- a/vendor/testcpp/test_rot.cc
+++ b/vendor/testcpp/test_rot.cc
@@ -1,102 +1,102 @@
-/*
-
-   BLISPP
-   C++ test driver for BLIS CPP rot routine and reference blis rot routine.
-
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <complex>
-#include <iostream>
-#include "blis.hh"
-#include "test.hh"
-
-using namespace blis;
-using namespace std;
-//#define PRINT
-#define N 1
-
-/*
- * Test application assumes matrices to be column major, non-transposed
- */
-template< typename T>
-void test_rot()
-{
-
-    T c, s;
-    T X[N], X_ref[N];
-    T Y[N], Y_ref[N];
-    int n;
-    int incx, incy;
-    
-    n = N;
-    incx = 1;
-    incy = 1;
-    if(is_same<T , float>::value){
-        c = -1.0f;
-        s = 0.0f;
-        X[0] = { -0.314f };
-        Y[0] = { -0.406f };
-        X_ref[0] = { 0.314f };
-        Y_ref[0] = { 0.406f }; 
-    }else{
-        c = -1;
-        s = 0;
-        X[0] = { -0.176  };
-        Y[0] = { -0.165  };
-        X_ref[0] = { 0.176 };
-        Y_ref[0] = { 0.165 };
-    }
-
-#ifdef PRINT
-    printvector(X, n, (char *)"Before blis::rot\nVector X");
-    printvector(Y, n, (char *)"Vector Y");
-#endif
-    blis::rot<T>( N, X, incx, Y, incy, c, s);
-#ifdef PRINT
-    printvector(X, n, (char *)"After blis::rot\nVector X");
-    printvector(Y, n, (char *) "Vector Y");
-    printvector(X, n, (char *) "Expected Output from blis::rot\nVector X");
-    printvector(Y, n, (char *)"Vector Y");
-#endif
-
-    if((computeErrorV(incx, incx , n, X, X_ref )==1) || (computeErrorV(incy, incy , n, Y, Y_ref )==1))
-         printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-    else
-         printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-
-}
-
-// -----------------------------------------------------------------------------
-int main( int argc, char** argv )
-{
-    test_rot<float>( );
-    test_rot<double>( );
-    return 0;
-
-}
+/*
+
+   BLISPP
+   C++ test driver for BLIS CPP rot routine and reference blis rot routine.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <complex>
+#include <iostream>
+#include "blis.hh"
+#include "test.hh"
+
+using namespace blis;
+using namespace std;
+//#define PRINT
+#define N 1
+
+/* test_rot<T>: applies blis::rot with c = -1, s = 0 to one-element vectors X and Y
+ * and prints TEST PASS/FAIL after comparing both against X_ref / Y_ref.
+ * T = float uses the first data set; every other T uses the else branch. */
+template< typename T>
+void test_rot()
+{
+    // c, s: rotation parameters handed to blis::rot; X_ref / Y_ref: expected results.
+    T c, s;
+    T X[N], X_ref[N];
+    T Y[N], Y_ref[N];
+    int n;
+    int incx, incy;
+    // N elements, unit stride for both vectors.
+    n = N;
+    incx = 1;
+    incy = 1;
+    if(is_same<T , float>::value){
+        c = -1.0f;
+        s = 0.0f;
+        X[0] = { -0.314f };
+        Y[0] = { -0.406f };
+        X_ref[0] = { 0.314f };
+        Y_ref[0] = { 0.406f }; 
+    }else{
+        c = -1;
+        s = 0;
+        X[0] = { -0.176  };
+        Y[0] = { -0.165  };
+        X_ref[0] = { 0.176 };
+        Y_ref[0] = { 0.165 };
+    }
+
+#ifdef PRINT
+    printvector(X, n, (char *)"Before blis::rot\nVector X");
+    printvector(Y, n, (char *)"Vector Y");
+#endif
+    blis::rot<T>( N, X, incx, Y, incy, c, s);
+#ifdef PRINT
+    printvector(X, n, (char *)"After blis::rot\nVector X");
+    printvector(Y, n, (char *) "Vector Y");
+    printvector(X_ref, n, (char *) "Expected Output from blis::rot\nVector X");  // reference data, not the computed X
+    printvector(Y_ref, n, (char *)"Vector Y");                                   // reference data, not the computed Y
+#endif
+    // A computeErrorV result of 1 for either vector is reported as a failure.
+    if((computeErrorV(incx, incx , n, X, X_ref )==1) || (computeErrorV(incy, incy , n, Y, Y_ref )==1))
+         printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+    else
+         printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+
+}
+
+// -----------------------------------------------------------------------------
+int main( int argc, char** argv )  // argc and argv are not used
+{
+    test_rot<float>( );   // single-precision case
+    test_rot<double>( );  // double-precision case
+    return 0;  // always 0: the outcome is reported only through the printed PASS/FAIL lines
+
+}
diff --git a/vendor/testcpp/test_rotg.cc b/vendor/testcpp/test_rotg.cc
index e11571ae3c..a99ef8c781 100644
--- a/vendor/testcpp/test_rotg.cc
+++ b/vendor/testcpp/test_rotg.cc
@@ -1,108 +1,108 @@
-/*
-
-   BLISPP
-   C++ test driver for BLIS CPP rotg routine and reference blis rotg routine.
-
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <complex>
-#include <iostream>
-#include "blis.hh"
-#include "test.hh"
-
-using namespace blis;
-using namespace std;
-//#define PRINT
-
-/*
- * Test application assumes matrices to be column major, non-transposed
- */
-template< typename T>
-void test_rotg()
-{
-
-    T a, b, c, s;
-    T a_ref, b_ref, c_ref, s_ref;
-
-   if(is_same<T , float>::value)
-   {
-      a = 1.0f;
-      b = 1.0f;
-      a_ref =  1.41421356237f;
-      b_ref =  1.41421356237f;
-      c_ref =  0.707106781187f;
-      s_ref =  0.707106781187f;
-   }else{
-      a = 1;
-      b = 0;
-      a_ref = 1;
-      b_ref = 0;
-      c_ref = 1;
-      s_ref = 0;
-   }
-
-#ifdef PRINT
-        cout<< "Before blis::rotg \na Value : " << a << "\n" ;
-        cout<< "b Value : " << b << "\n" ;
-#endif
-    blis::rotg<T>( 
-            &a,
-            &b,
-            &c,
-            &s
-            );
-
-#ifdef PRINT
-        cout<< "After blis::rotg \na Value : " << a << "\n" ;
-        cout<< "b Value : " << b << "\n" ;
-        cout<< "c Value : " << c << "\n" ;
-        cout<< "s Value : " << s << "\n" ;
-#endif
-
-#ifdef PRINT
-        cout<< "Expected Output\na Value : " << a_ref << "\n" ;
-        cout<< "b Value : " << b_ref << "\n" ;
-        cout<< "c Value : " << c_ref << "\n" ;
-        cout<< "s Value : " << s_ref << "\n" ;
-#endif
-     if( (a != a_ref ) || (b != b_ref ) || (c != c_ref ) || (s != s_ref ))
-        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-     else
-        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-
-}
-
-// -----------------------------------------------------------------------------
-int main( int argc, char** argv )
-{
-    test_rotg<float>( );
-    test_rotg<double>( );
-    return 0;
-
-}
+/*
+
+   BLISPP
+   C++ test driver for BLIS CPP rotg routine and reference blis rotg routine.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <complex>
+#include <iostream>
+#include "blis.hh"
+#include "test.hh"
+
+using namespace blis;
+using namespace std;
+//#define PRINT
+
+/* test_rotg<T>: calls blis::rotg on scalars a and b and prints TEST PASS/FAIL
+ * after comparing a, b, c and s with stored references using exact equality.
+ * T = float uses a = b = 1; every other T uses a = 1, b = 0. */
+template< typename T>
+void test_rotg()
+{
+    // a, b, c, s are all passed to blis::rotg by address and all four are compared afterwards.
+    T a, b, c, s;
+    T a_ref, b_ref, c_ref, s_ref;
+
+   if(is_same<T , float>::value)
+   {
+      a = 1.0f;
+      b = 1.0f;
+      a_ref =  1.41421356237f;
+      b_ref =  1.41421356237f;
+      c_ref =  0.707106781187f;
+      s_ref =  0.707106781187f;
+   }else{
+      a = 1;
+      b = 0;
+      a_ref = 1;
+      b_ref = 0;
+      c_ref = 1;
+      s_ref = 0;
+   }
+
+#ifdef PRINT
+        cout<< "Before blis::rotg \na Value : " << a << "\n" ;
+        cout<< "b Value : " << b << "\n" ;
+#endif
+    blis::rotg<T>( 
+            &a,
+            &b,
+            &c,
+            &s
+            );
+
+#ifdef PRINT
+        cout<< "After blis::rotg \na Value : " << a << "\n" ;
+        cout<< "b Value : " << b << "\n" ;
+        cout<< "c Value : " << c << "\n" ;
+        cout<< "s Value : " << s << "\n" ;
+#endif
+
+#ifdef PRINT
+        cout<< "Expected Output\na Value : " << a_ref << "\n" ;
+        cout<< "b Value : " << b_ref << "\n" ;
+        cout<< "c Value : " << c_ref << "\n" ;
+        cout<< "s Value : " << s_ref << "\n" ;
+#endif
+     if( (a != a_ref ) || (b != b_ref ) || (c != c_ref ) || (s != s_ref ))  // NOTE(review): exact != on floating-point values; confirm no tolerance is wanted
+        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+     else
+        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+
+}
+
+// -----------------------------------------------------------------------------
+int main( int argc, char** argv )  // argc and argv are not used
+{
+    test_rotg<float>( );   // single-precision case
+    test_rotg<double>( );  // double-precision case
+    return 0;  // always 0: the outcome is reported only through the printed PASS/FAIL lines
+
+}
diff --git a/vendor/testcpp/test_rotm.cc b/vendor/testcpp/test_rotm.cc
index aad4504b83..9ff793e500 100644
--- a/vendor/testcpp/test_rotm.cc
+++ b/vendor/testcpp/test_rotm.cc
@@ -1,106 +1,106 @@
-/*
-
-   BLISPP
-   C++ test driver for BLIS CPP rotm routine and reference blis rotm routine.
-
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <complex>
-#include <iostream>
-#include "blis.hh"
-#include "test.hh"
-
-using namespace blis;
-using namespace std;
-//#define PRINT
-#define N 1
-
-/*
- * Test application assumes matrices to be column major, non-transposed
- */
-template< typename T>
-void test_rotm()
-{
-
-    T X[N], X_ref[N];
-    T Y[N], Y_ref[N];
-    int n;
-    int incx, incy;
-    const  T P[5]  = { -1.0f, -4.44982e+03f, -15.5826f, 7.091334e+04f, 2.95912e+04f };
-    const  T P_double[5] = { 1.0, -1.244580625511e+03, 1.11154682624, 
-                            2.269384716089e-05, -0.0143785338883 };
-    n = N;
-    incx = 1;
-    incy = 1;
-    if(is_same<T , float>::value)
-    {
-      X[0] = { -0.034f };
-      Y[0] = { -0.56f };
-      X_ref[0] = { -3.956017e+04f };
-      Y_ref[0] = { -1.657054e+04f };
-    }else{
-       X[0] = { 0.84   };
-       Y[0] = { -0.711  };
-       X_ref[0] = { -1.046158725429e+03 };
-       Y_ref[0] = { -0.829776862405 };
-   }
-
-#ifdef PRINT
-    printvector(X, n, (char *)"Before blis::rot\nVector X");
-    printvector(Y, n, (char *)"Vector Y");
-#endif
-    if(is_same<T , float>::value)
-    {
-        blis::rotm<T>( N, X, incx, Y, incy, P);
-    }else{
-        blis::rotm<T>( N, X, incx, Y, incy, P_double);
-    }
-#ifdef PRINT
-    printvector(X, n, (char *)"After blis::rot\nVector X");
-    printvector(Y, n, (char *)"Vector Y");
-    printvector(X, n, (char *)"Expected Output from blis::rot\nVector X");
-    printvector(Y, n, (char *)"Vector Y");
-#endif
-
-    if((computeErrorV(incx, incx , n, X, X_ref )==1) 
-       || (computeErrorV(incy, incy , n, Y, Y_ref )==1))
-        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-    else
-        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-
-}
-
-// -----------------------------------------------------------------------------
-int main( int argc, char** argv )
-{
-    test_rotm<float>( );
-    test_rotm<double>( );
-    return 0;
-
-}
+/*
+
+   BLISPP
+   C++ test driver for BLIS CPP rotm routine and reference blis rotm routine.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <complex>
+#include <iostream>
+#include "blis.hh"
+#include "test.hh"
+
+using namespace blis;
+using namespace std;
+//#define PRINT
+#define N 1
+
+/* test_rotm<T>: applies blis::rotm to one-element vectors X and Y and prints
+ * TEST PASS/FAIL after comparing both against X_ref / Y_ref. T = float uses
+ * the parameter array P; every other T uses P_double. */
+template< typename T>
+void test_rotm()
+{
+    // Both parameter arrays have element type T; only one is used per instantiation.
+    T X[N], X_ref[N];
+    T Y[N], Y_ref[N];
+    int n;
+    int incx, incy;
+    const  T P[5]  = { -1.0f, -4.44982e+03f, -15.5826f, 7.091334e+04f, 2.95912e+04f };
+    const  T P_double[5] = { 1.0, -1.244580625511e+03, 1.11154682624, 
+                            2.269384716089e-05, -0.0143785338883 };
+    n = N;
+    incx = 1;
+    incy = 1;
+    if(is_same<T , float>::value)
+    {
+      X[0] = { -0.034f };
+      Y[0] = { -0.56f };
+      X_ref[0] = { -3.956017e+04f };
+      Y_ref[0] = { -1.657054e+04f };
+    }else{
+       X[0] = { 0.84   };
+       Y[0] = { -0.711  };
+       X_ref[0] = { -1.046158725429e+03 };
+       Y_ref[0] = { -0.829776862405 };
+   }
+
+#ifdef PRINT
+    printvector(X, n, (char *)"Before blis::rot\nVector X");
+    printvector(Y, n, (char *)"Vector Y");
+#endif
+    if(is_same<T , float>::value)
+    {
+        blis::rotm<T>( N, X, incx, Y, incy, P);
+    }else{
+        blis::rotm<T>( N, X, incx, Y, incy, P_double);
+    }
+#ifdef PRINT
+    printvector(X, n, (char *)"After blis::rot\nVector X");
+    printvector(Y, n, (char *)"Vector Y");
+    printvector(X_ref, n, (char *)"Expected Output from blis::rot\nVector X");  // reference data, not the computed X
+    printvector(Y_ref, n, (char *)"Vector Y");                                  // reference data, not the computed Y
+#endif
+    // A computeErrorV result of 1 for either vector is reported as a failure.
+    if((computeErrorV(incx, incx , n, X, X_ref )==1) 
+       || (computeErrorV(incy, incy , n, Y, Y_ref )==1))
+        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+    else
+        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+
+}
+
+// -----------------------------------------------------------------------------
+int main( int argc, char** argv )  // argc and argv are not used
+{
+    test_rotm<float>( );   // single-precision case
+    test_rotm<double>( );  // double-precision case
+    return 0;  // always 0: the outcome is reported only through the printed PASS/FAIL lines
+
+}
diff --git a/vendor/testcpp/test_rotmg.cc b/vendor/testcpp/test_rotmg.cc
index b2325bb241..a81119b7dc 100644
--- a/vendor/testcpp/test_rotmg.cc
+++ b/vendor/testcpp/test_rotmg.cc
@@ -1,137 +1,137 @@
-/*
-
-   BLISPP
-   C++ test driver for BLIS CPP rotmg routine and reference blis rotmg routine.
-
-   Copyright (C) 2019, Advanced Micro Devices, Inc.
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-    - Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    - Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    - Neither the name(s) of the copyright holder(s) nor the names of its
-      contributors may be used to endorse or promote products derived
-      from this software without specific prior written permission.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include <complex>
-#include <iostream>
-#include "blis.hh"
-#include "test.hh"
-
-using namespace blis;
-using namespace std;
-//#define PRINT
-
-/*
- * Test application assumes matrices to be column major, non-transposed
- */
-template< typename T>
-void test_rotmg()
-{
-    T d1, d2, b1, b2;
-    T d1_ref, d2_ref, b1_ref;
-    T h[5] = { -999.0f, -999.1f, -999.2f, -999.3f, -999.4f };
-    T h_ref[5] = {-1.0f, 0.0f, 0.0f, 0.0f,0.0f};
-    T h_double[5] = { -999.0, -999.1, -999.2, -999.3, -999.4 };
-    T h_ref_double[5] = { 1, 0, 0, 0};
-
-   if(is_same<T , float>::value)
-   {
-        d1 = -1630.28519312f;
-        d2 = 44320.1964703f;
-        b1 = 1274.7681352f;
-        b2 = 0.983006912864f;
-        d1_ref= 0.0f;
-        d2_ref= 0.0f;
-        b1_ref= 0.0f;
-   }else{
-        d1 = -49.1978123005;
-        d2 = 0.228703451277;
-        b1 = 1.8901039144;
-        b2 = 7081.47754386;
-        d1_ref= 0;
-        d2_ref= 0;
-        b1_ref= 0;
-   }
-
-#ifdef PRINT
-    cout<< "Before blis::rotmg \nd1 Value : " << d1 << "\n" ;
-    cout<< "d2 Value : " << d2 << "\n" ;
-    cout<< "b1 Value : " << b1 << "\n" ;
-    printvector(h, 5,(char *) "param");
-#endif
-    if(is_same<T , float>::value)
-    {
-        blis::rotmg<T>( 
-                &d1,
-                &d2,
-                &b1,
-                b2,
-                h
-                );
-    }else{
-        blis::rotmg<T>( 
-         &d1,
-         &d2,
-         &b1,
-         b2,
-         h_double
-         );
-    }
-
-#ifdef PRINT
-    cout<< "After blis::rotmg \nd1 Value : " << d1 << "\n" ;
-    cout<< "d2 Value : " << d2 << "\n" ;
-    cout<< "b1 Value : " << b1 << "\n" ;
-    printvector(h, 5,(char *) "param");
-#endif
-
-#ifdef PRINT
-    cout<< "Expected Output from blis::rotmg \nd1 Value : " << d1_ref << "\n" ;
-    cout<< "d2 Value : " << d2_ref << "\n" ;
-    cout<< "b1 Value : " << b1_ref << "\n" ;
-    printvector(h_ref, 5,(char *) "param");
-#endif
-    if( (d1 != d1_ref ) || (d2 != d2_ref ) || (b1 != b1_ref ) )
-        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-    else if(is_same<T , float>::value){
-        if(computeErrorV(1, 1 , 5, h, h_ref )==1) 
-             printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-        else
-             printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-    }else if(is_same<T , float>::value){
-        if(computeErrorV(1, 1 , 5, h_double, h_ref_double )==1)
-            printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
-        else
-            printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-    }else
-        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
-
-}
-
-// -----------------------------------------------------------------------------
-int main( int argc, char** argv )
-{
-    test_rotmg<float>( );
-    test_rotmg<double>( );
-    return 0;
-
-}
+/*
+
+   BLISPP
+   C++ test driver for BLIS CPP rotmg routine and reference blis rotmg routine.
+
+   Copyright (C) 2019, Advanced Micro Devices, Inc.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    - Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    - Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    - Neither the name(s) of the copyright holder(s) nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include <complex>
+#include <iostream>
+#include "blis.hh"
+#include "test.hh"
+
+using namespace blis;
+using namespace std;
+//#define PRINT
+
+/*
+ * Runs blis::rotmg<T> once and compares d1, d2, b1 and the param vector with expected values
+ */
+template< typename T>
+void test_rotmg()
+{
+    T d1, d2, b1, b2;
+    T d1_ref, d2_ref, b1_ref;
+    T h[5] = { -999.0f, -999.1f, -999.2f, -999.3f, -999.4f };
+    T h_ref[5] = {-1.0f, 0.0f, 0.0f, 0.0f,0.0f};
+    T h_double[5] = { -999.0, -999.1, -999.2, -999.3, -999.4 };
+    T h_ref_double[5] = { -1.0, 0.0, 0.0, 0.0, 0.0 };  // d1 < 0 below, so flag -1 as in h_ref
+    // h / h_ref are used when T is float, h_double / h_ref_double otherwise.
+    if(is_same<T , float>::value)
+    {
+        d1 = -1630.28519312f;
+        d2 = 44320.1964703f;
+        b1 = 1274.7681352f;
+        b2 = 0.983006912864f;
+        d1_ref= 0.0f;
+        d2_ref= 0.0f;
+        b1_ref= 0.0f;
+    }else{
+        d1 = -49.1978123005;
+        d2 = 0.228703451277;
+        b1 = 1.8901039144;
+        b2 = 7081.47754386;
+        d1_ref= 0;
+        d2_ref= 0;
+        b1_ref= 0;
+    }
+
+#ifdef PRINT
+    cout<< "Before blis::rotmg \nd1 Value : " << d1 << "\n" ;
+    cout<< "d2 Value : " << d2 << "\n" ;
+    cout<< "b1 Value : " << b1 << "\n" ;
+    printvector(is_same<T , float>::value ? h : h_double, 5,(char *) "param");
+#endif
+    if(is_same<T , float>::value)
+    {
+        blis::rotmg<T>(
+                &d1,
+                &d2,
+                &b1,
+                b2,
+                h
+                );
+    }else{
+        blis::rotmg<T>(
+                &d1,
+                &d2,
+                &b1,
+                b2,
+                h_double
+                );
+    }
+
+#ifdef PRINT
+    cout<< "After blis::rotmg \nd1 Value : " << d1 << "\n" ;
+    cout<< "d2 Value : " << d2 << "\n" ;
+    cout<< "b1 Value : " << b1 << "\n" ;
+    printvector(is_same<T , float>::value ? h : h_double, 5,(char *) "param");
+#endif
+
+#ifdef PRINT
+    cout<< "Expected Output from blis::rotmg \nd1 Value : " << d1_ref << "\n" ;
+    cout<< "d2 Value : " << d2_ref << "\n" ;
+    cout<< "b1 Value : " << b1_ref << "\n" ;
+    printvector(is_same<T , float>::value ? h_ref : h_ref_double, 5,(char *) "param");
+#endif
+    if( (d1 != d1_ref ) || (d2 != d2_ref ) || (b1 != b1_ref ) )
+        printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+    else if(is_same<T , float>::value){
+        if(computeErrorV(1, 1 , 5, h, h_ref )==1)
+             printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+        else
+             printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+    }else if(is_same<T , double>::value){
+        if(computeErrorV(1, 1 , 5, h_double, h_ref_double )==1)
+            printf("%s TEST FAIL\n" , __PRETTY_FUNCTION__);
+        else
+            printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+    }else
+        printf("%s TEST PASS\n" , __PRETTY_FUNCTION__);
+
+}
+
+// -----------------------------------------------------------------------------
+int main( int argc, char** argv )   // argc and argv are not used
+{
+    test_rotmg<float>( );    // single-precision run
+    test_rotmg<double>( );   // double-precision run
+    return 0;                // always 0: the outcome is only printed by test_rotmg
+
+}
diff --git a/version b/version
index fcdb2e109f..ee74734aa2 100644
--- a/version
+++ b/version
@@ -1 +1 @@
-4.0.0
+4.1.0
diff --git a/windows/tests/README.txt b/windows/tests/README.txt
index 20fe1ccfc5..3522c68e00 100644
--- a/windows/tests/README.txt
+++ b/windows/tests/README.txt
@@ -1,22 +1,22 @@
-#BLIS check execution script
-
-Check execution script covers:
-        * checkblis
-        * checkblis-fast
-        * checkblis-md
-        * checkblis-salt
-
-##Requirements
-* Install latest version of python from python.org(preferably python 3.5 or greater)
-* Add python path and scripts path to the environment variable path
-
-#Copy all the files present in <src>/windows/tests directory to the directory where TestSuite.exe is present
-#Open the command prompt and execute the python script and provide an argument(check name)
-For example:
-python blis_check.py checkblis
-
-#Output can be seen on the command prompt
-
-Note:
-   New check execution can be added into the inputs.yaml in the below format
-   Ex: new_check: [input.general.filename,input.operations.filename]
+#BLIS check execution script
+
+Check execution script covers:
+        * checkblis
+        * checkblis-fast
+        * checkblis-md
+        * checkblis-salt
+
+##Requirements
+* Install the latest version of Python from python.org (preferably Python 3.5 or greater)
+* Add the Python and Scripts directories to the PATH environment variable
+
+#Copy all the files present in the <src>/windows/tests directory to the directory where TestSuite.exe is present
+#Open a command prompt and run the Python script with the check name as its argument
+For example:
+python blis_make.py checkblis
+
+#Output can be seen on the command prompt
+
+Note:
+   A new check can be added to inputs.yaml in the following format:
+   Ex: new_check: [input.general.filename,input.operations.filename]
diff --git a/windows/tests/blis_make.py b/windows/tests/blis_make.py
index c9a3a36a58..d72df904b4 100644
--- a/windows/tests/blis_make.py
+++ b/windows/tests/blis_make.py
@@ -1,107 +1,107 @@
-"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved"""
-import re
-import subprocess
-import yaml
-import sys
-import os
-
-
-class BlisCheck:
-
-    @staticmethod
-    def check_execution():
-
-        try:
-            with open(r'inputs.yaml') as file:
-                input_file = yaml.safe_load(file)
-                try:
-                    if (sys.argv[1] == '') or (sys.argv[1] == "--h") or (sys.argv[1] == "--help"):
-                        print("Below options are available \n")
-                        print("usage: python blis_make.py ", end='[')
-                        for var in input_file.keys():
-                            print(var, end=' | ')
-                        print('checkcpp | --h | --help]')
-                        sys.exit()
-                except IndexError:
-                    print("Below options are available \n")
-                    print("usage: python blis_make.py ", end='[')
-                    for var in input_file.keys():
-                        print(var, end=' | ')
-                    print('checkcpp | --h | --help]')
-                    sys.exit()
-                if sys.argv[1] == "check":
-                    which_check = 'checkblis-fast'
-                    command = "test_libblis.exe -g " + input_file['checkblis-fast'][0] + " -o "+input_file['checkblis-fast'][1]
-                    BlisCheck.test_checkblis(which_check, command)
-                    flag = 0
-                    which_check = 'check'
-                    for i in range(len(input_file[which_check])):
-                        #print(input_file[which_check][i])
-                        if '1' in input_file[which_check][i]:
-                            command = input_file[which_check][i]+' > out.'+input_file[which_check][i][:6]
-                            print("Running ", input_file[which_check][i], " (output to 'out."+input_file[which_check][i][:6]+"')")
-                        else:
-                            command = input_file[which_check][i]
-                            print("Running ", input_file[which_check][i], " (output to 'out."+input_file[which_check][i][:6]+"')")
-                        subprocess.check_call(command , shell=True)
-                        with open(r"out."+input_file[which_check][i][:6]) as out_file:
-                            strings = re.findall(r'FAIL', out_file.read())
-                            if strings:
-                                flag += 1
-
-                    if flag:
-                        print("At lease one BLAS test failed.")
-                        print("Please see out.* files for details")
-                    else:
-                        print("All BLAS tests passed")
-
-                    sys.exit()
-
-                elif sys.argv[1] == 'checkcpp':
-                    files = [f for f in os.listdir('.') if re.search('_blis.exe', f)]
-                    #print(files)
-                    for executable in files:
-                        subprocess.check_call(executable, shell=True)
-
-                    sys.exit()
-                else:
-                    general_file = input_file[sys.argv[1]][0]
-                    operations_file = input_file[sys.argv[1]][1]
-                    command = "test_libblis.exe -g " + general_file + " -o " + operations_file
-                    BlisCheck.test_checkblis(sys.argv[1], command)
-
-        except Exception as error:
-            print(error)
-
-    @staticmethod
-    def test_checkblis(which_check, command):
-        flag = 0
-        with open("output.testsuite.txt", 'w') as f:
-            if 'md' in which_check:
-                print("Running test_libblis.exe {} with output redirected to 'output.testsuite'".format(
-                    "(mixed dt)"))
-            elif len(which_check) > 9:
-                print("Running test_libblis.exe {} with output redirected to 'output.testsuite'".format(
-                    "(" + which_check[10:] + ")"))
-            else:
-                print("Running test_libblis.exe with output redirected to 'output.testsuite'")
-            process = subprocess.Popen(command, bufsize=1, universal_newlines=True, stdout=subprocess.PIPE,
-                                       stderr=subprocess.STDOUT)
-            for line in iter(process.stdout.readline, ''):
-                if "FAIL" in line:
-                    flag = + 1
-                f.write(line)
-                sys.stdout.flush()
-            process.wait()
-            errcode = process.returncode
-            f.close()
-            if flag:
-                print("At least one BLIS test failed. :( \n Please see output.testsuite for details.")
-            else:
-                print("All BLIS tests passed!")
-
-
-if __name__ == "__main__":
-    #which_check = sys.argv[1]
-    calling_Object = BlisCheck()
-    calling_Object.check_execution()
+"""Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved"""
+import re
+import subprocess
+import yaml
+import sys
+import os
+
+
+class BlisCheck:
+    """Command-line driver for the Windows BLIS/BLAS check executables."""
+
+    @staticmethod
+    def check_execution():
+        """Run the check selected by sys.argv[1].
+
+        'check' runs checkblis-fast and then every command listed under the
+        'check' key of inputs.yaml; 'checkcpp' runs each file in the current
+        directory whose name matches '_blis.exe'; any other name is looked up
+        in inputs.yaml as a [general, operations] pair for test_libblis.exe.
+        Exceptions are caught and printed instead of propagating.
+        """
+        try:
+            with open(r'inputs.yaml') as file:
+                input_file = yaml.safe_load(file)
+                # A missing, empty or help argument prints the usage and exits.
+                if len(sys.argv) < 2 or sys.argv[1] in ('', "--h", "--help"):
+                    print("Below options are available \n")
+                    print("usage: python blis_make.py ", end='[')
+                    for var in input_file.keys():
+                        print(var, end=' | ')
+                    print('checkcpp | --h | --help]')
+                    sys.exit()
+                if sys.argv[1] == "check":
+                    which_check = 'checkblis-fast'
+                    command = "test_libblis.exe -g " + input_file['checkblis-fast'][0] + " -o "+input_file['checkblis-fast'][1]
+                    BlisCheck.test_checkblis(which_check, command)
+                    flag = 0
+                    which_check = 'check'
+                    for entry in input_file[which_check]:
+                        out_name = "out." + entry[:6]
+                        # Entries containing '1' get stdout redirected; the others run as listed.
+                        command = (entry + ' > ' + out_name) if '1' in entry else entry
+                        print("Running ", entry, " (output to '" + out_name + "')")
+                        subprocess.check_call(command, shell=True)
+                        with open(out_name) as out_file:
+                            if 'FAIL' in out_file.read():
+                                flag += 1
+
+                    if flag:
+                        print("At least one BLAS test failed.")
+                        print("Please see out.* files for details")
+                    else:
+                        print("All BLAS tests passed")
+
+                    sys.exit()
+
+                elif sys.argv[1] == 'checkcpp':
+                    files = [f for f in os.listdir('.') if re.search('_blis.exe', f)]
+                    for executable in files:
+                        subprocess.check_call(executable, shell=True)
+
+                    sys.exit()
+                else:
+                    general_file = input_file[sys.argv[1]][0]
+                    operations_file = input_file[sys.argv[1]][1]
+                    command = "test_libblis.exe -g " + general_file + " -o " + operations_file
+                    BlisCheck.test_checkblis(sys.argv[1], command)
+
+        except Exception as error:
+            print(error)
+
+    @staticmethod
+    def test_checkblis(which_check, command):
+        """Run `command`, copy its output to output.testsuite.txt and report.
+
+        which_check: check name, used only to build the progress message.
+        command: command line handed to subprocess.Popen.
+        Any output line containing "FAIL" marks the run as failed.
+        """
+        flag = 0
+        with open("output.testsuite.txt", 'w') as f:
+            if 'md' in which_check:
+                print("Running test_libblis.exe {} with output redirected to 'output.testsuite'".format(
+                    "(mixed dt)"))
+            elif len(which_check) > 9:
+                print("Running test_libblis.exe {} with output redirected to 'output.testsuite'".format(
+                    "(" + which_check[10:] + ")"))
+            else:
+                print("Running test_libblis.exe with output redirected to 'output.testsuite'")
+            process = subprocess.Popen(command, bufsize=1, universal_newlines=True, stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT)
+            for line in iter(process.stdout.readline, ''):
+                if "FAIL" in line:
+                    flag += 1
+                f.write(line)
+                sys.stdout.flush()
+            process.wait()
+            if flag:
+                print("At least one BLIS test failed. :( \n Please see output.testsuite for details.")
+            else:
+                print("All BLIS tests passed!")
+
+
+if __name__ == "__main__":
+    # check_execution is a static method, so no BlisCheck instance is needed;
+    # it reads the check name from sys.argv[1] itself.
+    BlisCheck.check_execution()
diff --git a/windows/tests/inputs.yaml b/windows/tests/inputs.yaml
index 132f68df45..3173c90ae8 100644
--- a/windows/tests/inputs.yaml
+++ b/windows/tests/inputs.yaml
@@ -1,11 +1,11 @@
-# Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved
-
-checkblis: [input.general, input.operations]
-
-checkblis-fast: [input.general.fast, input.operations.fast]
-
-checkblis-md: [input.general.mixed, input.operations.mixed]
-
-checkblis-salt: [input.general.salt, input.operations.salt]
-
-check: ['cblat1.exe', 'dblat1.exe', 'sblat1.exe', 'zblat1.exe', 'cblat2.exe < ../../blastest/input/cblat2.in', 'dblat2.exe < ../../blastest/input/dblat2.in', 'sblat2.exe < ../../blastest/input/sblat2.in', 'zblat2.exe < ../../blastest/input/zblat2.in', 'cblat3.exe < ../../blastest/input/cblat3.in', 'dblat3.exe < ../../blastest/input/dblat3.in', 'sblat3.exe < ../../blastest/input/sblat3.in', 'zblat3.exe < ../../blastest/input/zblat3.in']
\ No newline at end of file
+# Copyright (C) 2020, Advanced Micro Devices, Inc. All Rights Reserved
+
+checkblis: [input.general, input.operations]
+
+checkblis-fast: [input.general.fast, input.operations.fast]
+
+checkblis-md: [input.general.mixed, input.operations.mixed]
+
+checkblis-salt: [input.general.salt, input.operations.salt]
+
+check: ['cblat1.exe', 'dblat1.exe', 'sblat1.exe', 'zblat1.exe', 'cblat2.exe < ../../blastest/input/cblat2.in', 'dblat2.exe < ../../blastest/input/dblat2.in', 'sblat2.exe < ../../blastest/input/sblat2.in', 'zblat2.exe < ../../blastest/input/zblat2.in', 'cblat3.exe < ../../blastest/input/cblat3.in', 'dblat3.exe < ../../blastest/input/dblat3.in', 'sblat3.exe < ../../blastest/input/sblat3.in', 'zblat3.exe < ../../blastest/input/zblat3.in']